diff --git a/crates/aura-cli/src/bridges.rs b/crates/aura-cli/src/bridges.rs index 3bb9e19..d9fc254 100644 --- a/crates/aura-cli/src/bridges.rs +++ b/crates/aura-cli/src/bridges.rs @@ -333,6 +333,10 @@ impl BridgeManifest { pub struct BridgesDiscoveryWatcher { /// The current effective merged list (static + manifest, de-duplicated by `SocketAddr`). snapshot: Arc>>, + /// v3.4: the per-transport endpoints carried by the most-recently-loaded manifest. Empty + /// when the manifest has no `endpoints` field (v3.3-format manifest, or v3.4 manifest where + /// the operator opted not to publish per-transport ports). + endpoints_snapshot: Arc>>, /// The static list from `[client] bridges` (used as a fallback when the manifest is missing). static_bridges: Vec, /// File path of the signed manifest. @@ -355,8 +359,10 @@ impl BridgesDiscoveryWatcher { static_bridges: Vec, ) -> Self { let snapshot = Arc::new(RwLock::new(static_bridges.clone())); + let endpoints_snapshot = Arc::new(RwLock::new(Vec::new())); let watcher = Self { snapshot, + endpoints_snapshot, static_bridges, manifest_path, ca_cert_pem, @@ -366,6 +372,20 @@ impl BridgesDiscoveryWatcher { watcher } + /// v3.4: clone of the per-transport endpoint snapshot. Empty when the manifest has no + /// `endpoints` field. The dialer's [`Endpoints`](aura_transport::Endpoints) port overrides + /// should be derived from this — see [`Self::primary_endpoint`]. + pub async fn endpoints_snapshot(&self) -> Vec { + self.endpoints_snapshot.read().await.clone() + } + + /// v3.4: first endpoint from the snapshot, when present. Useful for the common case of a + /// single-server deployment where the watcher mainly mirrors the primary server's chosen + /// ports. + pub async fn primary_endpoint(&self) -> Option { + self.endpoints_snapshot.read().await.first().cloned() + } + /// Snapshot handle: clones of this `Arc>` can be read concurrently by the dial loop. 
pub fn handle(&self) -> Arc>> { Arc::clone(&self.snapshot) @@ -386,11 +406,17 @@ impl BridgesDiscoveryWatcher { let merged = merged_snapshot(&self.static_bridges, &manifest.parsed_bridges()); let merged_len = merged.len(); *self.snapshot.write().await = merged; + // v3.4: copy the per-transport endpoints over too. They drive dial-time port + // overrides on the client (see [`crate::client::run`]). Old v3.3 manifests have + // an empty `endpoints` field and the snapshot just clears. + let endpoints_len = manifest.endpoints.len(); + *self.endpoints_snapshot.write().await = manifest.endpoints.clone(); tracing::info!( path = %self.manifest_path.display(), generated_at = manifest.generated_at, expires_at = manifest.expires_at, manifest_bridges = manifest.bridges.len(), + manifest_endpoints = endpoints_len, merged_total = merged_len, "loaded signed bridges manifest" ); diff --git a/crates/aura-cli/src/client.rs b/crates/aura-cli/src/client.rs index 27cde03..ac97a26 100644 --- a/crates/aura-cli/src/client.rs +++ b/crates/aura-cli/src/client.rs @@ -131,6 +131,36 @@ pub async fn run(config_path: &Path, admin_socket: &str) -> anyhow::Result<()> { // returned JoinHandle. Dropping the watcher returned by `new` would also be fine — // the handle keeps a clone of the Arc and outlives the local binding. let _bg = watcher.spawn_refresh(); + // v3.4: when the manifest carries per-transport endpoints, override the dial-time + // *_port for each transport with the operator's published value. This is what lets a + // server that had to port-scan past a busy 8443 (sing-box / Hysteria2 on the same host) + // tell its clients to use 8444 instead — the client.toml's static [transport] ports + // become only the bootstrap fallback. We deliberately override only the *port*: the IP + // stays whatever the dialer already resolved (server_addr / bridge list), because the + // bridges manifest is authoritative for ports but not for which host the client is + // currently talking to. 
+ if let Some(ep) = watcher.primary_endpoint().await { + let mut applied = Vec::new(); + if let (Some(port), Some(addr)) = (ep.tcp, dial_cfg.endpoints.tcp) { + dial_cfg.endpoints.tcp = Some(std::net::SocketAddr::new(addr.ip(), port)); + applied.push(format!("tcp={}", port)); + } + if let (Some(port), Some(addr)) = (ep.quic, dial_cfg.endpoints.quic) { + dial_cfg.endpoints.quic = Some(std::net::SocketAddr::new(addr.ip(), port)); + applied.push(format!("quic={}", port)); + } + if let (Some(port), Some(addr)) = (ep.udp, dial_cfg.endpoints.udp) { + dial_cfg.endpoints.udp = Some(std::net::SocketAddr::new(addr.ip(), port)); + applied.push(format!("udp={}", port)); + } + if !applied.is_empty() { + tracing::info!( + endpoint_host = %ep.host, + overrides = %applied.join(","), + "v3.4 manifest endpoints override dial-time transport ports" + ); + } + } tracing::info!( path = %manifest_path.display(), refresh_interval_secs = refresh_secs, diff --git a/crates/aura-tunnel/src/router.rs b/crates/aura-tunnel/src/router.rs index 2c9e965..ac08f3e 100644 --- a/crates/aura-tunnel/src/router.rs +++ b/crates/aura-tunnel/src/router.rs @@ -144,7 +144,22 @@ impl AuraRouter

{
         let inbound_conn = Arc::clone(&self.conn);
         let inbound = tokio::spawn(async move {
             loop {
-                let pkt = inbound_conn.recv_packet().await?;
+                let pkt = match inbound_conn.recv_packet().await {
+                    Ok(p) => p,
+                    Err(e) => {
+                        // v3.4 fix for #45 (silent client exit): the inbound task used to swallow
+                        // this error and ride out via `?`, so when the underlying transport broke
+                        // (e.g. a co-resident VPN's UDP socket got remapped) the outbound select!
+                        // saw a clean `None` and returned `Ok(())`. No log, no exit message, no
+                        // reconnect hint. Now we log loudly with the real cause before propagating.
+                        let err_str = e.to_string();
+                        tracing::error!(
+                            error = %err_str,
+                            "peer connection broke (recv_packet failed); client is exiting"
+                        );
+                        return Err(anyhow::anyhow!("recv_packet from peer failed: {err_str}"));
+                    }
+                };
                 if to_tun_tx.send(pkt).await.is_err() {
                     // TUN owner loop has stopped; nothing more to do.
                     break;
@@ -177,14 +192,36 @@ impl AuraRouter

{
                                 c.inc_rx();
                             }
                         }
-                        // Inbound task ended (connection closed/errored).
-                        None => break Ok(()),
+                        // Inbound task ended. Either gracefully (we drove `to_tun_tx` drop via the
+                        // outbound side exiting first — unreachable here since we'd still be inside
+                        // the select), or because the peer connection broke. v3.4: surface as an
+                        // error so `aura client` exits non-zero and a supervisor (systemd, launchd,
+                        // a future auto-redial loop) knows the tunnel died. The inbound task itself
+                        // already logged the underlying cause at error level.
+                        None => break Err(anyhow::anyhow!(
+                            "peer connection closed; router shutting down (see preceding error log for cause)"
+                        )),
                     }
                 }
             }
         };
 
-        inbound.abort();
+        // Give the inbound task a short grace period to finish so its error can be surfaced
+        // instead of being dropped by an immediate abort(). The handle is polled by `&mut`
+        // reference: passing it to `timeout` by value would drop it on expiry, and dropping a
+        // `JoinHandle` only detaches the task. It would then keep running (and keep its
+        // `Arc` clone of the connection alive) for as long as `recv_packet` stays pending.
+        let mut inbound = inbound;
+        match tokio::time::timeout(std::time::Duration::from_millis(200), &mut inbound).await {
+            Ok(Ok(Ok(()))) => {}
+            Ok(Ok(Err(e))) => tracing::warn!(error = %e, "inbound task exited with error"),
+            Ok(Err(join_err)) => tracing::warn!(error = %join_err, "inbound task panicked"),
+            Err(_) => {
+                // Still running after the grace period: cancel it explicitly.
+                inbound.abort();
+                tracing::warn!("inbound task did not exit within 200ms; aborting it");
+            }
+        }
         result
     }