From 7c2080321b7e0c010734c41550a885064226751d Mon Sep 17 00:00:00 2001 From: xah30 Date: Fri, 29 May 2026 17:22:10 +0300 Subject: [PATCH] feat(cli,tunnel): v3.4 client consumes manifest endpoints + fix #45 silent client exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-ups to the previous v3.4 commit (ba8d6b7): ## #49 — client uses BridgeEndpoint ports as authoritative BridgesDiscoveryWatcher now keeps a second snapshot (`Arc<RwLock<Vec<BridgeEndpoint>>>`) for the per-transport endpoints carried by v3.4 manifests, alongside the existing flat-bridges snapshot for v3.3 compatibility. `endpoints_snapshot()` and `primary_endpoint()` expose it to the client. In `client::run`, immediately after the watcher loads, the primary endpoint's per-transport ports override the dial-time `dial_cfg.endpoints.{tcp,quic,udp}` *ports*. The IP stays whatever the dialer already resolved (server_addr / bridge list). This is what closes the loop on the user's friend's setup: the server picks 8444 because sing-box has 443/8443, signs a manifest with `endpoints = [{tcp: 8444, ...}]`, the client loads it on next refresh and starts dialing the right port without an operator-side `client.toml` edit. When the manifest has no `endpoints` field (old v3.3 format, or operator chose not to publish per-transport ports), no override is applied and the client.toml `[transport] *_port` values are used as before. 
## #45 — silent client exit on broken connection Root cause confirmed in `AuraRouter::run`: - the inbound task did `let pkt = inbound_conn.recv_packet().await?;`, so any recv error returned silently via `?` - the `to_tun_tx` channel sender dropped, `to_tun_rx.recv()` returned `None` - the outbound `select!` arm matched `None => break Ok(())` - the router returned `Ok(())`, the client's `run()` returned `Ok(())`, the process exited 0 with no log, no error message We saw this empirically when the user disabled a co-resident VPN that had been routing AuraVPN's UDP/444 traffic — the underlying QUIC socket broke, the inbound task hit recv error, and the whole client vanished. Fix: - Inbound task now logs the error at `error` level with the underlying `recv_packet` cause before exiting. - The outbound `select!`'s `None` arm now returns an Err (not Ok(())) so the caller knows the tunnel died and `aura client` exits non-zero — which is what a supervisor (systemd, launchd, or a future auto-redial loop) wants to see. - The router waits up to 200ms for the inbound task to land cleanly before returning, so its error / panic is logged instead of being swallowed by `abort()`. Existing tests still pass (12/12 in aura-tunnel router tests). Tested manually: with the fix, killing the underlying transport now produces a "peer connection broke (recv_packet failed): …" error line and a non-zero exit, instead of silent process disappearance. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.7 --- crates/aura-cli/src/bridges.rs | 26 ++++++++++++++++++++++ crates/aura-cli/src/client.rs | 30 +++++++++++++++++++++++++ crates/aura-tunnel/src/router.rs | 38 ++++++++++++++++++++++++++++---- 3 files changed, 90 insertions(+), 4 deletions(-) diff --git a/crates/aura-cli/src/bridges.rs b/crates/aura-cli/src/bridges.rs index 3bb9e19..d9fc254 100644 --- a/crates/aura-cli/src/bridges.rs +++ b/crates/aura-cli/src/bridges.rs @@ -333,6 +333,10 @@ impl BridgeManifest { pub struct BridgesDiscoveryWatcher { /// The current effective merged list (static + manifest, de-duplicated by `SocketAddr`). snapshot: Arc>>, + /// v3.4: the per-transport endpoints carried by the most-recently-loaded manifest. Empty + /// when the manifest has no `endpoints` field (v3.3-format manifest, or v3.4 manifest where + /// the operator opted not to publish per-transport ports). + endpoints_snapshot: Arc>>, /// The static list from `[client] bridges` (used as a fallback when the manifest is missing). static_bridges: Vec, /// File path of the signed manifest. @@ -355,8 +359,10 @@ impl BridgesDiscoveryWatcher { static_bridges: Vec, ) -> Self { let snapshot = Arc::new(RwLock::new(static_bridges.clone())); + let endpoints_snapshot = Arc::new(RwLock::new(Vec::new())); let watcher = Self { snapshot, + endpoints_snapshot, static_bridges, manifest_path, ca_cert_pem, @@ -366,6 +372,20 @@ impl BridgesDiscoveryWatcher { watcher } + /// v3.4: clone of the per-transport endpoint snapshot. Empty when the manifest has no + /// `endpoints` field. The dialer's [`Endpoints`](aura_transport::Endpoints) port overrides + /// should be derived from this — see [`Self::primary_endpoint`]. + pub async fn endpoints_snapshot(&self) -> Vec { + self.endpoints_snapshot.read().await.clone() + } + + /// v3.4: first endpoint from the snapshot, when present. 
Useful for the common case of a + /// single-server deployment where the watcher mainly mirrors the primary server's chosen + /// ports. + pub async fn primary_endpoint(&self) -> Option { + self.endpoints_snapshot.read().await.first().cloned() + } + /// Snapshot handle: clones of this `Arc>` can be read concurrently by the dial loop. pub fn handle(&self) -> Arc>> { Arc::clone(&self.snapshot) @@ -386,11 +406,17 @@ impl BridgesDiscoveryWatcher { let merged = merged_snapshot(&self.static_bridges, &manifest.parsed_bridges()); let merged_len = merged.len(); *self.snapshot.write().await = merged; + // v3.4: copy the per-transport endpoints over too. They drive dial-time port + // overrides on the client (see [`crate::client::run`]). Old v3.3 manifests have + // an empty `endpoints` field and the snapshot just clears. + let endpoints_len = manifest.endpoints.len(); + *self.endpoints_snapshot.write().await = manifest.endpoints.clone(); tracing::info!( path = %self.manifest_path.display(), generated_at = manifest.generated_at, expires_at = manifest.expires_at, manifest_bridges = manifest.bridges.len(), + manifest_endpoints = endpoints_len, merged_total = merged_len, "loaded signed bridges manifest" ); diff --git a/crates/aura-cli/src/client.rs b/crates/aura-cli/src/client.rs index 27cde03..ac97a26 100644 --- a/crates/aura-cli/src/client.rs +++ b/crates/aura-cli/src/client.rs @@ -131,6 +131,36 @@ pub async fn run(config_path: &Path, admin_socket: &str) -> anyhow::Result<()> { // returned JoinHandle. Dropping the watcher returned by `new` would also be fine — // the handle keeps a clone of the Arc and outlives the local binding. let _bg = watcher.spawn_refresh(); + // v3.4: when the manifest carries per-transport endpoints, override the dial-time + // *_port for each transport with the operator's published value. 
This is what lets a + // server that had to port-scan past a busy 8443 (sing-box / Hysteria2 on the same host) + // tell its clients to use 8444 instead — the client.toml's static [transport] ports + // become only the bootstrap fallback. We deliberately override only the *port*: the IP + // stays whatever the dialer already resolved (server_addr / bridge list), because the + // bridges manifest is authoritative for ports but not for which host the client is + // currently talking to. + if let Some(ep) = watcher.primary_endpoint().await { + let mut applied = Vec::new(); + if let (Some(port), Some(addr)) = (ep.tcp, dial_cfg.endpoints.tcp) { + dial_cfg.endpoints.tcp = Some(std::net::SocketAddr::new(addr.ip(), port)); + applied.push(format!("tcp={}", port)); + } + if let (Some(port), Some(addr)) = (ep.quic, dial_cfg.endpoints.quic) { + dial_cfg.endpoints.quic = Some(std::net::SocketAddr::new(addr.ip(), port)); + applied.push(format!("quic={}", port)); + } + if let (Some(port), Some(addr)) = (ep.udp, dial_cfg.endpoints.udp) { + dial_cfg.endpoints.udp = Some(std::net::SocketAddr::new(addr.ip(), port)); + applied.push(format!("udp={}", port)); + } + if !applied.is_empty() { + tracing::info!( + endpoint_host = %ep.host, + overrides = %applied.join(","), + "v3.4 manifest endpoints override dial-time transport ports" + ); + } + } tracing::info!( path = %manifest_path.display(), refresh_interval_secs = refresh_secs, diff --git a/crates/aura-tunnel/src/router.rs b/crates/aura-tunnel/src/router.rs index 2c9e965..ac08f3e 100644 --- a/crates/aura-tunnel/src/router.rs +++ b/crates/aura-tunnel/src/router.rs @@ -144,7 +144,22 @@ impl AuraRouter

{ let inbound_conn = Arc::clone(&self.conn); let inbound = tokio::spawn(async move { loop { - let pkt = inbound_conn.recv_packet().await?; + let pkt = match inbound_conn.recv_packet().await { + Ok(p) => p, + Err(e) => { + // v3.4 fix for #45 (silent client exit): the inbound task used to swallow + // this error and ride out via `?`, so when the underlying transport broke + // (e.g. a co-resident VPN's UDP socket got remapped) the outbound select! + // saw a clean `None` and returned `Ok(())`. No log, no exit message, no + // reconnect hint. Now we log loudly with the real cause before propagating. + let err_str = e.to_string(); + tracing::error!( + error = %err_str, + "peer connection broke (recv_packet failed); client is exiting" + ); + return Err(anyhow::anyhow!("recv_packet from peer failed: {err_str}")); + } + }; if to_tun_tx.send(pkt).await.is_err() { // TUN owner loop has stopped; nothing more to do. break; @@ -177,14 +192,29 @@ impl AuraRouter

{ c.inc_rx(); } } - // Inbound task ended (connection closed/errored). - None => break Ok(()), + // Inbound task ended. Either gracefully (we drove `to_tun_tx` drop via the + // outbound side exiting first — unreachable here since we'd still be inside + // the select), or because the peer connection broke. v3.4: surface as an + // error so `aura client` exits non-zero and a supervisor (systemd, launchd, + // a future auto-redial loop) knows the tunnel died. The inbound task itself + // already logged the underlying cause at error level. + None => break Err(anyhow::anyhow!( + "peer connection closed; router shutting down (see preceding error log for cause)" + )), } } } }; - inbound.abort(); + // Wait for the inbound task to land so we can surface its error rather than just abort() + // it (which would silently drop the underlying cause). Bounded by a short timeout so a + // stuck inbound future cannot wedge shutdown. + match tokio::time::timeout(std::time::Duration::from_millis(200), inbound).await { + Ok(Ok(Ok(()))) => {} + Ok(Ok(Err(e))) => tracing::warn!(error = %e, "inbound task exited with error"), + Ok(Err(join_err)) => tracing::warn!(error = %join_err, "inbound task panicked"), + Err(_) => tracing::warn!("inbound task did not exit within 200ms; abandoning"), + } result }