zebra_network/peer/error.rs
1//! Peer-related errors.
2
3use std::{borrow::Cow, sync::Arc};
4
5use thiserror::Error;
6
7use tracing_error::TracedError;
8use zebra_chain::serialization::SerializationError;
9
10use crate::protocol::external::InventoryHash;
11
12/// A wrapper around `Arc<PeerError>` that implements `Error`.
13#[derive(Error, Debug, Clone)]
14#[error(transparent)]
15pub struct SharedPeerError(Arc<TracedError<PeerError>>);
16
17impl<E> From<E> for SharedPeerError
18where
19 PeerError: From<E>,
20{
21 fn from(source: E) -> Self {
22 Self(Arc::new(TracedError::from(PeerError::from(source))))
23 }
24}
25
26impl SharedPeerError {
27 /// Returns a debug-formatted string describing the inner [`PeerError`].
28 ///
29 /// Unfortunately, [`TracedError`] makes it impossible to get a reference to the original error.
30 pub fn inner_debug(&self) -> String {
31 format!("{:?}", self.0.as_ref())
32 }
33}
34
35/// An error related to peer connection handling.
36#[derive(Error, Debug)]
37#[allow(dead_code)]
38pub enum PeerError {
39 /// The remote peer closed the connection.
40 #[error("Peer closed connection")]
41 ConnectionClosed,
42
43 /// Zebra dropped the [`Connection`](crate::peer::Connection).
44 #[error("Internal connection dropped")]
45 ConnectionDropped,
46
47 /// Zebra dropped the [`Client`](crate::peer::Client).
48 #[error("Internal client dropped")]
49 ClientDropped,
50
51 /// A [`Client`](crate::peer::Client)'s internal connection task exited.
52 #[error("Internal peer connection task exited")]
53 ConnectionTaskExited,
54
55 /// Zebra's [`Client`](crate::peer::Client) cancelled its heartbeat task.
56 #[error("Internal client cancelled its heartbeat task")]
57 ClientCancelledHeartbeatTask,
58
59 /// Zebra's internal heartbeat task exited.
60 #[error("Internal heartbeat task exited with message: {0:?}")]
61 HeartbeatTaskExited(String),
62
63 /// Sending a message to a remote peer took too long.
64 #[error("Sending Client request timed out")]
65 ConnectionSendTimeout,
66
67 /// Receiving a response to a [`Client`](crate::peer::Client) request took too long.
68 #[error("Receiving client response timed out")]
69 ConnectionReceiveTimeout,
70
71 /// A serialization error occurred while reading or writing a message.
72 #[error("Serialization error: {0}")]
73 Serialization(#[from] SerializationError),
74
75 /// A badly-behaved remote peer sent a handshake message after the handshake was
76 /// already complete.
77 #[error("Remote peer sent handshake messages after handshake")]
78 DuplicateHandshake,
79
80 /// This node's internal services were overloaded, so the connection was dropped
81 /// to shed load.
82 #[error("Internal services over capacity")]
83 Overloaded,
84
85 /// There are no ready remote peers.
86 #[error("No ready peers available")]
87 NoReadyPeers,
88
89 /// This peer request's caused an internal service timeout, so the connection was dropped
90 /// to shed load or prevent attacks.
91 #[error("Internal services timed out")]
92 InboundTimeout,
93
94 /// This node's internal services are no longer able to service requests.
95 #[error("Internal services have failed or shutdown")]
96 ServiceShutdown,
97
98 /// We requested data, but the peer replied with a `notfound` message.
99 /// (Or it didn't respond before the request finished.)
100 ///
101 /// This error happens when the peer doesn't have any of the requested data,
102 /// so that the original request can be retried.
103 ///
104 /// This is a temporary error.
105 ///
106 /// Zebra can try different peers if the request is retried,
107 /// or peers can download and verify the missing data.
108 ///
109 /// If the peer has some of the data, the request returns an [`Ok`] response,
110 /// with any `notfound` data is marked as [`Missing`][1].
111 ///
112 /// [1]: crate::protocol::internal::InventoryResponse::Missing
113 #[error("Remote peer could not find any of the items: {0:?}")]
114 NotFoundResponse(Vec<InventoryHash>),
115
116 /// We requested data, but all our ready peers are marked as recently
117 /// [`Missing`][1] that data in our local inventory registry.
118 ///
119 /// This is a temporary error.
120 ///
121 /// Peers with the inventory can finish their requests and become ready, or
122 /// other peers can download and verify the missing data.
123 ///
124 /// # Correctness
125 ///
126 /// This error is produced using Zebra's local inventory registry, without
127 /// contacting any peers.
128 ///
129 /// Client responses containing this error must not be used to update the
130 /// inventory registry. This makes sure that we eventually expire our local
131 /// cache of missing inventory, and send requests to peers again.
132 ///
133 /// [1]: crate::protocol::internal::InventoryResponse::Missing
134 #[error("All ready peers are registered as recently missing these items: {0:?}")]
135 NotFoundRegistry(Vec<InventoryHash>),
136}
137
138impl PeerError {
139 /// Returns the Zebra internal handler type as a string.
140 pub fn kind(&self) -> Cow<'static, str> {
141 match self {
142 PeerError::ConnectionClosed => "ConnectionClosed".into(),
143 PeerError::ConnectionDropped => "ConnectionDropped".into(),
144 PeerError::ClientDropped => "ClientDropped".into(),
145 PeerError::ClientCancelledHeartbeatTask => "ClientCancelledHeartbeatTask".into(),
146 PeerError::HeartbeatTaskExited(_) => "HeartbeatTaskExited".into(),
147 PeerError::ConnectionTaskExited => "ConnectionTaskExited".into(),
148 PeerError::ConnectionSendTimeout => "ConnectionSendTimeout".into(),
149 PeerError::ConnectionReceiveTimeout => "ConnectionReceiveTimeout".into(),
150 // TODO: add error kinds or summaries to `SerializationError`
151 PeerError::Serialization(inner) => format!("Serialization({inner})").into(),
152 PeerError::DuplicateHandshake => "DuplicateHandshake".into(),
153 PeerError::Overloaded => "Overloaded".into(),
154 PeerError::NoReadyPeers => "NoReadyPeers".into(),
155 PeerError::InboundTimeout => "InboundTimeout".into(),
156 PeerError::ServiceShutdown => "ServiceShutdown".into(),
157 PeerError::NotFoundResponse(_) => "NotFoundResponse".into(),
158 PeerError::NotFoundRegistry(_) => "NotFoundRegistry".into(),
159 }
160 }
161}
162
163/// A shared error slot for peer errors.
164///
165/// # Correctness
166///
167/// Error slots are shared between sync and async code. In async code, the error
168/// mutex should be held for as short a time as possible. This avoids blocking
169/// the async task thread on acquiring the mutex.
170///
171/// > If the value behind the mutex is just data, it’s usually appropriate to use a blocking mutex
172/// > ...
173/// > wrap the `Arc<Mutex<...>>` in a struct
174/// > that provides non-async methods for performing operations on the data within,
175/// > and only lock the mutex inside these methods
176///
177/// <https://docs.rs/tokio/1.15.0/tokio/sync/struct.Mutex.html#which-kind-of-mutex-should-you-use>
178#[derive(Default, Clone)]
179pub struct ErrorSlot(Arc<std::sync::Mutex<Option<SharedPeerError>>>);
180
181impl std::fmt::Debug for ErrorSlot {
182 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
183 // don't hang if the mutex is locked
184 // show the panic if the mutex was poisoned
185 f.debug_struct("ErrorSlot")
186 .field("error", &self.0.try_lock())
187 .finish()
188 }
189}
190
191impl ErrorSlot {
192 /// Read the current error in the slot.
193 ///
194 /// Returns `None` if there is no error in the slot.
195 ///
196 /// # Correctness
197 ///
198 /// Briefly locks the error slot's threaded `std::sync::Mutex`, to get a
199 /// reference to the error in the slot.
200 #[allow(clippy::unwrap_in_result)]
201 pub fn try_get_error(&self) -> Option<SharedPeerError> {
202 self.0
203 .lock()
204 .expect("error mutex should be unpoisoned")
205 .as_ref()
206 .cloned()
207 }
208
209 /// Update the current error in the slot.
210 ///
211 /// Returns `Err(AlreadyErrored)` if there was already an error in the slot.
212 ///
213 /// # Correctness
214 ///
215 /// Briefly locks the error slot's threaded `std::sync::Mutex`, to check for
216 /// a previous error, then update the error in the slot.
217 #[allow(clippy::unwrap_in_result)]
218 pub fn try_update_error(&self, e: SharedPeerError) -> Result<(), AlreadyErrored> {
219 let mut guard = self.0.lock().expect("error mutex should be unpoisoned");
220
221 if let Some(original_error) = guard.clone() {
222 Err(AlreadyErrored { original_error })
223 } else {
224 *guard = Some(e);
225 Ok(())
226 }
227 }
228}
229
230/// Error returned when the [`ErrorSlot`] already contains an error.
231#[derive(Clone, Debug)]
232pub struct AlreadyErrored {
233 /// The original error in the error slot.
234 pub original_error: SharedPeerError,
235}
236
237/// An error during a handshake with a remote peer.
238#[derive(Error, Debug)]
239pub enum HandshakeError {
240 /// The remote peer sent an unexpected message during the handshake.
241 #[error("The remote peer sent an unexpected message: {0:?}")]
242 UnexpectedMessage(Box<crate::protocol::external::Message>),
243 /// The peer connector detected handshake nonce reuse, possibly indicating self-connection.
244 #[error("Detected nonce reuse, possible self-connection")]
245 RemoteNonceReuse,
246 /// The peer connector created a duplicate random nonce. This is very unlikely,
247 /// because the range of the data type is 2^64.
248 #[error("Unexpectedly created a duplicate random local nonce")]
249 LocalDuplicateNonce,
250 /// The remote peer closed the connection.
251 #[error("Peer closed connection")]
252 ConnectionClosed,
253 /// An error occurred while performing an IO operation.
254 #[error("Underlying IO error: {0}")]
255 Io(#[from] std::io::Error),
256 /// A serialization error occurred while reading or writing a message.
257 #[error("Serialization error: {0}")]
258 Serialization(#[from] SerializationError),
259 /// The remote peer offered a version older than our minimum version.
260 #[error("Peer offered obsolete version: {0:?}")]
261 ObsoleteVersion(crate::protocol::external::types::Version),
262 /// Sending or receiving a message timed out.
263 #[error("Timeout when sending or receiving a message to peer")]
264 Timeout,
265}
266
267impl From<tokio::time::error::Elapsed> for HandshakeError {
268 fn from(_source: tokio::time::error::Elapsed) -> Self {
269 HandshakeError::Timeout
270 }
271}