Skip to content

Commit

Permalink
Upload crashes to collab directly (zed-industries#8649)
Browse files Browse the repository at this point in the history
This lets us run rustc_demangle on the backtrace, which helps the Slack
view significantly.

We're also now uploading files to DigitalOcean's S3 equivalent (with a
1-month expiry) instead of to Slack.

This PR paves the way for (but does not yet implement) sending this data
to ClickHouse too.

Release Notes:

- N/A
  • Loading branch information
ConradIrwin authored Mar 1, 2024
1 parent cdf702a commit 64460e4
Show file tree
Hide file tree
Showing 13 changed files with 657 additions and 27 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,7 @@ tree-sitter-gomod = { git = "https://github.com/camdencheek/tree-sitter-go-mod"
tree-sitter-gowork = { git = "https://github.com/d1y/tree-sitter-go-work" }
tree-sitter-haskell = { git = "https://github.com/tree-sitter/tree-sitter-haskell", rev = "8a99848fc734f9c4ea523b3f2a07df133cbbcec2" }
tree-sitter-hcl = { git = "https://github.com/MichaHoffmann/tree-sitter-hcl", rev = "v1.1.0" }
rustc-demangle = "0.1.23"
tree-sitter-heex = { git = "https://github.com/phoenixframework/tree-sitter-heex", rev = "2e1348c3cf2c9323e87c2744796cf3f3868aa82a" }
tree-sitter-html = "0.19.0"
tree-sitter-json = { git = "https://github.com/tree-sitter/tree-sitter-json", rev = "40a81c01a40ac48744e0c8ccabbaba1920441199" }
Expand Down
2 changes: 2 additions & 0 deletions crates/collab/.env.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,7 @@ ZED_CLIENT_CHECKSUM_SEED = "development-checksum-seed"
# CLICKHOUSE_PASSWORD = ""
# CLICKHOUSE_DATABASE = "default"

# SLACK_PANICS_WEBHOOK = ""

# RUST_LOG=info
# LOG_JSON=true
7 changes: 2 additions & 5 deletions crates/collab/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,11 @@ version = "0.44.0"
publish = false
license = "AGPL-3.0-or-later"

[features]
seed-support = ["reqwest"]

[[bin]]
name = "collab"

[[bin]]
name = "seed"
required-features = ["seed-support"]

[dependencies]
anyhow.workspace = true
Expand All @@ -40,7 +36,7 @@ parking_lot.workspace = true
prometheus = "0.13"
prost.workspace = true
rand.workspace = true
reqwest = { version = "0.11", features = ["json"], optional = true }
reqwest = { version = "0.11", features = ["json"] }
rpc.workspace = true
scrypt = "0.7"
sea-orm = { version = "0.12.x", features = ["sqlx-postgres", "postgres-array", "runtime-tokio-rustls", "with-uuid"] }
Expand All @@ -50,6 +46,7 @@ serde_derive.workspace = true
serde_json.workspace = true
sha2.workspace = true
sqlx = { version = "0.7", features = ["runtime-tokio-rustls", "postgres", "json", "time", "uuid", "any"] }
rustc-demangle.workspace = true
telemetry_events.workspace = true
text.workspace = true
time.workspace = true
Expand Down
5 changes: 5 additions & 0 deletions crates/collab/k8s/collab.template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,11 @@ spec:
secretKeyRef:
name: clickhouse
key: database
- name: SLACK_PANICS_WEBHOOK
valueFrom:
secretKeyRef:
name: slack
key: panics_webhook
- name: INVITE_LINK_PREFIX
value: ${INVITE_LINK_PREFIX}
- name: RUST_BACKTRACE
Expand Down
18 changes: 2 additions & 16 deletions crates/collab/src/api.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
pub mod events;
pub mod extensions;
pub mod ips_file;
pub mod slack;

use crate::{
auth,
Expand All @@ -21,15 +23,13 @@ use chrono::SecondsFormat;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use tower::ServiceBuilder;
use tracing::instrument;

pub use extensions::fetch_extensions_from_blob_store_periodically;

pub fn routes(rpc_server: Option<Arc<rpc::Server>>, state: Arc<AppState>) -> Router<Body> {
Router::new()
.route("/user", get(get_authenticated_user))
.route("/users/:id/access_tokens", post(create_access_token))
.route("/panic", post(trace_panic))
.route("/rpc_server_snapshot", get(get_rpc_server_snapshot))
.route("/contributors", get(get_contributors).post(add_contributor))
.route("/contributor", get(check_is_contributor))
Expand Down Expand Up @@ -120,20 +120,6 @@ struct CreateUserResponse {
metrics_id: String,
}

/// JSON request body deserialized by the `trace_panic` handler (the
/// `POST /panic` route). Every field is logged verbatim by that handler.
#[derive(Debug, Deserialize)]
struct Panic {
// Version string reported by the client.
version: String,
// Release channel reported by the client.
release_channel: String,
// NOTE(review): presumably a hash identifying the backtrace so identical
// panics can be grouped — confirm against the client that sends it.
backtrace_hash: String,
// Free-form text of the panic report.
text: String,
}

#[instrument(skip(panic))]
async fn trace_panic(panic: Json<Panic>) -> Result<()> {
tracing::error!(version = %panic.version, release_channel = %panic.release_channel, backtrace_hash = %panic.backtrace_hash, text = %panic.text, "panic report");
Ok(())
}

async fn get_rpc_server_snapshot(
Extension(rpc_server): Extension<Option<Arc<rpc::Server>>>,
) -> Result<ErasedJson> {
Expand Down
148 changes: 144 additions & 4 deletions crates/collab/src/api/events.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,27 @@
use std::sync::{Arc, OnceLock};

use anyhow::{anyhow, Context};
use aws_sdk_s3::primitives::ByteStream;
use axum::{
body::Bytes, headers::Header, http::HeaderName, routing::post, Extension, Router, TypedHeader,
};
use hyper::StatusCode;
use hyper::{HeaderMap, StatusCode};
use serde::{Serialize, Serializer};
use sha2::{Digest, Sha256};
use telemetry_events::{
ActionEvent, AppEvent, AssistantEvent, CallEvent, CopilotEvent, CpuEvent, EditEvent,
EditorEvent, Event, EventRequestBody, EventWrapper, MemoryEvent, SettingEvent,
};
use util::SemanticVersion;

use crate::{AppState, Error, Result};
use crate::{api::slack, AppState, Error, Result};

use super::ips_file::IpsFile;

/// Builds the telemetry router, mapping `POST /telemetry/events` to
/// `post_events` and `POST /telemetry/crashes` to `post_crash`.
pub fn router() -> Router {
// NOTE(review): the single-line builder immediately below appears to be the
// pre-change side of this diff hunk (events route only); the chained builder
// after it is the post-change version that also registers the crashes route.
Router::new().route("/telemetry/events", post(post_events))
Router::new()
.route("/telemetry/events", post(post_events))
.route("/telemetry/crashes", post(post_crash))
}

pub struct ZedChecksumHeader(Vec<u8>);
Expand Down Expand Up @@ -73,6 +79,140 @@ impl Header for CloudflareIpCountryHeader {
}
}

pub async fn post_crash(
Extension(app): Extension<Arc<AppState>>,
body: Bytes,
headers: HeaderMap,
) -> Result<()> {
static CRASH_REPORTS_BUCKET: &str = "zed-crash-reports";

let report = IpsFile::parse(&body)?;
let version_threshold = SemanticVersion::new(0, 123, 0);

let bundle_id = &report.header.bundle_id;
let app_version = &report.app_version();

if bundle_id == "dev.zed.Zed-Dev" {
log::error!("Crash uploads from {} are ignored.", bundle_id);
return Ok(());
}

if app_version.is_none() || app_version.unwrap() < version_threshold {
log::error!(
"Crash uploads from {} are ignored.",
report.header.app_version
);
return Ok(());
}
let app_version = app_version.unwrap();

if let Some(blob_store_client) = app.blob_store_client.as_ref() {
let response = blob_store_client
.head_object()
.bucket(CRASH_REPORTS_BUCKET)
.key(report.header.incident_id.clone() + ".ips")
.send()
.await;

if response.is_ok() {
log::info!("We've already uploaded this crash");
return Ok(());
}

blob_store_client
.put_object()
.bucket(CRASH_REPORTS_BUCKET)
.key(report.header.incident_id.clone() + ".ips")
.acl(aws_sdk_s3::types::ObjectCannedAcl::PublicRead)
.body(ByteStream::from(body.to_vec()))
.send()
.await
.map_err(|e| log::error!("Failed to upload crash: {}", e))
.ok();
}

let recent_panic_on: Option<i64> = headers
.get("x-zed-panicked-on")
.and_then(|h| h.to_str().ok())
.and_then(|s| s.parse().ok());
let mut recent_panic = None;

if let Some(recent_panic_on) = recent_panic_on {
let crashed_at = match report.timestamp() {
Ok(t) => Some(t),
Err(e) => {
log::error!("Can't parse {}: {}", report.header.timestamp, e);
None
}
};
if crashed_at.is_some_and(|t| (t.timestamp_millis() - recent_panic_on).abs() <= 30000) {
recent_panic = headers.get("x-zed-panic").and_then(|h| h.to_str().ok());
}
}

let description = report.description(recent_panic);
let summary = report.backtrace_summary();

tracing::error!(
service = "client",
version = %report.header.app_version,
os_version = %report.header.os_version,
bundle_id = %report.header.bundle_id,
incident_id = %report.header.incident_id,
description = %description,
backtrace = %summary,
"crash report");

if let Some(slack_panics_webhook) = app.config.slack_panics_webhook.clone() {
let payload = slack::WebhookBody::new(|w| {
w.add_section(|s| s.text(slack::Text::markdown(description)))
.add_section(|s| {
s.add_field(slack::Text::markdown(format!(
"*Version:*\n{} ({})",
bundle_id, app_version
)))
.add_field({
let hostname = app.config.blob_store_url.clone().unwrap_or_default();
let hostname = hostname.strip_prefix("https://").unwrap_or_else(|| {
hostname.strip_prefix("http://").unwrap_or_default()
});

slack::Text::markdown(format!(
"*Incident:*\n<https://{}.{}/{}.ips|{}…>",
CRASH_REPORTS_BUCKET,
hostname,
report.header.incident_id,
report
.header
.incident_id
.chars()
.take(8)
.collect::<String>(),
))
})
})
.add_rich_text(|r| r.add_preformatted(|p| p.add_text(summary)))
});
let payload_json = serde_json::to_string(&payload).map_err(|err| {
log::error!("Failed to serialize payload to JSON: {err}");
Error::Internal(anyhow!(err))
})?;

reqwest::Client::new()
.post(slack_panics_webhook)
.header("Content-Type", "application/json")
.body(payload_json)
.send()
.await
.map_err(|err| {
log::error!("Failed to send payload to Slack: {err}");
Error::Internal(anyhow!(err))
})?;
}

Ok(())
}

pub async fn post_events(
Extension(app): Extension<Arc<AppState>>,
TypedHeader(ZedChecksumHeader(checksum)): TypedHeader<ZedChecksumHeader>,
Expand All @@ -98,7 +238,7 @@ pub async fn post_events(
summer.update(&body);
summer.update(checksum_seed);

if &checksum[..] != &summer.finalize()[..] {
if &checksum != &summer.finalize()[..] {
return Err(Error::Http(
StatusCode::BAD_REQUEST,
"invalid checksum".into(),
Expand Down
Loading

0 comments on commit 64460e4

Please sign in to comment.