From e90308ba9e25eb46bcc18bc28ba1dd9c6472ce3b Mon Sep 17 00:00:00 2001 From: lewis Date: Sun, 28 Dec 2025 12:50:22 +0200 Subject: [PATCH] No slur handles allowed --- Cargo.lock | 1 + Cargo.toml | 1 + frontend/src/lib/api.ts | 2 +- src/api/identity/account.rs | 10 +- src/api/identity/did.rs | 7 + src/api/repo/record/batch.rs | 8 +- src/api/repo/record/validation.rs | 14 +- src/api/repo/record/write.rs | 8 +- src/api/validation.rs | 6 + src/lib.rs | 1 + src/moderation/mod.rs | 262 ++++++++++++++++++++++++++++++ src/validation/mod.rs | 149 ++++++++++++++--- tests/banned_words.rs | 196 ++++++++++++++++++++++ 13 files changed, 634 insertions(+), 31 deletions(-) create mode 100644 src/moderation/mod.rs create mode 100644 tests/banned_words.rs diff --git a/Cargo.lock b/Cargo.lock index 764634d..0d43607 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6305,6 +6305,7 @@ dependencies = [ "p384", "rand 0.8.5", "redis", + "regex", "reqwest", "serde", "serde_bytes", diff --git a/Cargo.toml b/Cargo.toml index fbe7616..4da3c85 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,6 +30,7 @@ k256 = { version = "0.13.3", features = ["ecdsa", "pem", "pkcs8"] } multibase = "0.9.1" multihash = "0.19.3" rand = "0.8.5" +regex = "1" reqwest = { version = "0.12.28", features = ["json"] } serde = { version = "1.0.228", features = ["derive"] } serde_bytes = "0.11.14" diff --git a/frontend/src/lib/api.ts b/frontend/src/lib/api.ts index ac4529f..178b050 100644 --- a/frontend/src/lib/api.ts +++ b/frontend/src/lib/api.ts @@ -175,7 +175,7 @@ export const api = { }); const data = await response.json(); if (!response.ok) { - throw new ApiError(data.error, data.message, response.status); + throw new ApiError(response.status, data.error, data.message); } return data; }, diff --git a/src/api/identity/account.rs b/src/api/identity/account.rs index 4e2aa04..686cbe9 100644 --- a/src/api/identity/account.rs +++ b/src/api/identity/account.rs @@ -194,7 +194,15 @@ pub async fn create_account( .into_response(); } } - input.handle.to_lowercase() + let handle_lower = input.handle.to_lowercase(); + if crate::moderation::has_explicit_slur(&handle_lower) { + return ( + StatusCode::BAD_REQUEST, + Json(json!({"error": "InvalidHandle", "message": "Inappropriate language in handle"})), + ) + .into_response(); + } + handle_lower }; let email: Option = input .email diff --git a/src/api/identity/did.rs b/src/api/identity/did.rs index 754d943..4c67062 100644 --- a/src/api/identity/did.rs +++ b/src/api/identity/did.rs @@ -582,6 +582,13 @@ pub async fn update_handle( ) .into_response(); } + if crate::moderation::has_explicit_slur(new_handle) { + return ( + StatusCode::BAD_REQUEST, + Json(json!({"error": "InvalidHandle", "message": "Inappropriate language in handle"})), + ) + .into_response(); + } let hostname = std::env::var("PDS_HOSTNAME").unwrap_or_else(|_| "localhost".to_string()); let suffix = format!(".{}", hostname); let is_service_domain = crate::handle::is_service_domain_handle(new_handle, &hostname); diff --git a/src/api/repo/record/batch.rs b/src/api/repo/record/batch.rs index 91d7641..5eaa9af 100644 --- a/src/api/repo/record/batch.rs +++ b/src/api/repo/record/batch.rs @@ -1,4 +1,4 @@ -use super::validation::validate_record; +use super::validation::validate_record_with_rkey; use super::write::has_verified_comms_channel; use crate::api::repo::record::utils::{CommitParams, RecordOp, commit_and_log, extract_blob_cids}; use crate::delegation::{self, DelegationActionType}; @@ -304,7 +304,8 @@ pub async fn apply_writes( value, } => { if input.validate.unwrap_or(true) - && let Err(err_response) = validate_record(value, collection) + && let Err(err_response) = + validate_record_with_rkey(value, collection, rkey.as_deref()) { return *err_response; } @@ -357,7 +358,8 @@ pub async fn apply_writes( value, } => { if input.validate.unwrap_or(true) - && let Err(err_response) = validate_record(value, collection) + && let Err(err_response) = + validate_record_with_rkey(value, collection, Some(rkey)) { return *err_response; } diff --git a/src/api/repo/record/validation.rs b/src/api/repo/record/validation.rs index 84a6bcc..c705abb 100644 --- a/src/api/repo/record/validation.rs +++ b/src/api/repo/record/validation.rs @@ -7,8 +7,16 @@ use axum::{ use serde_json::json; pub fn validate_record(record: &serde_json::Value, collection: &str) -> Result<(), Box> { + validate_record_with_rkey(record, collection, None) +} + +pub fn validate_record_with_rkey( + record: &serde_json::Value, + collection: &str, + rkey: Option<&str>, +) -> Result<(), Box> { let validator = RecordValidator::new(); - match validator.validate(record, collection) { + match validator.validate_with_rkey(record, collection, rkey) { Ok(_) => Ok(()), Err(ValidationError::MissingType) => Err(Box::new(( StatusCode::BAD_REQUEST, @@ -30,6 +38,10 @@ pub fn validate_record(record: &serde_json::Value, collection: &str) -> Result<( StatusCode::BAD_REQUEST, Json(json!({"error": "InvalidRecord", "message": format!("Invalid datetime format at '{}'", path)})), ).into_response())), + Err(ValidationError::BannedContent { path }) => Err(Box::new(( + StatusCode::BAD_REQUEST, + Json(json!({"error": "InvalidRecord", "message": format!("Unacceptable slur in record at '{}'", path)})), + ).into_response())), Err(e) => Err(Box::new(( StatusCode::BAD_REQUEST, Json(json!({"error": "InvalidRecord", "message": e.to_string()})), diff --git a/src/api/repo/record/write.rs b/src/api/repo/record/write.rs index 980a0d4..69140a7 100644 --- a/src/api/repo/record/write.rs +++ b/src/api/repo/record/write.rs @@ -1,4 +1,4 @@ -use super::validation::validate_record; +use super::validation::validate_record_with_rkey; use crate::api::repo::record::utils::{CommitParams, RecordOp, commit_and_log, extract_blob_cids}; use crate::delegation::{self, DelegationActionType}; use crate::repo::tracking::TrackingBlockStore; @@ -257,7 +257,8 @@ pub async fn create_record( } }; if input.validate.unwrap_or(true) - && let Err(err_response) = validate_record(&input.record, &input.collection) + && let Err(err_response) = + validate_record_with_rkey(&input.record, &input.collection, input.rkey.as_deref()) { return *err_response; } @@ -480,7 +481,8 @@ pub async fn put_record( }; let key = format!("{}/{}", collection_nsid, input.rkey); if input.validate.unwrap_or(true) - && let Err(err_response) = validate_record(&input.record, &input.collection) + && let Err(err_response) = + validate_record_with_rkey(&input.record, &input.collection, Some(&input.rkey)) { return *err_response; } diff --git a/src/api/validation.rs b/src/api/validation.rs index 00b2347..8d73839 100644 --- a/src/api/validation.rs +++ b/src/api/validation.rs @@ -16,6 +16,7 @@ pub enum HandleValidationError { StartsWithInvalidChar, EndsWithInvalidChar, ContainsSpaces, + BannedWord, } impl std::fmt::Display for HandleValidationError { @@ -41,6 +42,7 @@ impl std::fmt::Display for HandleValidationError { } Self::EndsWithInvalidChar => write!(f, "Handle cannot end with a hyphen or underscore"), Self::ContainsSpaces => write!(f, "Handle cannot contain spaces"), + Self::BannedWord => write!(f, "Inappropriate language in handle"), } } } @@ -82,6 +84,10 @@ pub fn validate_short_handle(handle: &str) -> Result> = OnceLock::new(); +static EXTRA_BANNED_WORDS: OnceLock> = OnceLock::new(); + +fn get_slur_regexes() -> &'static Vec { + SLUR_REGEXES.get_or_init(|| { + vec![ + Regex::new(r"\b[cĆćĈĉČčĊċÇçḈḉȻȼꞒꞓꟄꞔƇƈɕ][hĤĥȞȟḦḧḢḣḨḩḤḥḪḫH̱ẖĦħⱧⱨꞪɦꞕΗНн][iÍíi̇́Ììi̇̀ĬĭÎîǏǐÏïḮḯĨĩi̇̃ĮįĮ́į̇́Į̃į̇̃ĪīĪ̀ī̀ỈỉȈȉI̋i̋ȊȋỊịꞼꞽḬḭƗɨᶖİiIıIi1lĺľļḷḹl̃ḽḻłŀƚꝉⱡɫɬꞎꬷꬸꬹᶅɭȴLl][nŃńǸǹŇňÑñṄṅŅņṆṇṊṋṈṉN̈n̈ƝɲŊŋꞐꞑꞤꞥᵰᶇɳȵꬻꬼИиПпNn][kḰḱǨǩĶķḲḳḴḵƘƙⱩⱪᶄꝀꝁꝂꝃꝄꝅꞢꞣ][sŚśṤṥŜŝŠšṦṧṠṡŞşṢṣṨṩȘșS̩s̩ꞨꞩⱾȿꟅʂᶊᵴ]?\b").unwrap(), + Regex::new(r"\b[cĆćĈĉČčĊċÇçḈḉȻȼꞒꞓꟄꞔƇƈɕ][ÓóÒòŎŏÔôỐốỒồỖỗỔổǑǒÖöȪȫŐőÕõṌṍṎṏȬȭȮȯO͘o͘ȰȱØøǾǿǪǫǬǭŌōṒṓṐṑỎỏȌȍȎȏƠơỚớỜờỠỡỞởỢợỌọỘộO̩o̩Ò̩ò̩Ó̩ó̩ƟɵꝊꝋꝌꝍⱺOo0]{2}[nŃńǸǹŇňÑñṄṅŅņṆṇṊṋṈṉN̈n̈ƝɲŊŋꞐꞑꞤꞥᵰᶇɳȵꬻꬼИиПпNn][sŚśṤṥŜŝŠšṦṧṠṡŞşṢṣṨṩȘșS̩s̩ꞨꞩⱾȿꟅʂᶊᵴ]?\b").unwrap(), + Regex::new(r"\b[fḞḟƑƒꞘꞙᵮᶂ][aÁáÀàĂăẮắẰằẴẵẲẳÂâẤấẦầẪẫẨẩǍǎÅåǺǻÄäǞǟÃãȦȧǠǡĄąĄ́ą́Ą̃ą̃ĀāĀ̀ā̀ẢảȀȁA̋a̋ȂȃẠạẶặẬậḀḁȺⱥꞺꞻᶏẚAa@4][gǴǵĞğĜĝǦǧĠġG̃g̃ĢģḠḡǤǥꞠꞡƓɠᶃꬶGg]{1,2}([ÓóÒòŎŏÔôỐốỒồỖỗỔổǑǒÖöȪȫŐőÕõṌṍṎṏȬȭȮȯO͘o͘ȰȱØøǾǿǪǫǬǭŌōṒṓṐṑỎỏȌȍȎȏƠơỚớỜờỠỡỞởỢợỌọỘộO̩o̩Ò̩ò̩Ó̩ó̩ƟɵꝊꝋꝌꝍⱺOo0e3ЄєЕеÉéÈèĔĕÊêẾếỀềỄễỂểÊ̄ê̄Ê̌ê̌ĚěËëẼẽĖėĖ́ė́Ė̃ė̃ȨȩḜḝĘęĘ́ę́Ę̃ę̃ĒēḖḗḔḕẺẻȄȅE̋e̋ȆȇẸẹỆệḘḙḚḛɆɇE̩e̩È̩è̩É̩é̩ᶒⱸꬴꬳEeiÍíi̇́Ììi̇̀ĬĭÎîǏǐÏïḮḯĨĩi̇̃ĮįĮ́į̇́Į̃į̇̃ĪīĪ̀ī̀ỈỉȈȉI̋i̋ȊȋỊịꞼꞽḬḭƗɨᶖİiIıIi1lĺľļḷḹl̃ḽḻłŀƚꝉⱡɫɬꞎꬷꬸꬹᶅɭȴLl][tŤťṪṫŢţṬṭȚțṰṱṮṯŦŧȾⱦƬƭƮʈT̈ẗᵵƫȶ]{1,2}([rŔŕŘřṘṙŖŗȐȑȒȓṚṛṜṝṞṟR̃r̃ɌɍꞦꞧⱤɽᵲᶉꭉ][yÝýỲỳŶŷY̊ẙŸÿỸỹẎẏȲȳỶỷỴỵɎɏƳƴỾỿ]|[rŔŕŘřṘṙŖŗȐȑȒȓṚṛṜṝṞṟR̃r̃ɌɍꞦꞧⱤɽᵲᶉꭉ][iÍíi̇́Ììi̇̀ĬĭÎîǏǐÏïḮḯĨĩi̇̃ĮįĮ́į̇́Į̃į̇̃ĪīĪ̀ī̀ỈỉȈȉI̋i̋ȊȋỊịꞼꞽḬḭƗɨᶖİiIıIi1lĺľļḷḹl̃ḽḻłŀƚꝉⱡɫɬꞎꬷꬸꬹᶅɭȴLl][e3ЄєЕеÉéÈèĔĕÊêẾếỀềỄễỂểÊ̄ê̄Ê̌ê̌ĚěËëẼẽĖėĖ́ė́Ė̃ė̃ȨȩḜḝĘęĘ́ę́Ę̃ę̃ĒēḖḗḔḕẺẻȄȅE̋e̋ȆȇẸẹỆệḘḙḚḛɆɇE̩e̩È̩è̩É̩é̩ᶒⱸꬴꬳEe])?)?[sŚśṤṥŜŝŠšṦṧṠṡŞşṢṣṨṩȘșS̩s̩ꞨꞩⱾȿꟅʂᶊᵴ]?\b").unwrap(), + Regex::new(r"\b[kḰḱǨǩĶķḲḳḴḵƘƙⱩⱪᶄꝀꝁꝂꝃꝄꝅꞢꞣ][iÍíi̇́Ììi̇̀ĬĭÎîǏǐÏïḮḯĨĩi̇̃ĮįĮ́į̇́Į̃į̇̃ĪīĪ̀ī̀ỈỉȈȉI̋i̋ȊȋỊịꞼꞽḬḭƗɨᶖİiIıIi1lĺľļḷḹl̃ḽḻłŀƚꝉⱡɫɬꞎꬷꬸꬹᶅɭȴLlyÝýỲỳŶŷY̊ẙŸÿỸỹẎẏȲȳỶỷỴỵɎɏƳƴỾỿ][kḰḱǨǩĶķḲḳḴḵƘƙⱩⱪᶄꝀꝁꝂꝃꝄꝅꞢꞣ][e3ЄєЕеÉéÈèĔĕÊêẾếỀềỄễỂểÊ̄ê̄Ê̌ê̌ĚěËëẼẽĖėĖ́ė́Ė̃ė̃ȨȩḜḝĘęĘ́ę́Ę̃ę̃ĒēḖḗḔḕẺẻȄȅE̋e̋ȆȇẸẹỆệḘḙḚḛɆɇE̩e̩È̩è̩É̩é̩ᶒⱸꬴꬳEe]([rŔŕŘřṘṙŖŗȐȑȒȓṚṛṜṝṞṟR̃r̃ɌɍꞦꞧⱤɽᵲᶉꭉ][yÝýỲỳŶŷY̊ẙŸÿỸỹẎẏȲȳỶỷỴỵɎɏƳƴỾỿ]|[rŔŕŘřṘṙŖŗȐȑȒȓṚṛṜṝṞṟR̃r̃ɌɍꞦꞧⱤɽᵲᶉꭉ][iÍíi̇́Ììi̇̀ĬĭÎîǏǐÏïḮḯĨĩi̇̃ĮįĮ́į̇́Į̃į̇̃ĪīĪ̀ī̀ỈỉȈȉI̋i̋ȊȋỊịꞼꞽḬḭƗɨᶖİiIıIi1lĺľļḷḹl̃ḽḻłŀƚꝉⱡɫɬꞎꬷꬸꬹᶅɭȴLl][e3ЄєЕеÉéÈèĔĕÊêẾếỀềỄễỂểÊ̄ê̄Ê̌ê̌ĚěËëẼẽĖėĖ́ė́Ė̃ė̃ȨȩḜḝĘęĘ́ę́Ę̃ę̃ĒēḖḗḔḕẺẻȄȅE̋e̋ȆȇẸẹỆệḘḙḚḛɆɇE̩e̩È̩è̩É̩é̩ᶒⱸꬴꬳEe])?[sŚśṤṥŜŝŠšṦṧṠṡŞşṢṣṨṩȘșS̩s̩ꞨꞩⱾȿꟅʂᶊᵴ]*\b").unwrap(), + Regex::new(r"\b[nŃńǸǹŇňÑñṄṅŅņṆṇṊṋṈṉN̈n̈ƝɲŊŋꞐꞑꞤꞥᵰᶇɳȵꬻꬼИиПпNn][iÍíi̇́Ììi̇̀ĬĭÎîǏǐÏïḮḯĨĩi̇̃ĮįĮ́į̇́Į̃į̇̃ĪīĪ̀ī̀ỈỉȈȉI̋i̋ȊȋỊịꞼꞽḬḭƗɨᶖİiIıIi1lĺľļḷḹl̃ḽḻłŀƚꝉⱡɫɬꞎꬷꬸꬹᶅɭȴLloÓóÒòŎŏÔôỐốỒồỖỗỔổǑǒÖöȪȫŐőÕõṌṍṎṏȬȭȮȯO͘o͘ȰȱØøǾǿǪǫǬǭŌōṒṓṐṑỎỏȌȍȎȏƠơỚớỜờỠỡỞởỢợỌọỘộO̩o̩Ò̩ò̩Ó̩ó̩ƟɵꝊꝋꝌꝍⱺOoІіa4ÁáÀàĂăẮắẰằẴẵẲẳÂâẤấẦầẪẫẨẩǍǎÅåǺǻÄäǞǟÃãȦȧǠǡĄąĄ́ą́Ą̃ą̃ĀāĀ̀ā̀ẢảȀȁA̋a̋ȂȃẠạẶặẬậḀḁȺⱥꞺꞻᶏẚAa][gǴǵĞğĜĝǦǧĠġG̃g̃ĢģḠḡǤǥꞠꞡƓɠᶃꬶGgqꝖꝗꝘꝙɋʠ]{2}(l[e3ЄєЕеÉéÈèĔĕÊêẾếỀềỄễỂểÊ̄ê̄Ê̌ê̌ĚěËëẼẽĖėĖ́ė́Ė̃ė̃ȨȩḜḝĘęĘ́ę́Ę̃ę̃ĒēḖḗḔḕẺẻȄȅE̋e̋ȆȇẸẹỆệḘḙḚḛɆɇE̩e̩È̩è̩É̩é̩ᶒⱸꬴꬳEe]t|[e3ЄєЕеÉéÈèĔĕÊêẾếỀềỄễỂểÊ̄ê̄Ê̌ê̌ĚěËëẼẽĖėĖ́ė́Ė̃ė̃ȨȩḜḝĘęĘ́ę́Ę̃ę̃ĒēḖḗḔḕẺẻȄȅE̋e̋ȆȇẸẹỆệḘḙḚḛɆɇE̩e̩È̩è̩É̩é̩ᶒⱸꬴꬳEeaÁáÀàĂăẮắẰằẴẵẲẳÂâẤấẦầẪẫẨẩǍǎÅåǺǻÄäǞǟÃãȦȧǠǡĄąĄ́ą́Ą̃ą̃ĀāĀ̀ā̀ẢảȀȁA̋a̋ȂȃẠạẶặẬậḀḁȺⱥꞺꞻᶏẚAa][rŔŕŘřṘṙŖŗȐȑȒȓṚṛṜṝṞṟR̃r̃ɌɍꞦꞧⱤɽᵲᶉꭉ]?|n[ÓóÒòŎŏÔôỐốỒồỖỗỔổǑǒÖöȪȫŐőÕõṌṍṎṏȬȭȮȯO͘o͘ȰȱØøǾǿǪǫǬǭŌōṒṓṐṑỎỏȌȍȎȏƠơỚớỜờỠỡỞởỢợỌọỘộO̩o̩Ò̩ò̩Ó̩ó̩ƟɵꝊꝋꝌꝍⱺOo0][gǴǵĞğĜĝǦǧĠġG̃g̃ĢģḠḡǤǥꞠꞡƓɠᶃꬶGgqꝖꝗꝘꝙɋʠ]|[a4ÁáÀàĂăẮắẰằẴẵẲẳÂâẤấẦầẪẫẨẩǍǎÅåǺǻÄäǞǟÃãȦȧǠǡĄąĄ́ą́Ą̃ą̃ĀāĀ̀ā̀ẢảȀȁA̋a̋ȂȃẠạẶặẬậḀḁȺⱥꞺꞻᶏẚAa]?)?[sŚśṤṥŜŝŠšṦṧṠṡŞşṢṣṨṩȘșS̩s̩ꞨꞩⱾȿꟅʂᶊᵴ]?\b").unwrap(), + Regex::new(r"[nŃńǸǹŇňÑñṄṅŅņṆṇṊṋṈṉN̈n̈ƝɲŊŋꞐꞑꞤꞥᵰᶇɳȵꬻꬼИиПпNn][iÍíi̇́Ììi̇̀ĬĭÎîǏǐÏïḮḯĨĩi̇̃ĮįĮ́į̇́Į̃į̇̃ĪīĪ̀ī̀ỈỉȈȉI̋i̋ȊȋỊịꞼꞽḬḭƗɨᶖİiIıIi1lĺľļḷḹl̃ḽḻłŀƚꝉⱡɫɬꞎꬷꬸꬹᶅɭȴLloÓóÒòŎŏÔôỐốỒồỖỗỔổǑǒÖöȪȫŐőÕõṌṍṎṏȬȭȮȯO͘o͘ȰȱØøǾǿǪǫǬǭŌōṒṓṐṑỎỏȌȍȎȏƠơỚớỜờỠỡỞởỢợỌọỘộO̩o̩Ò̩ò̩Ó̩ó̩ƟɵꝊꝋꝌꝍⱺOoІіa4ÁáÀàĂăẮắẰằẴẵẲẳÂâẤấẦầẪẫẨẩǍǎÅåǺǻÄäǞǟÃãȦȧǠǡĄąĄ́ą́Ą̃ą̃ĀāĀ̀ā̀ẢảȀȁA̋a̋ȂȃẠạẶặẬậḀḁȺⱥꞺꞻᶏẚAa][gǴǵĞğĜĝǦǧĠġG̃g̃ĢģḠḡǤǥꞠꞡƓɠᶃꬶGgqꝖꝗꝘꝙɋʠ]{2}(l[e3ЄєЕеÉéÈèĔĕÊêẾếỀềỄễỂểÊ̄ê̄Ê̌ê̌ĚěËëẼẽĖėĖ́ė́Ė̃ė̃ȨȩḜḝĘęĘ́ę́Ę̃ę̃ĒēḖḗḔḕẺẻȄȅE̋e̋ȆȇẸẹỆệḘḙḚḛɆɇE̩e̩È̩è̩É̩é̩ᶒⱸꬴꬳEe]t|[e3ЄєЕеÉéÈèĔĕÊêẾếỀềỄễỂểÊ̄ê̄Ê̌ê̌ĚěËëẼẽĖėĖ́ė́Ė̃ė̃ȨȩḜḝĘęĘ́ę́Ę̃ę̃ĒēḖḗḔḕẺẻȄȅE̋e̋ȆȇẸẹỆệḘḙḚḛɆɇE̩e̩È̩è̩É̩é̩ᶒⱸꬴꬳEe][rŔŕŘřṘṙŖŗȐȑȒȓṚṛṜṝṞṟR̃r̃ɌɍꞦꞧⱤɽᵲᶉꭉ])[sŚśṤṥŜŝŠšṦṧṠṡŞşṢṣṨṩȘșS̩s̩ꞨꞩⱾȿꟅʂᶊᵴ]?").unwrap(), + Regex::new(r"\b[tŤťṪṫŢţṬṭȚțṰṱṮṯŦŧȾⱦƬƭƮʈT̈ẗᵵƫȶ][rŔŕŘřṘṙŖŗȐȑȒȓṚṛṜṝṞṟR̃r̃ɌɍꞦꞧⱤɽᵲᶉꭉ][aÁáÀàĂăẮắẰằẴẵẲẳÂâẤấẦầẪẫẨẩǍǎÅåǺǻÄäǞǟÃãȦȧǠǡĄąĄ́ą́Ą̃ą̃ĀāĀ̀ā̀ẢảȀȁA̋a̋ȂȃẠạẶặẬậḀḁȺⱥꞺꞻᶏẚAa4]+[nŃńǸǹŇňÑñṄṅŅņṆṇṊṋṈṉN̈n̈ƝɲŊŋꞐꞑꞤꞥᵰᶇɳȵꬻꬼИиПпNn]{1,2}([iÍíi̇́Ììi̇̀ĬĭÎîǏǐÏïḮḯĨĩi̇̃ĮįĮ́į̇́Į̃į̇̃ĪīĪ̀ī̀ỈỉȈȉI̋i̋ȊȋỊịꞼꞽḬḭƗɨᶖİiIıIi1lĺľļḷḹl̃ḽḻłŀƚꝉⱡɫɬꞎꬷꬸꬹᶅɭȴLl][e3ЄєЕеÉéÈèĔĕÊêẾếỀềỄễỂểÊ̄ê̄Ê̌ê̌ĚěËëẼẽĖėĖ́ė́Ė̃ė̃ȨȩḜḝĘęĘ́ę́Ę̃ę̃ĒēḖḗḔḕẺẻȄȅE̋e̋ȆȇẸẹỆệḘḙḚḛɆɇE̩e̩È̩è̩É̩é̩ᶒⱸꬴꬳEe]|[yÝýỲỳŶŷY̊ẙŸÿỸỹẎẏȲȳỶỷỴỵɎɏƳƴỾỿ]|[e3ЄєЕеÉéÈèĔĕÊêẾếỀềỄễỂểÊ̄ê̄Ê̌ê̌ĚěËëẼẽĖėĖ́ė́Ė̃ė̃ȨȩḜḝĘęĘ́ę́Ę̃ę̃ĒēḖḗḔḕẺẻȄȅE̋e̋ȆȇẸẹỆệḘḙḚḛɆɇE̩e̩È̩è̩É̩é̩ᶒⱸꬴꬳEe][rŔŕŘřṘṙŖŗȐȑȒȓṚṛṜṝṞṟR̃r̃ɌɍꞦꞧⱤɽᵲᶉꭉ])[sŚśṤṥŜŝŠšṦṧṠṡŞşṢṣṨṩȘșS̩s̩ꞨꞩⱾȿꟅʂᶊᵴ]?\b").unwrap(), + ] + }) +} + +fn get_extra_banned_words() -> &'static Vec { + EXTRA_BANNED_WORDS.get_or_init(|| { + std::env::var("PDS_BANNED_WORDS") + .unwrap_or_default() + .split(',') + .map(|s| s.trim().to_lowercase()) + .filter(|s| !s.is_empty()) + .collect() + }) +} + +fn strip_trailing_digits(s: &str) -> &str { + s.trim_end_matches(|c: char| c.is_ascii_digit()) +} + +fn normalize_leetspeak(s: &str) -> String { + s.chars() + .map(|c| match c { + '4' | '@' => 'a', + '3' => 'e', + '1' | '!' | '|' => 'i', + '0' => 'o', + '5' | '$' => 's', + '7' => 't', + '8' => 'b', + '9' => 'g', + _ => c, + }) + .collect() +} + +pub fn has_explicit_slur(text: &str) -> bool { + has_explicit_slur_with_extra_words(text, get_extra_banned_words()) +} + +fn has_explicit_slur_with_extra_words(text: &str, extra_words: &[String]) -> bool { + let text_lower = text.to_lowercase(); + let normalized = text_lower.replace('.', "").replace('-', "").replace('_', ""); + let stripped = strip_trailing_digits(&text_lower); + let normalized_stripped = strip_trailing_digits(&normalized); + + let regexes = get_slur_regexes(); + if regexes.iter().any(|r| { + r.is_match(&text_lower) + || r.is_match(&normalized) + || r.is_match(stripped) + || r.is_match(normalized_stripped) + }) { + return true; + } + + if !extra_words.is_empty() { + let leet_normalized = normalize_leetspeak(&normalized); + let leet_stripped = normalize_leetspeak(strip_trailing_digits(&leet_normalized)); + if extra_words.iter().any(|w| { + text_lower.contains(w) + || normalized.contains(w) + || stripped.contains(w) + || normalized_stripped.contains(w) + || leet_normalized.contains(w) + || leet_stripped.contains(w) + }) { + return true; + } + } + false +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_chink_pattern() { + assert!(has_explicit_slur("chink")); + assert!(has_explicit_slur("chinks")); + assert!(has_explicit_slur("CHINK")); + assert!(has_explicit_slur("Chinks")); + } + + #[test] + fn test_coon_pattern() { + assert!(has_explicit_slur("coon")); + assert!(has_explicit_slur("coons")); + assert!(has_explicit_slur("COON")); + } + + #[test] + fn test_fag_pattern() { + assert!(has_explicit_slur("fag")); + assert!(has_explicit_slur("fags")); + assert!(has_explicit_slur("faggot")); + assert!(has_explicit_slur("faggots")); + assert!(has_explicit_slur("faggotry")); + } + + #[test] + fn test_kike_pattern() { + assert!(has_explicit_slur("kike")); + assert!(has_explicit_slur("kikes")); + assert!(has_explicit_slur("KIKE")); + assert!(has_explicit_slur("kikery")); + } + + #[test] + fn test_nigger_pattern() { + assert!(has_explicit_slur("nigger")); + assert!(has_explicit_slur("niggers")); + assert!(has_explicit_slur("NIGGER")); + assert!(has_explicit_slur("nigga")); + assert!(has_explicit_slur("niggas")); + } + + #[test] + fn test_tranny_pattern() { + assert!(has_explicit_slur("tranny")); + assert!(has_explicit_slur("trannies")); + assert!(has_explicit_slur("TRANNY")); + } + + #[test] + fn test_normalization_bypass() { + assert!(has_explicit_slur("n.i.g.g.e.r")); + assert!(has_explicit_slur("n-i-g-g-e-r")); + assert!(has_explicit_slur("n_i_g_g_e_r")); + assert!(has_explicit_slur("f.a.g")); + assert!(has_explicit_slur("f-a-g")); + assert!(has_explicit_slur("c.h.i.n.k")); + assert!(has_explicit_slur("k_i_k_e")); + } + + #[test] + fn test_trailing_digits_bypass() { + assert!(has_explicit_slur("faggot123")); + assert!(has_explicit_slur("nigger69")); + assert!(has_explicit_slur("chink420")); + assert!(has_explicit_slur("fag1")); + assert!(has_explicit_slur("kike2024")); + assert!(has_explicit_slur("n_i_g_g_e_r123")); + } + + #[test] + fn test_embedded_in_sentence() { + assert!(has_explicit_slur("you are a faggot")); + assert!(has_explicit_slur("stupid nigger")); + assert!(has_explicit_slur("go away chink")); + } + + #[test] + fn test_safe_words_not_matched() { + assert!(!has_explicit_slur("hello")); + assert!(!has_explicit_slur("world")); + assert!(!has_explicit_slur("bluesky")); + assert!(!has_explicit_slur("tranquil")); + assert!(!has_explicit_slur("programmer")); + assert!(!has_explicit_slur("trigger")); + assert!(!has_explicit_slur("bigger")); + assert!(!has_explicit_slur("digger")); + assert!(!has_explicit_slur("figure")); + assert!(!has_explicit_slur("configure")); + } + + #[test] + fn test_similar_but_safe_words() { + assert!(!has_explicit_slur("niggardly")); + assert!(!has_explicit_slur("raccoon")); + } + + #[test] + fn test_empty_and_whitespace() { + assert!(!has_explicit_slur("")); + assert!(!has_explicit_slur(" ")); + assert!(!has_explicit_slur("\t\n")); + } + + #[test] + fn test_case_insensitive() { + assert!(has_explicit_slur("NIGGER")); + assert!(has_explicit_slur("Nigger")); + assert!(has_explicit_slur("NiGgEr")); + assert!(has_explicit_slur("FAGGOT")); + assert!(has_explicit_slur("Faggot")); + } + + #[test] + fn test_leetspeak_bypass() { + assert!(has_explicit_slur("f4ggot")); + assert!(has_explicit_slur("f4gg0t")); + assert!(has_explicit_slur("n1gger")); + assert!(has_explicit_slur("n1gg3r")); + assert!(has_explicit_slur("k1ke")); + assert!(has_explicit_slur("ch1nk")); + assert!(has_explicit_slur("tr4nny")); + } + + #[test] + fn test_normalize_leetspeak() { + assert_eq!(normalize_leetspeak("h3llo"), "hello"); + assert_eq!(normalize_leetspeak("w0rld"), "world"); + assert_eq!(normalize_leetspeak("t3$t"), "test"); + assert_eq!(normalize_leetspeak("b4dw0rd"), "badword"); + assert_eq!(normalize_leetspeak("l33t5p34k"), "leetspeak"); + assert_eq!(normalize_leetspeak("@ss"), "ass"); + assert_eq!(normalize_leetspeak("sh!t"), "shit"); + assert_eq!(normalize_leetspeak("normal"), "normal"); + } + + #[test] + fn test_extra_banned_words() { + let extra = vec!["badword".to_string(), "offensive".to_string()]; + + assert!(has_explicit_slur_with_extra_words("badword", &extra)); + assert!(has_explicit_slur_with_extra_words("BADWORD", &extra)); + assert!(has_explicit_slur_with_extra_words("b.a.d.w.o.r.d", &extra)); + assert!(has_explicit_slur_with_extra_words("b-a-d-w-o-r-d", &extra)); + assert!(has_explicit_slur_with_extra_words("b_a_d_w_o_r_d", &extra)); + assert!(has_explicit_slur_with_extra_words("badword123", &extra)); + assert!(has_explicit_slur_with_extra_words("b4dw0rd", &extra)); + assert!(has_explicit_slur_with_extra_words("b4dw0rd789", &extra)); + assert!(has_explicit_slur_with_extra_words("b.4.d.w.0.r.d", &extra)); + assert!(has_explicit_slur_with_extra_words("this contains badword here", &extra)); + assert!(has_explicit_slur_with_extra_words("0ff3n$1v3", &extra)); + + assert!(!has_explicit_slur_with_extra_words("goodword", &extra)); + assert!(!has_explicit_slur_with_extra_words("hello world", &extra)); + } +} diff --git a/src/validation/mod.rs b/src/validation/mod.rs index ef22576..ed0acf8 100644 --- a/src/validation/mod.rs +++ b/src/validation/mod.rs @@ -17,6 +17,8 @@ pub enum ValidationError { InvalidRecord(String), #[error("Unknown record type: {0}")] UnknownType(String), + #[error("Unacceptable slur in record at {path}")] + BannedContent { path: String }, } #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -52,6 +54,15 @@ impl RecordValidator { &self, record: &Value, collection: &str, + ) -> Result { + self.validate_with_rkey(record, collection, None) + } + + pub fn validate_with_rkey( + &self, + record: &Value, + collection: &str, + rkey: Option<&str>, ) -> Result { let obj = record.as_object().ok_or_else(|| { ValidationError::InvalidRecord("Record must be an object".to_string()) @@ -78,9 +89,10 @@ impl RecordValidator { "app.bsky.graph.block" => self.validate_block(obj)?, "app.bsky.graph.list" => self.validate_list(obj)?, "app.bsky.graph.listitem" => self.validate_list_item(obj)?, - "app.bsky.feed.generator" => self.validate_feed_generator(obj)?, + "app.bsky.feed.generator" => self.validate_feed_generator(obj, rkey)?, "app.bsky.feed.threadgate" => self.validate_threadgate(obj)?, "app.bsky.labeler.service" => self.validate_labeler_service(obj)?, + "app.bsky.graph.starterpack" => self.validate_starterpack(obj)?, _ => { if self.require_lexicon { return Err(ValidationError::UnknownType(record_type.to_string())); @@ -126,13 +138,39 @@ impl RecordValidator { }); } for (i, tag) in tags.iter().enumerate() { - if let Some(tag_str) = tag.as_str() - && tag_str.len() > 640 - { - return Err(ValidationError::InvalidField { - path: format!("tags/{}", i), - message: "Tag exceeds maximum length of 640 bytes".to_string(), - }); + if let Some(tag_str) = tag.as_str() { + if tag_str.len() > 640 { + return Err(ValidationError::InvalidField { + path: format!("tags/{}", i), + message: "Tag exceeds maximum length of 640 bytes".to_string(), + }); + } + if crate::moderation::has_explicit_slur(tag_str) { + return Err(ValidationError::BannedContent { + path: format!("tags/{}", i), + }); + } + } + } + } + if let Some(facets) = obj.get("facets").and_then(|v| v.as_array()) { + for (i, facet) in facets.iter().enumerate() { + if let Some(features) = facet.get("features").and_then(|v| v.as_array()) { + for (j, feature) in features.iter().enumerate() { + let is_tag = feature + .get("$type") + .and_then(|v| v.as_str()) + .is_some_and(|t| t == "app.bsky.richtext.facet#tag"); + if is_tag { + if let Some(tag) = feature.get("tag").and_then(|v| v.as_str()) { + if crate::moderation::has_explicit_slur(tag) { + return Err(ValidationError::BannedContent { + path: format!("facets/{}/features/{}/tag", i, j), + }); + } + } + } + } } } } @@ -154,6 +192,11 @@ impl RecordValidator { ), }); } + if crate::moderation::has_explicit_slur(display_name) { + return Err(ValidationError::BannedContent { + path: "displayName".to_string(), + }); + } } if let Some(description) = obj.get("description").and_then(|v| v.as_str()) { let grapheme_count = description.chars().count(); @@ -166,6 +209,11 @@ impl RecordValidator { ), }); } + if crate::moderation::has_explicit_slur(description) { + return Err(ValidationError::BannedContent { + path: "description".to_string(), + }); + } } Ok(()) } @@ -238,13 +286,18 @@ impl RecordValidator { if !obj.contains_key("createdAt") { return Err(ValidationError::MissingField("createdAt".to_string())); } - if let Some(name) = obj.get("name").and_then(|v| v.as_str()) - && (name.is_empty() || name.len() > 64) - { - return Err(ValidationError::InvalidField { - path: "name".to_string(), - message: "Name must be 1-64 characters".to_string(), - }); + if let Some(name) = obj.get("name").and_then(|v| v.as_str()) { + if name.is_empty() || name.len() > 64 { + return Err(ValidationError::InvalidField { + path: "name".to_string(), + message: "Name must be 1-64 characters".to_string(), + }); + } + if crate::moderation::has_explicit_slur(name) { + return Err(ValidationError::BannedContent { + path: "name".to_string(), + }); + } } Ok(()) } @@ -268,6 +321,7 @@ impl RecordValidator { fn validate_feed_generator( &self, obj: &serde_json::Map, + rkey: Option<&str>, ) -> Result<(), ValidationError> { if !obj.contains_key("did") { return Err(ValidationError::MissingField("did".to_string())); @@ -278,13 +332,64 @@ impl RecordValidator { if !obj.contains_key("createdAt") { return Err(ValidationError::MissingField("createdAt".to_string())); } - if let Some(display_name) = obj.get("displayName").and_then(|v| v.as_str()) - && (display_name.is_empty() || display_name.len() > 240) - { - return Err(ValidationError::InvalidField { - path: "displayName".to_string(), - message: "displayName must be 1-240 characters".to_string(), - }); + if let Some(rkey) = rkey { + if crate::moderation::has_explicit_slur(rkey) { + return Err(ValidationError::BannedContent { + path: "rkey".to_string(), + }); + } + } + if let Some(display_name) = obj.get("displayName").and_then(|v| v.as_str()) { + if display_name.is_empty() || display_name.len() > 240 { + return Err(ValidationError::InvalidField { + path: "displayName".to_string(), + message: "displayName must be 1-240 characters".to_string(), + }); + } + if crate::moderation::has_explicit_slur(display_name) { + return Err(ValidationError::BannedContent { + path: "displayName".to_string(), + }); + } + } + Ok(()) + } + + fn validate_starterpack( + &self, + obj: &serde_json::Map, + ) -> Result<(), ValidationError> { + if !obj.contains_key("name") { + return Err(ValidationError::MissingField("name".to_string())); + } + if !obj.contains_key("createdAt") { + return Err(ValidationError::MissingField("createdAt".to_string())); + } + if let Some(name) = obj.get("name").and_then(|v| v.as_str()) { + if name.is_empty() || name.len() > 500 { + return Err(ValidationError::InvalidField { + path: "name".to_string(), + message: "name must be 1-500 characters".to_string(), + }); + } + if crate::moderation::has_explicit_slur(name) { + return Err(ValidationError::BannedContent { + path: "name".to_string(), + }); + } + } + if let Some(description) = obj.get("description").and_then(|v| v.as_str()) { + if description.len() > 3000 { + return Err(ValidationError::InvalidField { + path: "description".to_string(), + message: "description must be at most 3000 characters".to_string(), + }); + } + if crate::moderation::has_explicit_slur(description) { + return Err(ValidationError::BannedContent { + path: "description".to_string(), + }); + } } Ok(()) } diff --git a/tests/banned_words.rs b/tests/banned_words.rs new file mode 100644 index 0000000..66e192b --- /dev/null +++ b/tests/banned_words.rs @@ -0,0 +1,196 @@ +/* + * CONTENT WARNING + * + * This file contains explicit slurs and hateful language. We're sorry you have to see them. + * + * These words exist here for one reason: to ensure our moderation system correctly blocks them. + * We can't verify the filter catches the n-word without testing against the actual word. + * Euphemisms wouldn't prove the protection works. + * + * If reading this file has caused you distress, please know: + * - you are valued and welcome in this community + * - these words do not reflect the views of this project or its contributors + * - we maintain this code precisely because we believe everyone deserves an experience on the web that is free from this kinda language +*/ + +mod common; +mod helpers; +use common::*; +use helpers::*; +use reqwest::StatusCode; +use serde_json::json; + +#[tokio::test] +async fn test_handle_with_slur_rejected() { + let client = client(); + let timestamp = chrono::Utc::now().timestamp_millis(); + let offensive_handle = format!("nigger{}", timestamp); + + let create_payload = json!({ + "handle": offensive_handle, + "email": format!("test{}@example.com", timestamp), + "password": "TestPassword123!" + }); + + let res = client + .post(format!( + "{}/xrpc/com.atproto.server.createAccount", + base_url().await + )) + .json(&create_payload) + .send() + .await + .expect("Request failed"); + + assert_eq!(res.status(), StatusCode::BAD_REQUEST); + let body: serde_json::Value = res.json().await.unwrap(); + assert_eq!(body["error"], "InvalidHandle"); + assert!(body["message"] + .as_str() + .unwrap_or("") + .contains("Inappropriate language")); +} + +#[tokio::test] +async fn test_handle_with_normalized_slur_rejected() { + let client = client(); + let timestamp = chrono::Utc::now().timestamp_millis(); + let offensive_handle = format!("n-i-g-g-e-r{}", timestamp); + + let create_payload = json!({ + "handle": offensive_handle, + "email": format!("test{}@example.com", timestamp), + "password": "TestPassword123!" + }); + + let res = client + .post(format!( + "{}/xrpc/com.atproto.server.createAccount", + base_url().await + )) + .json(&create_payload) + .send() + .await + .expect("Request failed"); + + assert_eq!(res.status(), StatusCode::BAD_REQUEST); + let body: serde_json::Value = res.json().await.unwrap(); + assert_eq!(body["error"], "InvalidHandle"); +} + +#[tokio::test] +async fn test_handle_update_with_slur_rejected() { + let client = client(); + let (_, jwt) = setup_new_user("handleupdate").await; + + let update_payload = json!({ + "handle": "faggots" + }); + + let res = client + .post(format!( + "{}/xrpc/com.atproto.identity.updateHandle", + base_url().await + )) + .bearer_auth(&jwt) + .json(&update_payload) + .send() + .await + .expect("Request failed"); + + assert_eq!(res.status(), StatusCode::BAD_REQUEST); + let body: serde_json::Value = res.json().await.unwrap(); + assert_eq!(body["error"], "InvalidHandle"); +} + +#[tokio::test] +async fn test_profile_displayname_with_slur_rejected() { + let client = client(); + let (did, jwt) = setup_new_user("profileslur").await; + + let profile = json!({ + "repo": did, + "collection": "app.bsky.actor.profile", + "rkey": "self", + "record": { + "$type": "app.bsky.actor.profile", + "displayName": "I am a kike" + } + }); + + let res = client + .post(format!( + "{}/xrpc/com.atproto.repo.putRecord", + base_url().await + )) + .bearer_auth(&jwt) + .json(&profile) + .send() + .await + .expect("Request failed"); + + assert_eq!(res.status(), StatusCode::BAD_REQUEST); + let body: serde_json::Value = res.json().await.unwrap(); + assert_eq!(body["error"], "InvalidRecord"); +} + +#[tokio::test] +async fn test_profile_description_with_slur_rejected() { + let client = client(); + let (did, jwt) = setup_new_user("profiledesc").await; + + let profile = json!({ + "repo": did, + "collection": "app.bsky.actor.profile", + "rkey": "self", + "record": { + "$type": "app.bsky.actor.profile", + "displayName": "Normal Name", + "description": "I hate all chinks" + } + }); + + let res = client + .post(format!( + "{}/xrpc/com.atproto.repo.putRecord", + base_url().await + )) + .bearer_auth(&jwt) + .json(&profile) + .send() + .await + .expect("Request failed"); + + assert_eq!(res.status(), StatusCode::BAD_REQUEST); + let body: serde_json::Value = res.json().await.unwrap(); + assert_eq!(body["error"], "InvalidRecord"); +} + +#[tokio::test] +async fn test_clean_content_allowed() { + let client = client(); + let (did, jwt) = setup_new_user("cleanpost").await; + + let post = json!({ + "repo": did, + "collection": "app.bsky.feed.post", + "record": { + "$type": "app.bsky.feed.post", + "text": "This is a perfectly normal post about coding and technology!", + "createdAt": chrono::Utc::now().to_rfc3339() + } + }); + + let res = client + .post(format!( + "{}/xrpc/com.atproto.repo.createRecord", + base_url().await + )) + .bearer_auth(&jwt) + .json(&post) + .send() + .await + .expect("Request failed"); + + assert_eq!(res.status(), StatusCode::OK); +}