Skip to content

Commit a46ae96

Browse files
authored
allow country TLDs in scheme-less links (#106)
* allow country TLDs in scheme-less links * fix fmt * fix manual ts types
1 parent f0a70d1 commit a46ae96

File tree

7 files changed

+73
-12
lines changed

7 files changed

+73
-12
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
## Unreleased
44

5+
- allow country TLDs in scheme-less links
6+
57
## 0.14.0 - Bug fixes and scheme-less links
68

79
- Parse scheme-less links for some TLDs

message_parser_wasm/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ export type LinkDestination = {
4141
target: string;
4242
hostname: null | string;
4343
punycode: null | PunycodeWarning;
44+
scheme: null | string;
4445
};
4546
export type ParsedElement =
4647
| { t: "Text"; c: string }
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// extracted from first table on https://de.wikipedia.org/wiki/Liste_länderspezifischer_Top-Level-Domains
//
// Country-code top-level domains, written in lowercase and without a leading dot.
// The entries are kept in ascending order: `check_if_tld_is_allowed` looks them up
// with `binary_search`, which only gives correct answers on a sorted slice. When
// adding or removing an entry, keep the list sorted and update the array length
// in the type accordingly.
2+
pub const COUNTRY_TLDS: [&str; 254] = [
3+
"ac", "ad", "ae", "af", "ag", "ai", "al", "am", "ao", "aq", "ar", "as", "at", "au", "aw", "ax",
4+
"az", "ba", "bb", "bd", "be", "bf", "bg", "bh", "bi", "bj", "bl", "bm", "bn", "bo", "bq", "br",
5+
"bs", "bt", "bv", "bw", "by", "bz", "ca", "cc", "cd", "cf", "cg", "ch", "ci", "ck", "cl", "cm",
6+
"cn", "co", "cr", "cu", "cv", "cw", "cx", "cy", "cz", "de", "dj", "dk", "dm", "do", "dz", "ec",
7+
"ee", "eg", "eh", "er", "es", "et", "eu", "fi", "fj", "fk", "fm", "fo", "fr", "ga", "gb", "gd",
8+
"ge", "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gp", "gq", "gr", "gs", "gt", "gu", "gw", "gy",
9+
"hk", "hm", "hn", "hr", "ht", "hu", "id", "ie", "il", "im", "in", "io", "iq", "ir", "is", "it",
10+
"je", "jm", "jo", "jp", "ke", "kg", "kh", "ki", "km", "kn", "kp", "kr", "kw", "ky", "kz", "la",
11+
"lb", "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv", "ly", "ma", "mc", "md", "me", "mf", "mg",
12+
"mh", "mk", "ml", "mm", "mn", "mo", "mp", "mq", "mr", "ms", "mt", "mu", "mv", "mw", "mx", "my",
13+
"mz", "na", "nc", "ne", "nf", "ng", "ni", "nl", "no", "np", "nr", "nu", "nz", "om", "pa", "pe",
14+
"pf", "pg", "ph", "pk", "pl", "pm", "pn", "pr", "ps", "pt", "pw", "py", "qa", "re", "ro", "rs",
15+
"ru", "rw", "sa", "sb", "sc", "sd", "se", "sg", "sh", "si", "sj", "sk", "sl", "sm", "sn", "so",
16+
"sr", "ss", "st", "su", "sv", "sx", "sy", "sz", "tc", "td", "tf", "tg", "th", "tj", "tk", "tl",
17+
"tm", "tn", "to", "tp", "tr", "tt", "tv", "tw", "tz", "ua", "ug", "uk", "us", "uy", "uz", "va",
18+
"vc", "ve", "vg", "vi", "vn", "vu", "wf", "ws", "ye", "yt", "za", "zm", "zr", "zw",
19+
];
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
mod country_tlds;
2+
3+
const ALLOWED_TOP_LEVEL_DOMAINS: &[&str] = &[
4+
// originals from RFC920 + net
5+
"com", "org", "net", "edu", "gov", "mil", // for deltachat
6+
"chat",
7+
];
8+
9+
pub fn check_if_tld_is_allowed(tld: &str) -> bool {
10+
if ALLOWED_TOP_LEVEL_DOMAINS.iter().any(|item| *item == tld) {
11+
true
12+
} else {
13+
country_tlds::COUNTRY_TLDS.binary_search(&tld).is_ok()
14+
}
15+
}
16+
17+
#[cfg(test)]
mod test {
    use super::check_if_tld_is_allowed;

    /// Generic TLDs from the hand-maintained allow list are accepted.
    #[test]
    fn test_check_tld() {
        assert!(check_if_tld_is_allowed("chat"));
        assert!(check_if_tld_is_allowed("com"));

        assert!(check_if_tld_is_allowed("de"));
        assert!(check_if_tld_is_allowed("at"));
        assert!(check_if_tld_is_allowed("uk"));
        assert!(check_if_tld_is_allowed("fr"));
    }

    /// The first and last entries of the country table are found, which
    /// exercises both ends of the binary search.
    #[test]
    fn test_check_tld_table_boundaries() {
        assert!(check_if_tld_is_allowed("ac"));
        assert!(check_if_tld_is_allowed("zw"));
    }

    /// Unknown labels, the empty string and the dotted spelling are rejected.
    #[test]
    fn test_check_tld_not_allowed() {
        assert!(!check_if_tld_is_allowed("doesnotexist"));
        assert!(!check_if_tld_is_allowed(""));
        assert!(!check_if_tld_is_allowed(".com"));
        assert!(!check_if_tld_is_allowed("c"));
    }
}

src/parser/link_url/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
mod allowed_tlds;
12
mod ip;
23
mod parenthesis_counter;
34
mod parse_link;

src/parser/link_url/parse_link.rs

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ use crate::parser::{
2323
};
2424

2525
use super::{
26+
allowed_tlds::check_if_tld_is_allowed,
2627
parenthesis_counter::count_chars_in_complete_parenthesis,
2728
punycode_warning::get_puny_code_warning,
2829
};
@@ -46,14 +47,6 @@ fn is_allowed_generic_scheme(scheme: &str) -> bool {
4647
)
4748
}
4849

49-
const ALLOWED_TOP_LEVEL_DOMAINS: &[&str] = &[
50-
// originals from RFC920 + net
51-
".com", ".org", ".net", ".edu", ".gov", ".mil",
52-
// for deltachat
53-
".chat",
54-
// !todo country codes here next
55-
];
56-
5750
// These ranges have been extracted from RFC3987, Page 8.
5851
const UCSCHAR_RANGES: [RangeInclusive<u32>; 17] = [
5952
0xa0..=0xd7ff,
@@ -294,10 +287,18 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> {
294287

295288
// now that we have the host: if there is no scheme, we need to check the host's TLD
296289
if scheme.is_empty() {
297-
ALLOWED_TOP_LEVEL_DOMAINS
298-
.iter()
299-
.find(|&&tld| host.ends_with(tld))
300-
.ok_or(nom::Err::Failure(CustomError::<&str>::InvalidLink))?;
290+
if !host.contains('.') {
291+
return Err(nom::Err::Failure(CustomError::<&str>::InvalidLink));
292+
}
293+
294+
let tld = host
295+
.split('.')
296+
.last()
297+
.ok_or(nom::Err::Failure(CustomError::<&str>::InvalidLinkNoTLD))?;
298+
299+
if !check_if_tld_is_allowed(tld) {
300+
return Err(nom::Err::Failure(CustomError::<&str>::InvalidLink));
301+
}
301302
}
302303

303304
let (input, path) = opt(alt((

src/parser/parse_from_text/base_parsers.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ pub enum CustomError<I> {
1818
Nom(I, ErrorKind),
1919
InvalidEmail,
2020
InvalidLink,
21+
InvalidLinkNoTLD,
2122
UnexpectedContent,
2223
PrecedingWhitespaceMissing,
2324
OptionIsUnexpectedNone,

0 commit comments

Comments
 (0)