Skip to content

Commit 17d1bed

Browse files
committed
tokenize some regexp patterns
1 parent fca4bc5 commit 17d1bed

File tree

3 files changed

+28
-4
lines changed

3 files changed

+28
-4
lines changed

src/filters/network.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,7 @@ pub enum FilterPart {
311311
AnyOf(Vec<String>),
312312
}
313313

314+
#[derive(Debug, PartialEq)]
314315
pub enum FilterTokens {
315316
Empty,
316317
OptDomains(Vec<Hash>),
@@ -929,6 +930,17 @@ impl NetworkFilter {
929930
if let Some(hostname) = self.hostname.as_ref() {
930931
utils::tokenize_to(hostname, &mut tokens);
931932
}
933+
} else if let Some(hostname) = self.hostname.as_ref() {
934+
// For hostname regex patterns, try to extract a single literal token from the leading label (the text before the first '.')
935+
// Only tokenize if it looks like <literal>.<pattern> where we can extract the literal part
936+
if let Some(dot_pos) = hostname.find('.') {
937+
let prefix = &hostname[..dot_pos];
938+
// Allow alphanumeric characters and hyphens (note: `char::is_alphanumeric` is Unicode-aware, so non-ASCII letters and digits pass too)
939+
// Minimum 2 chars to avoid overly generic tokens
940+
if prefix.len() >= 2 && prefix.chars().all(|c| c.is_alphanumeric() || c == '-') {
941+
tokens.push(utils::fast_hash(prefix));
942+
}
943+
}
932944
}
933945

934946
if tokens.is_empty() && self.mask.contains(NetworkFilterMask::IS_REMOVEPARAM) {

tests/unit/engine.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ mod tests {
183183
fn deserialization_generate_simple() {
184184
let mut engine = Engine::from_rules(["ad-banner"], Default::default());
185185
let data = engine.serialize().to_vec();
186-
const EXPECTED_HASH: u64 = 10945714988765761881;
186+
const EXPECTED_HASH: u64 = 4966705697069852975;
187187
assert_eq!(hash(&data), EXPECTED_HASH, "{HASH_MISMATCH_MSG}");
188188
engine.deserialize(&data).unwrap();
189189
}
@@ -193,7 +193,7 @@ mod tests {
193193
let mut engine = Engine::from_rules(["ad-banner$tag=abc"], Default::default());
194194
engine.use_tags(&["abc"]);
195195
let data = engine.serialize().to_vec();
196-
const EXPECTED_HASH: u64 = 4608037684406751718;
196+
const EXPECTED_HASH: u64 = 13197424322352491802;
197197
assert_eq!(hash(&data), EXPECTED_HASH, "{HASH_MISMATCH_MSG}");
198198
engine.deserialize(&data).unwrap();
199199
}
@@ -237,9 +237,9 @@ mod tests {
237237
);
238238
}
239239
let expected_hash: u64 = if cfg!(feature = "css-validation") {
240-
9439492009815519037
240+
9241597848569736555
241241
} else {
242-
14803842039735157685
242+
15978625802684888070
243243
};
244244

245245
assert_eq!(hash(&data), expected_hash, "{HASH_MISMATCH_MSG}");

tests/unit/filters/network.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1186,4 +1186,16 @@ mod parse_tests {
11861186
defaults.opt_domains = Some(vec![utils::fast_hash("auth.wi-fi.ru")]);
11871187
assert_eq!(defaults, NetworkFilterBreakdown::from(&filter));
11881188
}
1189+
1190+
#[test]
1191+
fn test_simple_pattern_tokenization() {
1192+
let rule = "||primewire.*/sw$script,1p";
1193+
let filter =
1194+
NetworkFilter::parse(rule, true, crate::lists::ParseOptions::default()).unwrap();
1195+
let tokens = filter.get_tokens_optimized();
1196+
assert_eq!(
1197+
tokens,
1198+
crate::filters::network::FilterTokens::Other(vec![utils::fast_hash("primewire")])
1199+
);
1200+
}
11891201
}

0 commit comments

Comments
 (0)