diff --git a/data/botPolicies.yaml b/data/botPolicies.yaml
index 5e444fe54..308ba2cbe 100644
--- a/data/botPolicies.yaml
+++ b/data/botPolicies.yaml
@@ -11,51 +11,8 @@
 ## /usr/share/docs/anubis/data or in the tarball you extracted Anubis from.
 
 bots:
-  # Pathological bots to deny
-  - # This correlates to data/bots/deny-pathological.yaml in the source tree
-    # https://github.com/TecharoHQ/anubis/blob/main/data/bots/deny-pathological.yaml
-    import: (data)/bots/_deny-pathological.yaml
-  - import: (data)/bots/aggressive-brazilian-scrapers.yaml
-
-  # Aggressively block AI/LLM related bots/agents by default
-  - import: (data)/meta/ai-block-aggressive.yaml
-
-  # Consider replacing the aggressive AI policy with more selective policies:
-  # - import: (data)/meta/ai-block-moderate.yaml
-  # - import: (data)/meta/ai-block-permissive.yaml
-
-  # Search engine crawlers to allow, defaults to:
-  # - Google (so they don't try to bypass Anubis)
-  # - Apple
-  # - Bing
-  # - DuckDuckGo
-  # - Qwant
-  # - The Internet Archive
-  # - Kagi
-  # - Marginalia
-  # - Mojeek
-  - import: (data)/crawlers/_allow-good.yaml
-  # Challenge Firefox AI previews
-  - import: (data)/clients/x-firefox-ai.yaml
-
-  # Allow common "keeping the internet working" routes (well-known, favicon, robots.txt)
-  - import: (data)/common/keep-internet-working.yaml
-
-  # # Punish any bot with "bot" in the user-agent string
-  # # This is known to have a high false-positive rate, use at your own risk
-  # - name: generic-bot-catchall
-  #   user_agent_regex: (?i:bot|crawler)
-  #   action: CHALLENGE
-  #   challenge:
-  #     difficulty: 16 # impossible
-  #     report_as: 4 # lie to the operator
-  #     algorithm: slow # intentionally waste CPU cycles and time
-
-  # Generic catchall rule
-  - name: generic-browser
-    user_agent_regex: >-
-      Mozilla|Opera
-    action: CHALLENGE
+  - # load the default rules
+    import: (data)/bots.yaml
 
 dnsbl: false
 
diff --git a/data/bots.yaml b/data/bots.yaml
new file mode 100644
index 000000000..7f4f201d7
--- /dev/null
+++ b/data/bots.yaml
@@ -0,0 +1,45 @@
+# Pathological bots to deny
+- # This correlates to data/bots/deny-pathological.yaml in the source tree
+  # https://github.com/TecharoHQ/anubis/blob/main/data/bots/deny-pathological.yaml
+  import: (data)/bots/_deny-pathological.yaml
+- import: (data)/bots/aggressive-brazilian-scrapers.yaml
+
+# Aggressively block AI/LLM related bots/agents by default
+- import: (data)/meta/ai-block-aggressive.yaml
+
+# Consider replacing the aggressive AI policy with more selective policies:
+# - import: (data)/meta/ai-block-moderate.yaml
+# - import: (data)/meta/ai-block-permissive.yaml
+
+# Search engine crawlers to allow, defaults to:
+# - Google (so they don't try to bypass Anubis)
+# - Apple
+# - Bing
+# - DuckDuckGo
+# - Qwant
+# - The Internet Archive
+# - Kagi
+# - Marginalia
+# - Mojeek
+- import: (data)/crawlers/_allow-good.yaml
+# Challenge Firefox AI previews
+- import: (data)/clients/x-firefox-ai.yaml
+
+# Allow common "keeping the internet working" routes (well-known, favicon, robots.txt)
+- import: (data)/common/keep-internet-working.yaml
+
+# # Punish any bot with "bot" in the user-agent string
+# # This is known to have a high false-positive rate, use at your own risk
+# - name: generic-bot-catchall
+#   user_agent_regex: (?i:bot|crawler)
+#   action: CHALLENGE
+#   challenge:
+#     difficulty: 16 # impossible
+#     report_as: 4 # lie to the operator
+#     algorithm: slow # intentionally waste CPU cycles and time
+
+# Generic catchall rule
+- name: generic-browser
+  user_agent_regex: >-
+    Mozilla|Opera
+  action: CHALLENGE