From d6d879133e67aa967d849a0b73ddde25ddd4bb54 Mon Sep 17 00:00:00 2001 From: Remilia Da Costa Faro Date: Fri, 21 Mar 2025 20:39:34 +0100 Subject: [PATCH] Allow filtering by remote addresses (#52) * Added the possibility to define rules for remote addresses * Added change in changelog * Added check for X-Real-Ip and X-Forwarded-For when checking for remote address filtering * cmd/anubis: refine IP filtering logic * Optimize the configuration so that the IP trie is created once at application start instead of dynamically being created every request. * Document the changes in the changelog and docs site. * Allow pure IP range filtering. * Allow user agent based IP range filtering. * Allow path based IP range filtering. * Create --debug-x-real-ip-default flag for testing Anubis locally without a HTTP load balancer. --------- Co-authored-by: Xe Iaso --- cmd/anubis/botPolicies.json | 318 +++++++++++++++++- cmd/anubis/internal/config/config.go | 15 +- cmd/anubis/internal/config/config_test.go | 38 +++ .../config/testdata/good/allow_everyone.json | 12 + cmd/anubis/main.go | 59 +++- cmd/anubis/policy.go | 60 +++- docs/docs/CHANGELOG.md | 20 ++ docs/docs/admin/policies.md | 29 ++ go.mod | 1 + go.sum | 8 + internal/headers.go | 21 +- 11 files changed, 554 insertions(+), 27 deletions(-) create mode 100644 cmd/anubis/internal/config/testdata/good/allow_everyone.json diff --git a/cmd/anubis/botPolicies.json b/cmd/anubis/botPolicies.json index b602ef7..a93dbbb 100644 --- a/cmd/anubis/botPolicies.json +++ b/cmd/anubis/botPolicies.json @@ -8,22 +8,332 @@ { "name": "googlebot", "user_agent_regex": "\\+http\\:\\/\\/www\\.google\\.com/bot\\.html", - "action": "ALLOW" + "action": "ALLOW", + "remote_addresses": [ + "2001:4860:4801:10::/64", + "2001:4860:4801:11::/64", + "2001:4860:4801:12::/64", + "2001:4860:4801:13::/64", + "2001:4860:4801:14::/64", + "2001:4860:4801:15::/64", + "2001:4860:4801:16::/64", + "2001:4860:4801:17::/64", + "2001:4860:4801:18::/64", + 
"2001:4860:4801:19::/64", + "2001:4860:4801:1a::/64", + "2001:4860:4801:1b::/64", + "2001:4860:4801:1c::/64", + "2001:4860:4801:1d::/64", + "2001:4860:4801:1e::/64", + "2001:4860:4801:1f::/64", + "2001:4860:4801:20::/64", + "2001:4860:4801:21::/64", + "2001:4860:4801:22::/64", + "2001:4860:4801:23::/64", + "2001:4860:4801:24::/64", + "2001:4860:4801:25::/64", + "2001:4860:4801:26::/64", + "2001:4860:4801:27::/64", + "2001:4860:4801:28::/64", + "2001:4860:4801:29::/64", + "2001:4860:4801:2::/64", + "2001:4860:4801:2a::/64", + "2001:4860:4801:2b::/64", + "2001:4860:4801:2c::/64", + "2001:4860:4801:2d::/64", + "2001:4860:4801:2e::/64", + "2001:4860:4801:2f::/64", + "2001:4860:4801:31::/64", + "2001:4860:4801:32::/64", + "2001:4860:4801:33::/64", + "2001:4860:4801:34::/64", + "2001:4860:4801:35::/64", + "2001:4860:4801:36::/64", + "2001:4860:4801:37::/64", + "2001:4860:4801:38::/64", + "2001:4860:4801:39::/64", + "2001:4860:4801:3a::/64", + "2001:4860:4801:3b::/64", + "2001:4860:4801:3c::/64", + "2001:4860:4801:3d::/64", + "2001:4860:4801:3e::/64", + "2001:4860:4801:40::/64", + "2001:4860:4801:41::/64", + "2001:4860:4801:42::/64", + "2001:4860:4801:43::/64", + "2001:4860:4801:44::/64", + "2001:4860:4801:45::/64", + "2001:4860:4801:46::/64", + "2001:4860:4801:47::/64", + "2001:4860:4801:48::/64", + "2001:4860:4801:49::/64", + "2001:4860:4801:4a::/64", + "2001:4860:4801:4b::/64", + "2001:4860:4801:4c::/64", + "2001:4860:4801:50::/64", + "2001:4860:4801:51::/64", + "2001:4860:4801:52::/64", + "2001:4860:4801:53::/64", + "2001:4860:4801:54::/64", + "2001:4860:4801:55::/64", + "2001:4860:4801:56::/64", + "2001:4860:4801:60::/64", + "2001:4860:4801:61::/64", + "2001:4860:4801:62::/64", + "2001:4860:4801:63::/64", + "2001:4860:4801:64::/64", + "2001:4860:4801:65::/64", + "2001:4860:4801:66::/64", + "2001:4860:4801:67::/64", + "2001:4860:4801:68::/64", + "2001:4860:4801:69::/64", + "2001:4860:4801:6a::/64", + "2001:4860:4801:6b::/64", + "2001:4860:4801:6c::/64", + 
"2001:4860:4801:6d::/64", + "2001:4860:4801:6e::/64", + "2001:4860:4801:6f::/64", + "2001:4860:4801:70::/64", + "2001:4860:4801:71::/64", + "2001:4860:4801:72::/64", + "2001:4860:4801:73::/64", + "2001:4860:4801:74::/64", + "2001:4860:4801:75::/64", + "2001:4860:4801:76::/64", + "2001:4860:4801:77::/64", + "2001:4860:4801:78::/64", + "2001:4860:4801:79::/64", + "2001:4860:4801:80::/64", + "2001:4860:4801:81::/64", + "2001:4860:4801:82::/64", + "2001:4860:4801:83::/64", + "2001:4860:4801:84::/64", + "2001:4860:4801:85::/64", + "2001:4860:4801:86::/64", + "2001:4860:4801:87::/64", + "2001:4860:4801:88::/64", + "2001:4860:4801:90::/64", + "2001:4860:4801:91::/64", + "2001:4860:4801:92::/64", + "2001:4860:4801:93::/64", + "2001:4860:4801:94::/64", + "2001:4860:4801:95::/64", + "2001:4860:4801:96::/64", + "2001:4860:4801:a0::/64", + "2001:4860:4801:a1::/64", + "2001:4860:4801:a2::/64", + "2001:4860:4801:a3::/64", + "2001:4860:4801:a4::/64", + "2001:4860:4801:a5::/64", + "2001:4860:4801:c::/64", + "2001:4860:4801:f::/64", + "192.178.5.0/27", + "192.178.6.0/27", + "192.178.6.128/27", + "192.178.6.160/27", + "192.178.6.192/27", + "192.178.6.32/27", + "192.178.6.64/27", + "192.178.6.96/27", + "34.100.182.96/28", + "34.101.50.144/28", + "34.118.254.0/28", + "34.118.66.0/28", + "34.126.178.96/28", + "34.146.150.144/28", + "34.147.110.144/28", + "34.151.74.144/28", + "34.152.50.64/28", + "34.154.114.144/28", + "34.155.98.32/28", + "34.165.18.176/28", + "34.175.160.64/28", + "34.176.130.16/28", + "34.22.85.0/27", + "34.64.82.64/28", + "34.65.242.112/28", + "34.80.50.80/28", + "34.88.194.0/28", + "34.89.10.80/28", + "34.89.198.80/28", + "34.96.162.48/28", + "35.247.243.240/28", + "66.249.64.0/27", + "66.249.64.128/27", + "66.249.64.160/27", + "66.249.64.224/27", + "66.249.64.32/27", + "66.249.64.64/27", + "66.249.64.96/27", + "66.249.65.0/27", + "66.249.65.128/27", + "66.249.65.160/27", + "66.249.65.192/27", + "66.249.65.224/27", + "66.249.65.32/27", + "66.249.65.64/27", + 
"66.249.65.96/27", + "66.249.66.0/27", + "66.249.66.128/27", + "66.249.66.160/27", + "66.249.66.192/27", + "66.249.66.224/27", + "66.249.66.32/27", + "66.249.66.64/27", + "66.249.66.96/27", + "66.249.68.0/27", + "66.249.68.128/27", + "66.249.68.32/27", + "66.249.68.64/27", + "66.249.68.96/27", + "66.249.69.0/27", + "66.249.69.128/27", + "66.249.69.160/27", + "66.249.69.192/27", + "66.249.69.224/27", + "66.249.69.32/27", + "66.249.69.64/27", + "66.249.69.96/27", + "66.249.70.0/27", + "66.249.70.128/27", + "66.249.70.160/27", + "66.249.70.192/27", + "66.249.70.224/27", + "66.249.70.32/27", + "66.249.70.64/27", + "66.249.70.96/27", + "66.249.71.0/27", + "66.249.71.128/27", + "66.249.71.160/27", + "66.249.71.192/27", + "66.249.71.224/27", + "66.249.71.32/27", + "66.249.71.64/27", + "66.249.71.96/27", + "66.249.72.0/27", + "66.249.72.128/27", + "66.249.72.160/27", + "66.249.72.192/27", + "66.249.72.224/27", + "66.249.72.32/27", + "66.249.72.64/27", + "66.249.72.96/27", + "66.249.73.0/27", + "66.249.73.128/27", + "66.249.73.160/27", + "66.249.73.192/27", + "66.249.73.224/27", + "66.249.73.32/27", + "66.249.73.64/27", + "66.249.73.96/27", + "66.249.74.0/27", + "66.249.74.128/27", + "66.249.74.160/27", + "66.249.74.192/27", + "66.249.74.32/27", + "66.249.74.64/27", + "66.249.74.96/27", + "66.249.75.0/27", + "66.249.75.128/27", + "66.249.75.160/27", + "66.249.75.192/27", + "66.249.75.224/27", + "66.249.75.32/27", + "66.249.75.64/27", + "66.249.75.96/27", + "66.249.76.0/27", + "66.249.76.128/27", + "66.249.76.160/27", + "66.249.76.192/27", + "66.249.76.224/27", + "66.249.76.32/27", + "66.249.76.64/27", + "66.249.76.96/27", + "66.249.77.0/27", + "66.249.77.128/27", + "66.249.77.160/27", + "66.249.77.192/27", + "66.249.77.224/27", + "66.249.77.32/27", + "66.249.77.64/27", + "66.249.77.96/27", + "66.249.78.0/27", + "66.249.78.32/27", + "66.249.79.0/27", + "66.249.79.128/27", + "66.249.79.160/27", + "66.249.79.192/27", + "66.249.79.224/27", + "66.249.79.32/27", + 
"66.249.79.64/27", + "66.249.79.96/27" + ] }, { "name": "bingbot", "user_agent_regex": "\\+http\\:\\/\\/www\\.bing\\.com/bingbot\\.htm", - "action": "ALLOW" + "action": "ALLOW", + "remote_addresses": [ + "157.55.39.0/24", + "207.46.13.0/24", + "40.77.167.0/24", + "13.66.139.0/24", + "13.66.144.0/24", + "52.167.144.0/24", + "13.67.10.16/28", + "13.69.66.240/28", + "13.71.172.224/28", + "139.217.52.0/28", + "191.233.204.224/28", + "20.36.108.32/28", + "20.43.120.16/28", + "40.79.131.208/28", + "40.79.186.176/28", + "52.231.148.0/28", + "20.79.107.240/28", + "51.105.67.0/28", + "20.125.163.80/28", + "40.77.188.0/22", + "65.55.210.0/24", + "199.30.24.0/23", + "40.77.202.0/24", + "40.77.139.0/25", + "20.74.197.0/28", + "20.15.133.160/27", + "40.77.177.0/24", + "40.77.178.0/23" + ] }, { "name": "qwantbot", "user_agent_regex": "\\+https\\:\\/\\/help\\.qwant\\.com/bot/", - "action": "ALLOW" + "action": "ALLOW", + "remote_addresses": [ + "91.242.162.0/24" + ] }, { "name": "kagibot", "user_agent_regex": "\\+https\\:\\/\\/kagi\\.com/bot", - "action": "ALLOW" + "action": "ALLOW", + "remote_addresses": [ + "216.18.205.234/32", + "35.212.27.76/32", + "104.254.65.50/32", + "209.151.156.194/32" + ] + }, + { + "name": "marginalia", + "user_agent_regex": "search\\.marginalia\\.nu", + "action": "ALLOW", + "remote_addresses": [ + "193.183.0.162/31", + "193.183.0.164/30", + "193.183.0.168/30", + "193.183.0.172/31", + "193.183.0.174/32" + ] }, { "name": "us-artificial-intelligence-scraper", diff --git a/cmd/anubis/internal/config/config.go b/cmd/anubis/internal/config/config.go index efd8496..56975af 100644 --- a/cmd/anubis/internal/config/config.go +++ b/cmd/anubis/internal/config/config.go @@ -3,6 +3,7 @@ package config import ( "errors" "fmt" + "net" "regexp" ) @@ -28,17 +29,19 @@ type Bot struct { UserAgentRegex *string `json:"user_agent_regex"` PathRegex *string `json:"path_regex"` Action Rule `json:"action"` + RemoteAddr []string `json:"remote_addresses"` Challenge *ChallengeRules 
`json:"challenge,omitempty"` } var ( ErrNoBotRulesDefined = errors.New("config: must define at least one (1) bot rule") ErrBotMustHaveName = errors.New("config.Bot: must set name") - ErrBotMustHaveUserAgentOrPath = errors.New("config.Bot: must set either user_agent_regex, path_regex") + ErrBotMustHaveUserAgentOrPath = errors.New("config.Bot: must set either user_agent_regex, path_regex, or remote_addresses") ErrBotMustHaveUserAgentOrPathNotBoth = errors.New("config.Bot: must set either user_agent_regex, path_regex, and not both") ErrUnknownAction = errors.New("config.Bot: unknown action") ErrInvalidUserAgentRegex = errors.New("config.Bot: invalid user agent regex") ErrInvalidPathRegex = errors.New("config.Bot: invalid path regex") + ErrInvalidCIDR = errors.New("config.Bot: invalid CIDR") ) func (b Bot) Valid() error { @@ -48,7 +51,7 @@ func (b Bot) Valid() error { errs = append(errs, ErrBotMustHaveName) } - if b.UserAgentRegex == nil && b.PathRegex == nil { + if b.UserAgentRegex == nil && b.PathRegex == nil && (b.RemoteAddr == nil || len(b.RemoteAddr) == 0) { errs = append(errs, ErrBotMustHaveUserAgentOrPath) } @@ -68,6 +71,14 @@ func (b Bot) Valid() error { } } + if b.RemoteAddr != nil && len(b.RemoteAddr) > 0 { + for _, cidr := range b.RemoteAddr { + if _, _, err := net.ParseCIDR(cidr); err != nil { + errs = append(errs, ErrInvalidCIDR, err) + } + } + } + switch b.Action { case RuleAllow, RuleChallenge, RuleDeny: // okay diff --git a/cmd/anubis/internal/config/config_test.go b/cmd/anubis/internal/config/config_test.go index 0903fbb..9865860 100644 --- a/cmd/anubis/internal/config/config_test.go +++ b/cmd/anubis/internal/config/config_test.go @@ -129,6 +129,44 @@ func TestBotValid(t *testing.T) { }, err: ErrChallengeRuleHasWrongAlgorithm, }, + { + name: "invalid cidr range", + bot: Bot{ + Name: "mozilla-ua", + Action: RuleAllow, + RemoteAddr: []string{"0.0.0.0/33"}, + }, + err: ErrInvalidCIDR, + }, + { + name: "only filter by IP range", + bot: Bot{ + Name: 
"mozilla-ua", + Action: RuleAllow, + RemoteAddr: []string{"0.0.0.0/0"}, + }, + err: nil, + }, + { + name: "filter by user agent and IP range", + bot: Bot{ + Name: "mozilla-ua", + Action: RuleAllow, + UserAgentRegex: p("Mozilla"), + RemoteAddr: []string{"0.0.0.0/0"}, + }, + err: nil, + }, + { + name: "filter by path and IP range", + bot: Bot{ + Name: "mozilla-ua", + Action: RuleAllow, + PathRegex: p("^.*$"), + RemoteAddr: []string{"0.0.0.0/0"}, + }, + err: nil, + }, } for _, cs := range tests { diff --git a/cmd/anubis/internal/config/testdata/good/allow_everyone.json b/cmd/anubis/internal/config/testdata/good/allow_everyone.json new file mode 100644 index 0000000..a7e1af7 --- /dev/null +++ b/cmd/anubis/internal/config/testdata/good/allow_everyone.json @@ -0,0 +1,12 @@ +{ + "bots": [ + { + "name": "everyones-invited", + "remote_addresses": [ + "0.0.0.0/0", + "::/0" + ], + "action": "ALLOW" + } + ] +} \ No newline at end of file diff --git a/cmd/anubis/main.go b/cmd/anubis/main.go index b92b591..3982e37 100644 --- a/cmd/anubis/main.go +++ b/cmd/anubis/main.go @@ -53,6 +53,7 @@ var ( slogLevel = flag.String("slog-level", "INFO", "logging level (see https://pkg.go.dev/log/slog#hdr-Levels)") target = flag.String("target", "http://localhost:3923", "target to reverse proxy to") healthcheck = flag.Bool("healthcheck", false, "run a health check against Anubis") + debugXRealIPDefault = flag.String("debug-x-real-ip-default", "", "If set, replace empty X-Real-Ip headers with this value, useful only for debugging Anubis and running it locally") //go:embed static botPolicies.json static embed.FS @@ -210,9 +211,21 @@ func main() { mux.HandleFunc("/", s.maybeReverseProxy) - srv := http.Server{Handler: mux} + var h http.Handler + h = mux + h = internal.DefaultXRealIP(*debugXRealIPDefault, h) + + srv := http.Server{Handler: h} listener, url := setupListener(*bindNetwork, *bind) - slog.Info("listening", "url", url, "difficulty", *challengeDifficulty, "serveRobotsTXT", *robotsTxt, 
"target", *target, "version", anubis.Version) + slog.Info( + "listening", + "url", url, + "difficulty", *challengeDifficulty, + "serveRobotsTXT", *robotsTxt, + "target", *target, + "version", anubis.Version, + "debug-x-real-ip-default", *debugXRealIPDefault, + ) go func() { <-ctx.Done() @@ -364,11 +377,7 @@ type Server struct { } func (s *Server) maybeReverseProxy(w http.ResponseWriter, r *http.Request) { - cr, rule := s.check(r) - r.Header.Add("X-Anubis-Rule", cr.Name) - r.Header.Add("X-Anubis-Action", string(cr.Rule)) lg := slog.With( - "check_result", cr, "user_agent", r.UserAgent(), "accept_language", r.Header.Get("Accept-Language"), "priority", r.Header.Get("Priority"), @@ -376,6 +385,17 @@ func (s *Server) maybeReverseProxy(w http.ResponseWriter, r *http.Request) { r.Header.Get("X-Forwarded-For"), "x-real-ip", r.Header.Get("X-Real-Ip"), ) + + cr, rule, err := s.check(r) + if err != nil { + lg.Error("check failed", "err", err) + templ.Handler(base("Oh noes!", errorPage("Internal Server Error: administrator has misconfigured Anubis. 
Please contact the administrator and ask them to look for the logs around \"maybeReverseProxy\"")), templ.WithStatus(http.StatusInternalServerError)).ServeHTTP(w, r) + return + } + + r.Header.Add("X-Anubis-Rule", cr.Name) + r.Header.Add("X-Anubis-Action", string(cr.Rule)) + lg = lg.With("check_result", cr) policyApplications.WithLabelValues(cr.Name, string(cr.Rule)).Add(1) ip := r.Header.Get("X-Real-Ip") @@ -530,11 +550,22 @@ func (s *Server) renderIndex(w http.ResponseWriter, r *http.Request) { } func (s *Server) makeChallenge(w http.ResponseWriter, r *http.Request) { - cr, rule := s.check(r) - challenge := s.challengeFor(r, rule.Challenge.Difficulty) - lg := slog.With("user_agent", r.UserAgent(), "accept_language", r.Header.Get("Accept-Language"), "priority", r.Header.Get("Priority"), "x-forwarded-for", r.Header.Get("X-Forwarded-For"), "x-real-ip", r.Header.Get("X-Real-Ip")) + cr, rule, err := s.check(r) + if err != nil { + lg.Error("check failed", "err", err) + w.WriteHeader(http.StatusInternalServerError) + json.NewEncoder(w).Encode(struct { + Error string `json:"error"` + }{ + Error: "Internal Server Error: administrator has misconfigured Anubis. 
Please contact the administrator and ask them to look for the logs around \"makeChallenge\"", + }) + return + } + lg = lg.With("check_result", cr) + challenge := s.challengeFor(r, rule.Challenge.Difficulty) + json.NewEncoder(w).Encode(struct { Challenge string `json:"challenge"` Rules *config.ChallengeRules `json:"rules"` @@ -547,16 +578,22 @@ func (s *Server) makeChallenge(w http.ResponseWriter, r *http.Request) { } func (s *Server) passChallenge(w http.ResponseWriter, r *http.Request) { - cr, rule := s.check(r) lg := slog.With( "user_agent", r.UserAgent(), "accept_language", r.Header.Get("Accept-Language"), "priority", r.Header.Get("Priority"), "x-forwarded-for", r.Header.Get("X-Forwarded-For"), "x-real-ip", r.Header.Get("X-Real-Ip"), - "cr", cr, ) + cr, rule, err := s.check(r) + if err != nil { + lg.Error("check failed", "err", err) + templ.Handler(base("Oh noes!", errorPage("Internal Server Error: administrator has misconfigured Anubis. Please contact the administrator and ask them to look for the logs around \"passChallenge\".")), templ.WithStatus(http.StatusInternalServerError)).ServeHTTP(w, r) + return + } + lg = lg.With("check_result", cr) + nonceStr := r.FormValue("nonce") if nonceStr == "" { clearCookie(w) diff --git a/cmd/anubis/policy.go b/cmd/anubis/policy.go index a637f09..4e594fc 100644 --- a/cmd/anubis/policy.go +++ b/cmd/anubis/policy.go @@ -5,13 +5,16 @@ import ( "errors" "fmt" "io" + "log" "log/slog" + "net" "net/http" "regexp" "github.com/TecharoHQ/anubis/cmd/anubis/internal/config" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" + "github.com/yl2chen/cidranger" ) var ( @@ -32,8 +35,9 @@ type Bot struct { Name string UserAgent *regexp.Regexp Path *regexp.Regexp - Action config.Rule + Action config.Rule `json:"action"` Challenge *config.ChallengeRules + Ranger cidranger.Ranger } func (b Bot) Hash() (string, error) { @@ -77,6 +81,19 @@ func parseConfig(fin io.Reader, fname string, 
defaultDifficulty int) (*ParsedCon Action: b.Action, } + if b.RemoteAddr != nil && len(b.RemoteAddr) > 0 { + parsedBot.Ranger = cidranger.NewPCTrieRanger() + + for _, cidr := range b.RemoteAddr { + _, rng, err := net.ParseCIDR(cidr) + if err != nil { + return nil, fmt.Errorf("[unexpected] range %s not parsing: %w", cidr, err) + } + + parsedBot.Ranger.Insert(cidranger.NewBasicRangerEntry(*rng)) + } + } + if b.UserAgentRegex != nil { userAgent, err := regexp.Compile(*b.UserAgentRegex) if err != nil { @@ -140,18 +157,47 @@ func cr(name string, rule config.Rule) CheckResult { } } +func (s *Server) checkRemoteAddress(b Bot, addr net.IP) bool { + if b.Ranger == nil { + return false + } + + ok, err := b.Ranger.Contains(addr) + if err != nil { + log.Panicf("[unexpected] something very funky is going on, %q does not have a calculable network number: %v", addr.String(), err) + } + + return ok +} + // Check evaluates the list of rules, and returns the result -func (s *Server) check(r *http.Request) (CheckResult, *Bot) { +func (s *Server) check(r *http.Request) (CheckResult, *Bot, error) { + host := r.Header.Get("X-Real-Ip") + if host == "" { + return zilch[CheckResult](), nil, fmt.Errorf("[misconfiguration] X-Real-Ip header is not set") + } + + addr := net.ParseIP(host) + if addr == nil { + return zilch[CheckResult](), nil, fmt.Errorf("[misconfiguration] %q is not an IP address", host) + } + for _, b := range s.policy.Bots { if b.UserAgent != nil { - if b.UserAgent.MatchString(r.UserAgent()) { - return cr("bot/"+b.Name, b.Action), &b + if uaMatch := b.UserAgent.MatchString(r.UserAgent()); uaMatch || (uaMatch && s.checkRemoteAddress(b, addr)) { + return cr("bot/"+b.Name, b.Action), &b, nil } } if b.Path != nil { - if b.Path.MatchString(r.URL.Path) { - return cr("bot/"+b.Name, b.Action), &b + if pathMatch := b.Path.MatchString(r.URL.Path); pathMatch || (pathMatch && s.checkRemoteAddress(b, addr)) { + return cr("bot/"+b.Name, b.Action), &b, nil + } + } + + if b.Ranger != nil { + 
if s.checkRemoteAddress(b, addr) { + return cr("bot/"+b.Name, b.Action), &b, nil } } } @@ -162,5 +208,5 @@ func (s *Server) check(r *http.Request) (CheckResult, *Bot) { ReportAs: defaultDifficulty, Algorithm: config.AlgorithmFast, }, - } + }, nil } diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md index d88931b..d54cfff 100644 --- a/docs/docs/CHANGELOG.md +++ b/docs/docs/CHANGELOG.md @@ -40,6 +40,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [KagiBot](https://kagi.com/bot) is allowed through the filter [#44](https://github.com/TecharoHQ/anubis/pull/44) - Fixed hang when navigator.hardwareConcurrency is undefined - Support Unix domain sockets [#45](https://github.com/TecharoHQ/anubis/pull/45) +- Allow filtering by remote addresses: + + ```json + { + "name": "qwantbot", + "user_agent_regex": "\\+https\\:\\/\\/help\\.qwant\\.com/bot/", + "action": "ALLOW", + "remote_addresses": ["91.242.162.0/24"] + } + ``` + + This also works at an IP range level: + + ```json + { + "name": "internal-network", + "action": "ALLOW", + "remote_addresses": ["100.64.0.0/10"] + } + ``` ## 1.13.0 diff --git a/docs/docs/admin/policies.md b/docs/docs/admin/policies.md index 481a455..abd6139 100644 --- a/docs/docs/admin/policies.md +++ b/docs/docs/admin/policies.md @@ -68,6 +68,8 @@ There are three actions that can be returned from a rule: Name your rules in lower case using kebab-case. Rule names will be exposed in Prometheus metrics. +### Challenge configuration + Rules can also have their own challenge settings. These are customized using the `"challenge"` key. For example, here is a rule that makes challenges artificially hard for connections with the substring "bot" in their user agent: ```json @@ -91,6 +93,33 @@ Challenges can be configured with these settings: | `report_as` | `4` | What difficulty the UI should report to the user. Useful for messing with industrial-scale scraping efforts. 
| | `algorithm` | `"fast"` | The algorithm used on the client to run proof-of-work calculations. This must be set to `"fast"` or `"slow"`. See [Proof-of-Work Algorithm Selection](./algorithm-selection) for more details. | +### Remote IP based filtering + +The `remote_addresses` field of a Bot rule allows you to set the IP range that this ruleset applies to. + +For example, you can allow a search engine to connect if and only if its IP address matches the ones they published: + +```json +{ + "name": "qwantbot", + "user_agent_regex": "\\+https\\:\\/\\/help\\.qwant\\.com/bot/", + "action": "ALLOW", + "remote_addresses": ["91.242.162.0/24"] +} +``` + +This also works at an IP range level without any other checks: + +```json +{ + "name": "internal-network", + "action": "ALLOW", + "remote_addresses": ["100.64.0.0/10"] +} +``` + +## Risk calculation for downstream services + In case your service needs it for risk calculation reasons, Anubis exposes information about the rules that any requests match using a few headers: | Header | Explanation | Example | diff --git a/go.mod b/go.mod index 4275c36..8caa7fa 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/facebookgo/flagenv v0.0.0-20160425205200-fcd59fca7456 github.com/golang-jwt/jwt/v5 v5.2.1 github.com/prometheus/client_golang v1.21.1 + github.com/yl2chen/cidranger v1.0.2 ) require ( diff --git a/go.sum b/go.sum index 61d1cbc..839037b 100644 --- a/go.sum +++ b/go.sum @@ -16,6 +16,7 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cli/browser v1.3.0 h1:LejqCrpWr+1pRqmEPDGnTZOjsMe7sehifLynZJuqJpo= github.com/cli/browser v1.3.0/go.mod h1:HH8s+fOAxjhQoBUAsKuPCbqUuxZDhQ2/aD+SzsEfBTk= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc 
h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/facebookgo/ensure v0.0.0-20160127193407-b4ab57deab51 h1:0JZ+dUmQeA8IIVUMzysrX4/AKuQwWhV2dYQuPZdvdSQ= @@ -47,6 +48,7 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/natefinch/atomic v1.0.1 h1:ZPYKxkqQOx3KZ+RsbnP/YsgvxWQPGxjC0oBt2AhwV0A= github.com/natefinch/atomic v1.0.1/go.mod h1:N/D/ELrljoqDyT3rZrsUmtsuzvHkeB/wWjHV22AZRbM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_golang v1.21.1 h1:DOvXXTqVzvkIewV/CDPFdejpMCGeMcbGCQ8YOmu+Ibk= @@ -57,8 +59,12 @@ github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/yl2chen/cidranger v1.0.2 h1:lbOWZVCG1tCRX4u24kuM1Tb4nHqWkDxwLdoS+SevawU= +github.com/yl2chen/cidranger v1.0.2/go.mod h1:9U1yz7WPYDwf0vpNWFaeRh0bjwz5RVgRy/9UEQfHl0g= 
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= @@ -137,5 +143,7 @@ golang.org/x/tools v0.31.0/go.mod h1:naFTU+Cev749tSJRXJlna0T3WxKvb1kWEx15xA4SdmQ golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/protobuf v1.36.4 h1:6A3ZDJHn/eNqc1i+IdefRzy/9PokBTPvcqMySR7NNIM= google.golang.org/protobuf v1.36.4/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/headers.go b/internal/headers.go index 47aa2cc..1de845d 100644 --- a/internal/headers.go +++ b/internal/headers.go @@ -1,6 +1,7 @@ package internal import ( + "log/slog" "net/http" "github.com/TecharoHQ/anubis" @@ -8,13 +9,27 @@ import ( // UnchangingCache sets the Cache-Control header to cache a response for 1 year if // and only if the application is compiled in "release" mode by Docker. -func UnchangingCache(h http.Handler) http.Handler { +func UnchangingCache(next http.Handler) http.Handler { if anubis.Version == "devel" { - return h + return next } return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Cache-Control", "public, max-age=31536000") - h.ServeHTTP(w, r) + next.ServeHTTP(w, r) + }) +} + +// DefaultXRealIP sets the X-Real-Ip header to the given value if and only if +// it is not an empty string. 
func DefaultXRealIP(defaultIP string, next http.Handler) http.Handler {
	// Contract:
	//   - defaultIP == "": the middleware is disabled and next is returned
	//     as-is, so there is no per-request overhead.
	//   - defaultIP != "": the returned handler fills in X-Real-Ip with
	//     defaultIP for requests that arrive without one, then calls next.
	if defaultIP == "" {
		slog.Debug("skipping middleware, defaultIP is empty")
		return next
	}

	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// Only supply the default when the header is missing or empty. The
		// --debug-x-real-ip-default flag is documented as replacing *empty*
		// X-Real-Ip headers; overwriting unconditionally would clobber the
		// real client address set by an upstream proxy and make every
		// request look like it came from defaultIP.
		if r.Header.Get("X-Real-Ip") == "" {
			r.Header.Set("X-Real-Ip", defaultIP)
		}

		next.ServeHTTP(w, r)
	})
}