From 888231045082ee9b627b6a7a51668255b56c2b5c Mon Sep 17 00:00:00 2001 From: Paul Pfeister Date: Mon, 15 Sep 2025 21:56:54 -0400 Subject: [PATCH] feat: honor automatic exclusions list --- .github/workflows/exclusions.yml | 2 +- sherlock_project/sherlock.py | 17 +++++++++++-- sherlock_project/sites.py | 41 ++++++++++++++++++++++++++++---- 3 files changed, 52 insertions(+), 8 deletions(-) diff --git a/.github/workflows/exclusions.yml b/.github/workflows/exclusions.yml index dd834167..a4bc0449 100644 --- a/.github/workflows/exclusions.yml +++ b/.github/workflows/exclusions.yml @@ -76,5 +76,5 @@ jobs: git add false_positive_exclusions.txt - git commit -m "auto: Update exclusions list" || echo "No changes to commit" + git commit -m "auto: update exclusions list" || echo "No changes to commit" git push origin exclusions diff --git a/sherlock_project/sherlock.py b/sherlock_project/sherlock.py index e3786c90..250175a5 100644 --- a/sherlock_project/sherlock.py +++ b/sherlock_project/sherlock.py @@ -727,6 +727,14 @@ def main(): help="Disable creation of a txt file", ) + parser.add_argument( + "--ignore-exclusions", + action="store_true", + dest="ignore_exclusions", + default=False, + help="Ignore upstream exclusions (may return more false positives)", + ) + args = parser.parse_args() # If the user presses CTRL-C, exit gracefully without throwing errors @@ -784,7 +792,8 @@ def main(): try: if args.local: sites = SitesInformation( - os.path.join(os.path.dirname(__file__), "resources/data.json") + os.path.join(os.path.dirname(__file__), "resources/data.json"), + honor_exclusions=False, ) else: json_file_location = args.json_file @@ -804,7 +813,11 @@ def main(): head_commit_sha = pull_request_json["head"]["sha"] json_file_location = f"https://raw.githubusercontent.com/sherlock-project/sherlock/{head_commit_sha}/sherlock_project/resources/data.json" - sites = SitesInformation(json_file_location) + sites = SitesInformation( + data_file_path=json_file_location, + honor_exclusions=not args.ignore_exclusions, + do_not_exclude=args.site_list, + ) except Exception as error: print(f"ERROR: {error}") sys.exit(1) diff --git a/sherlock_project/sites.py b/sherlock_project/sites.py index 847d1576..2ba811d7 100644 --- a/sherlock_project/sites.py +++ b/sherlock_project/sites.py @@ -7,6 +7,10 @@ import json import requests import secrets + +MANIFEST_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json" +EXCLUSIONS_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/refs/heads/exclusions/false_positive_exclusions.txt" + class SiteInformation: def __init__(self, name, url_home, url_username_format, username_claimed, information, is_nsfw, username_unclaimed=secrets.token_urlsafe(10)): @@ -67,12 +71,17 @@ class SiteInformation: Return Value: Nicely formatted string to get information about this object. """ - + return f"{self.name} ({self.url_home})" class SitesInformation: - def __init__(self, data_file_path=None): + def __init__( + self, + data_file_path: str|None = None, + honor_exclusions: bool = True, + do_not_exclude: list[str] = [], + ): """Create Sites Information Object. Contains information about all supported websites. @@ -110,7 +119,7 @@ class SitesInformation: # The default data file is the live data.json which is in the GitHub repo. The reason why we are using # this instead of the local one is so that the user has the most up-to-date data. This prevents # users from creating issue about false positives which has already been fixed or having outdated data - data_file_path = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json" + data_file_path = MANIFEST_URL # Ensure that specified data file has correct extension. if not data_file_path.lower().endswith(".json"): @@ -152,9 +161,31 @@ class SitesInformation: raise FileNotFoundError(f"Problem while attempting to access " f"data file '{data_file_path}'." ) - + site_data.pop('$schema', None) + if honor_exclusions: + try: + response = requests.get(url=EXCLUSIONS_URL) + if response.status_code == 200: + exclusions = response.text.splitlines() + exclusions = [exclusion.strip() for exclusion in exclusions] + + for site in do_not_exclude: + if site in exclusions: + exclusions.remove(site) + + for exclusion in exclusions: + try: + site_data.pop(exclusion, None) + except KeyError: + pass + + except Exception: + # If there was any problem loading the exclusions, just continue without them + print("Warning: Could not load exclusions, continuing without them.") + honor_exclusions = False + self.sites = {} # Add all site information from the json file to internal site list. @@ -194,7 +225,7 @@ class SitesInformation: for site in self.sites: if self.sites[site].is_nsfw and site.casefold() not in do_not_remove: continue - sites[site] = self.sites[site] + sites[site] = self.sites[site] self.sites = sites def site_name_list(self):