feat: honor automatic exclusions list

This commit is contained in:
Paul Pfeister 2025-09-15 21:56:54 -04:00
parent 6d15f1319e
commit 8882310450
No known key found for this signature in database
GPG Key ID: 70D33A96CBD7A994
3 changed files with 52 additions and 8 deletions

View File

@ -76,5 +76,5 @@ jobs:
git add false_positive_exclusions.txt
git commit -m "auto: Update exclusions list" || echo "No changes to commit"
git commit -m "auto: update exclusions list" || echo "No changes to commit"
git push origin exclusions

View File

@ -727,6 +727,14 @@ def main():
help="Disable creation of a txt file",
)
parser.add_argument(
"--ignore-exclusions",
action="store_true",
dest="ignore_exclusions",
default=False,
help="Ignore upstream exclusions (may return more false positives)",
)
args = parser.parse_args()
# If the user presses CTRL-C, exit gracefully without throwing errors
@ -784,7 +792,8 @@ def main():
try:
if args.local:
sites = SitesInformation(
os.path.join(os.path.dirname(__file__), "resources/data.json")
os.path.join(os.path.dirname(__file__), "resources/data.json"),
honor_exclusions=False,
)
else:
json_file_location = args.json_file
@ -804,7 +813,11 @@ def main():
head_commit_sha = pull_request_json["head"]["sha"]
json_file_location = f"https://raw.githubusercontent.com/sherlock-project/sherlock/{head_commit_sha}/sherlock_project/resources/data.json"
sites = SitesInformation(json_file_location)
sites = SitesInformation(
data_file_path=json_file_location,
honor_exclusions=not args.ignore_exclusions,
do_not_exclude=args.site_list,
)
except Exception as error:
print(f"ERROR: {error}")
sys.exit(1)

View File

@ -7,6 +7,10 @@ import json
import requests
import secrets
MANIFEST_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json"
EXCLUSIONS_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/refs/heads/exclusions/false_positive_exclusions.txt"
class SiteInformation:
def __init__(self, name, url_home, url_username_format, username_claimed,
information, is_nsfw, username_unclaimed=secrets.token_urlsafe(10)):
@ -67,12 +71,17 @@ class SiteInformation:
Return Value:
Nicely formatted string to get information about this object.
"""
return f"{self.name} ({self.url_home})"
class SitesInformation:
def __init__(self, data_file_path=None):
def __init__(
self,
data_file_path: str|None = None,
honor_exclusions: bool = True,
do_not_exclude: list[str] = [],
):
"""Create Sites Information Object.
Contains information about all supported websites.
@ -110,7 +119,7 @@ class SitesInformation:
# The default data file is the live data.json in the GitHub repo. We use it instead of the
# local copy so that the user always has the most up-to-date data. This prevents users from
# opening issues about false positives that have already been fixed, or from running with outdated data.
data_file_path = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json"
data_file_path = MANIFEST_URL
# Ensure that specified data file has correct extension.
if not data_file_path.lower().endswith(".json"):
@ -152,9 +161,31 @@ class SitesInformation:
raise FileNotFoundError(f"Problem while attempting to access "
f"data file '{data_file_path}'."
)
site_data.pop('$schema', None)
if honor_exclusions:
try:
response = requests.get(url=EXCLUSIONS_URL)
if response.status_code == 200:
exclusions = response.text.splitlines()
exclusions = [exclusion.strip() for exclusion in exclusions]
for site in do_not_exclude:
if site in exclusions:
exclusions.remove(site)
for exclusion in exclusions:
try:
site_data.pop(exclusion, None)
except KeyError:
pass
except Exception:
# If there was any problem loading the exclusions, just continue without them
print("Warning: Could not load exclusions, continuing without them.")
honor_exclusions = False
self.sites = {}
# Add all site information from the json file to internal site list.
@ -194,7 +225,7 @@ class SitesInformation:
for site in self.sites:
if self.sites[site].is_nsfw and site.casefold() not in do_not_remove:
continue
sites[site] = self.sites[site]
sites[site] = self.sites[site]
self.sites = sites
def site_name_list(self):