Merge branch 'master' into Issue#2442

Paul Pfeister 2025-09-20 20:12:21 -04:00
commit 37b30602fd
GPG Key ID: 70D33A96CBD7A994
17 changed files with 625 additions and 49 deletions

.github/workflows/exclusions.yml
View File

@ -0,0 +1,89 @@
name: Exclusions Updater
on:
schedule:
#- cron: '0 5 * * 0' # Runs at 05:00 every Sunday
- cron: '0 5 * * *' # Runs at 05:00 every day
workflow_dispatch:
jobs:
update-exclusions:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v5
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: '3.13'
- name: Install Poetry
uses: abatilo/actions-poetry@v4
with:
poetry-version: 'latest'
- name: Install dependencies
run: |
poetry install --no-interaction --with dev
- name: Run false positive tests
run: |
$(poetry env activate)
pytest -q --tb no -m validate_targets_fp -n 20 | tee fp_test_results.txt
deactivate
- name: Parse false positive detections by desired categories
run: |
grep -oP '(?<=test_false_pos\[)[^\]]+(?=\].*result was Claimed)' fp_test_results.txt \
| sort -u > false_positive_exclusions.txt
grep -oP '(?<=test_false_pos\[)[^\]]+(?=\].*result was WAF)' fp_test_results.txt \
| sort -u > waf_hits.txt
- name: Detect if exclusions list changed
id: detect_changes
run: |
git fetch origin exclusions || true
if git show origin/exclusions:false_positive_exclusions.txt >/dev/null 2>&1; then
# If the exclusions branch and file exist, compare
if git diff --quiet origin/exclusions -- false_positive_exclusions.txt; then
echo "exclusions_changed=false" >> "$GITHUB_OUTPUT"
else
echo "exclusions_changed=true" >> "$GITHUB_OUTPUT"
fi
else
# If the exclusions branch or file do not exist, treat as changed
echo "exclusions_changed=true" >> "$GITHUB_OUTPUT"
fi
- name: Quantify and display results
run: |
FP_COUNT=$(wc -l < false_positive_exclusions.txt | xargs)
WAF_COUNT=$(wc -l < waf_hits.txt | xargs)
echo ">>> Found $FP_COUNT false positives and $WAF_COUNT WAF hits."
echo ">>> False positive exclusions:" && cat false_positive_exclusions.txt
echo ">>> WAF hits:" && cat waf_hits.txt
- name: Commit and push exclusions list
if: steps.detect_changes.outputs.exclusions_changed == 'true'
run: |
git config user.name "Paul Pfeister (automation)"
git config user.email "code@pfeister.dev"
mv false_positive_exclusions.txt false_positive_exclusions.txt.tmp
git add -f false_positive_exclusions.txt.tmp # -f required to override .gitignore
git stash push -m "stash false positive exclusion list" -- false_positive_exclusions.txt.tmp
git fetch origin exclusions || true # Allows creation of branch if deleted
git checkout -B exclusions origin/exclusions || (git checkout --orphan exclusions && git rm -rf .)
git stash pop || true
mv false_positive_exclusions.txt.tmp false_positive_exclusions.txt
git rm -f false_positive_exclusions.txt.tmp || true
git add false_positive_exclusions.txt
git commit -m "auto: update exclusions list" || echo "No changes to commit"
git push origin exclusions
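The parse step above assumes pytest failure lines carry both the parametrized site name (`test_false_pos[Site]`) and the assertion text ("result was Claimed" / "result was WAF") on a single line; that line shape is an assumption of this note, not something the workflow enforces. A rough Python equivalent of the grep extraction, run against an invented failure line:

```python
# Sketch only: the failure-line format below is assumed, not taken from the commit.
import re

line = (
    "FAILED tests/test_validate_targets.py::Test_All_Targets::test_false_pos[ExampleSite] "
    "- AssertionError: ExampleSite produced false positive with pattern ^[a-z]+$, result was Claimed"
)
match = re.search(r'(?<=test_false_pos\[)[^\]]+(?=\].*result was Claimed)', line)
print(match.group(0) if match else "no match")  # -> ExampleSite
```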

View File

@ -49,10 +49,10 @@ jobs:
macos-latest,
]
python-version: [
'3.9',
'3.10',
'3.11',
'3.12',
'3.13',
]
steps:
- uses: actions/checkout@v4

View File

@ -0,0 +1,99 @@
name: Modified Target Validation
on:
pull_request:
branches:
- master
paths:
- "sherlock_project/resources/data.json"
jobs:
validate-modified-targets:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v5
with:
ref: ${{ github.event.pull_request.head.sha }}
fetch-depth: 0
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: '3.13'
- name: Install Poetry
uses: abatilo/actions-poetry@v4
with:
poetry-version: 'latest'
- name: Install dependencies
run: |
poetry install --no-interaction --with dev
- name: Discover modified targets
id: discover-modified
run: |
# Fetch the upstream branch
git fetch origin ${{ github.base_ref }} --depth=1
# Discover changes
git show origin/${{ github.base_ref }}:sherlock_project/resources/data.json > data.json.base
cp sherlock_project/resources/data.json data.json.head
CHANGED=$(
python - <<'EOF'
import json
with open("data.json.base") as f: base = json.load(f)
with open("data.json.head") as f: head = json.load(f)
changed = []
for k, v in head.items():
if k not in base or base[k] != v:
changed.append(k)
print(",".join(sorted(changed)))
EOF
)
# Preserve changelist
echo -e ">>> Changed targets: \n$(echo $CHANGED | tr ',' '\n')"
echo "changed_targets=$CHANGED" >> "$GITHUB_OUTPUT"
- name: Validate modified targets
if: steps.discover-modified.outputs.changed_targets != ''
continue-on-error: true
run: |
$(poetry env activate)
pytest -q --tb no -rA -m validate_targets -n 20 --chunked-sites "${{ steps.discover-modified.outputs.changed_targets }}" --junitxml=validation_results.xml
deactivate
- name: Prepare validation summary
if: steps.discover-modified.outputs.changed_targets != ''
id: prepare-summary
run: |
$(poetry env activate)
summary=$(
python devel/summarize_site_validation.py validation_results.xml || echo "Failed to generate summary of test results"
)
deactivate
echo "$summary" > validation_summary.md
- name: Announce validation results
if: steps.discover-modified.outputs.changed_targets != ''
uses: actions/github-script@v8
with:
script: |
const fs = require('fs');
const body = fs.readFileSync('validation_summary.md', 'utf8');
github.rest.issues.createComment({
issue_number: context.payload.pull_request.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: body,
});
- name: This step shows as run when no modifications are found
if: steps.discover-modified.outputs.changed_targets == ''
run: |
echo "No modified targets found"

View File

@ -2,7 +2,7 @@
# 1. Update the version tag in the Dockerfile to match the version in sherlock/__init__.py
# 2. Update the VCS_REF tag to match the tagged version's FULL commit hash
# 3. Build image with BOTH latest and version tags
# i.e. `docker build -t sherlock/sherlock:0.15.0 -t sherlock/sherlock:latest .`
# i.e. `docker build -t sherlock/sherlock:0.16.0 -t sherlock/sherlock:latest .`
FROM python:3.12-slim-bullseye as build
WORKDIR /sherlock

View File

@ -0,0 +1,72 @@
#!/usr/bin/env python
# This module summarizes the results of site validation tests queued by
# workflow validate_modified_targets for presentation in Issue comments.
from defusedxml import ElementTree as ET
import sys
from pathlib import Path
def summarize_junit_xml(xml_path: Path) -> str:
tree = ET.parse(xml_path)
root = tree.getroot()
suite = root.find('testsuite')
pass_message: str = ":heavy_check_mark: &nbsp; Pass"
fail_message: str = ":x: &nbsp; Fail"
if suite is None:
raise ValueError("Invalid JUnit XML: No testsuite found")
summary_lines: list[str] = []
summary_lines.append("#### Automatic validation of changes\n")
summary_lines.append("| Target | F+ Check | F- Check |")
summary_lines.append("|---|---|---|")
failures = int(suite.get('failures', 0))
errors_detected: bool = False
results: dict[str, dict[str, str]] = {}
for testcase in suite.findall('testcase'):
test_name = testcase.get('name').split('[')[0]
site_name = testcase.get('name').split('[')[1].rstrip(']')
failure = testcase.find('failure')
error = testcase.find('error')
if site_name not in results:
results[site_name] = {}
if test_name == "test_false_neg":
results[site_name]['F- Check'] = pass_message if failure is None and error is None else fail_message
elif test_name == "test_false_pos":
results[site_name]['F+ Check'] = pass_message if failure is None and error is None else fail_message
if error is not None:
errors_detected = True
for result in results:
summary_lines.append(f"| {result} | {results[result].get('F+ Check', 'Error!')} | {results[result].get('F- Check', 'Error!')} |")
if failures > 0:
summary_lines.append("\n___\n" +
"\nFailures were detected on at least one updated target. Commits containing accuracy failures" +
" will often not be merged (unless a rationale is provided, such as false negatives due to regional differences).")
if errors_detected:
summary_lines.append("\n___\n" +
"\n**Errors were detected during validation. Please review the workflow logs.**")
return "\n".join(summary_lines)
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: summarize_site_validation.py <junit-xml-file>")
sys.exit(1)
xml_path: Path = Path(sys.argv[1])
if not xml_path.is_file():
print(f"Error: File '{xml_path}' does not exist.")
sys.exit(1)
summary: str = summarize_junit_xml(xml_path)
print(summary)
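For a sense of what the posted comment contains, here is a minimal, hand-written JUnit file run through `summarize_junit_xml` as defined above; the XML is a stand-in for pytest's `--junitxml` output and only mirrors the attributes the function actually reads:

```python
# Hedged example: synthetic report, not real pytest output.
import tempfile
from pathlib import Path

sample = """<?xml version="1.0" encoding="utf-8"?>
<testsuites>
  <testsuite name="pytest" failures="1">
    <testcase name="test_false_pos[ExampleSite]"/>
    <testcase name="test_false_neg[ExampleSite]">
      <failure message="ExampleSite produced false negative"/>
    </testcase>
  </testsuite>
</testsuites>
"""

with tempfile.NamedTemporaryFile("w", suffix=".xml", delete=False) as handle:
    handle.write(sample)

# Prints the markdown table header, one row for ExampleSite (F+ Check passing,
# F- Check failing), and the trailing failures note.
print(summarize_junit_xml(Path(handle.name)))
```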

View File

@ -1,6 +1,6 @@
<p align=center>
<p align="center">
<br>
<a href="https://sherlock-project.github.io/" target="_blank"><img src="images/sherlock-logo.png"/></a>
<a href="https://sherlock-project.github.io/" target="_blank"><img src="images/sherlock-logo.png" alt="sherlock"/></a>
<br>
<span>Hunt down social media accounts by username across <a href="https://sherlockproject.xyz/sites">400+ social networks</a></span>
<br>
@ -15,8 +15,7 @@
</p>
<p align="center">
<img width="70%" height="70%" src="images/demo.png"/>
</a>
<img width="70%" height="70%" src="images/demo.png" alt="demo"/>
</p>
@ -112,17 +111,17 @@ $ echo '{"usernames":["user123"]}' | apify call -so netmilk/sherlock
"https://www.1337x.to/user/user123/",
...
]
}]s
}]
```
Read more about the [Sherlock Actor](../.actor/README.md), including how to use it programmaticaly via the Apify [API](https://apify.com/netmilk/sherlock/api?fpr=sherlock), [CLI](https://docs.apify.com/cli/?fpr=sherlock) and [JS/TS and Python SDKs](https://docs.apify.com/sdk?fpr=sherlock).
Read more about the [Sherlock Actor](../.actor/README.md), including how to use it programmatically via the Apify [API](https://apify.com/netmilk/sherlock/api?fpr=sherlock), [CLI](https://docs.apify.com/cli/?fpr=sherlock) and [JS/TS and Python SDKs](https://docs.apify.com/sdk?fpr=sherlock).
## Credits
Thank you to everyone who has contributed to Sherlock! ❤️
<a href="https://github.com/sherlock-project/sherlock/graphs/contributors">
<img src="https://contrib.rocks/image?&columns=25&max=10000&&repo=sherlock-project/sherlock" noZoom />
<img src="https://contrib.rocks/image?&columns=25&max=10000&&repo=sherlock-project/sherlock" alt="contributors"/>
</a>
## Star history

View File

@ -1982,3 +1982,16 @@ __2025-02-16 :__ Unsure if any way to view profiles exists now
"username_claimed": "t3dotgg"
}
```
## TorrentGalaxy
__2025-07-06 :__ Site appears to have gone offline in March and hasn't come back
```json
"TorrentGalaxy": {
"errorMsg": "<title>TGx:Can't show details</title>",
"errorType": "message",
"regexCheck": "^[A-Za-z0-9]{3,15}$",
"url": "https://torrentgalaxy.to/profile/{}",
"urlMain": "https://torrentgalaxy.to/",
"username_claimed": "GalaxyRG"
},
```

View File

@ -8,8 +8,7 @@ source = "init"
[tool.poetry]
name = "sherlock-project"
# single source of truth for version is __init__.py
version = "0"
version = "0.16.0"
description = "Hunt down social media accounts by username across social networks"
license = "MIT"
authors = [
@ -50,12 +49,20 @@ stem = "^1.8.0"
torrequest = "^0.1.0"
pandas = "^2.2.1"
openpyxl = "^3.0.10"
tomli = "^2.2.1"
[tool.poetry.extras]
tor = ["torrequest"]
[tool.poetry.group.dev.dependencies]
jsonschema = "^4.0.0"
rstr = "^3.2.2"
pytest = "^8.4.2"
pytest-xdist = "^3.8.0"
[tool.poetry.group.ci.dependencies]
defusedxml = "^0.7.1"
[tool.poetry.scripts]
sherlock = 'sherlock_project.sherlock:main'

View File

@ -1,4 +1,7 @@
[pytest]
addopts = --strict-markers
addopts = --strict-markers -m "not validate_targets"
markers =
online: mark tests are requiring internet access.
validate_targets: mark tests for sweeping manifest validation (sends many requests).
validate_targets_fp: validate_targets, false positive tests only.
validate_targets_fn: validate_targets, false negative tests only.

View File

@ -5,11 +5,26 @@ networks.
"""
from importlib.metadata import version as pkg_version, PackageNotFoundError
import pathlib
import tomli
def get_version() -> str:
"""Fetch the version number of the installed package."""
try:
return pkg_version("sherlock_project")
except PackageNotFoundError:
pyproject_path: pathlib.Path = pathlib.Path(__file__).resolve().parent.parent / "pyproject.toml"
with pyproject_path.open("rb") as f:
pyproject_data = tomli.load(f)
return pyproject_data["tool"]["poetry"]["version"]
# This variable is only used to check for ImportErrors induced by users running as script rather than as module or package
import_error_test_var = None
__shortname__ = "Sherlock"
__longname__ = "Sherlock: Find Usernames Across Social Networks"
__version__ = "0.15.0"
__version__ = get_version()
forge_api_latest_release = "https://api.github.com/repos/sherlock-project/sherlock/releases/latest"

View File

@ -258,6 +258,12 @@
"urlMain": "https://www.blipfoto.com/",
"username_claimed": "blue"
},
"Blitz Tactics": {
"errorType": "status_code",
"url": "https://blitztactics.com/{}",
"urlMain": "https://blitztactics.com/",
"username_claimed": "Lance5500"
},
"Blogger": {
"errorType": "status_code",
"regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$",
@ -265,6 +271,13 @@
"urlMain": "https://www.blogger.com/",
"username_claimed": "blue"
},
"Bluesky": {
"errorType": "status_code",
"url": "https://bsky.app/profile/{}.bsky.social",
"urlProbe": "https://public.api.bsky.app/xrpc/app.bsky.actor.getProfile?actor={}.bsky.social",
"urlMain": "https://bsky.app/",
"username_claimed": "mcuban"
},
"BoardGameGeek": {
"errorType": "message",
"regexCheck": "^[a-zA-Z0-9_]*$",
@ -365,6 +378,12 @@
"urlMain": "https://career.habr.com/",
"username_claimed": "blue"
},
"CashApp": {
"errorType": "status_code",
"url": "https://cash.app/${}",
"urlMain": "https://cash.app",
"username_claimed": "hotdiggitydog"
},
"Championat": {
"errorType": "status_code",
"url": "https://www.championat.com/user/{}",
@ -603,7 +622,7 @@
"urlMain": "https://forums.digitalspy.com/",
"username_claimed": "blue",
"regexCheck": "^\\w{3,20}$"
},
},
"Discogs": {
"errorType": "status_code",
"url": "https://www.discogs.com/user/{}",
@ -789,13 +808,12 @@
"urlMain": "https://fosstodon.org/",
"username_claimed": "blue"
},
"Freelance.habr": {
"errorMsg": "<div class=\"icon_user_locked\"></div>",
"errorType": "message",
"regexCheck": "^((?!\\.).)*$",
"url": "https://freelance.habr.com/freelancers/{}",
"urlMain": "https://freelance.habr.com/",
"username_claimed": "adam"
"Framapiaf": {
"errorType": "status_code",
"regexCheck": "^[a-zA-Z0-9_]{1,30}$",
"url": "https://framapiaf.org/@{}",
"urlMain": "https://framapiaf.org",
"username_claimed": "pylapp"
},
"Freelancer": {
"errorMsg": "\"users\":{}",
@ -1124,6 +1142,13 @@
"urlProbe": "https://imginn.com/{}",
"username_claimed": "instagram"
},
"Instapaper": {
"errorType": "status_code",
"request_method": "GET",
"url": "https://www.instapaper.com/p/{}",
"urlMain": "https://www.instapaper.com/",
"username_claimed": "john"
},
"Instructables": {
"errorType": "status_code",
"url": "https://www.instructables.com/member/{}",
@ -1236,6 +1261,13 @@
"urlMain": "https://linux.org.ru/",
"username_claimed": "red"
},
"Laracast": {
"errorType":"status_code",
"url": "https://laracasts.com/@{}",
"urlMain": "https://laracasts.com/",
"regexCheck": "^[a-zA-Z0-9_-]{3,}$",
"username_claimed": "user1"
},
"Launchpad": {
"errorType": "status_code",
"url": "https://launchpad.net/~{}",
@ -1279,7 +1311,6 @@
},
"LinkedIn": {
"errorType": "status_code",
"regexCheck": "^[a-zA-Z0-9]{3,100}$",
"request_method": "GET",
"url": "https://linkedin.com/in/{}",
@ -1294,6 +1325,12 @@
"urlMain": "https://linktr.ee/",
"username_claimed": "anne"
},
"LinuxFR.org": {
"errorType": "status_code",
"url": "https://linuxfr.org/users/{}",
"urlMain": "https://linuxfr.org/",
"username_claimed": "pylapp"
},
"Listed": {
"errorType": "response_url",
"errorUrl": "https://listed.to/@{}",
@ -1334,6 +1371,13 @@
"urlMain": "https://forums.mmorpg.com/",
"username_claimed": "goku"
},
"Mamot": {
"errorType": "status_code",
"regexCheck": "^[a-zA-Z0-9_]{1,30}$",
"url": "https://mamot.fr/@{}",
"urlMain": "https://mamot.fr/",
"username_claimed": "anciensEnssat"
},
"Medium": {
"errorMsg": "<body",
"errorType": "message",
@ -1349,8 +1393,8 @@
"username_claimed": "blue"
},
"Minecraft": {
"errorCode": 204,
"errorType": "status_code",
"errorMsg": "Couldn't find any profile with name",
"errorType": "message",
"url": "https://api.mojang.com/users/profiles/minecraft/{}",
"urlMain": "https://minecraft.net/",
"username_claimed": "blue"
@ -1495,6 +1539,13 @@
"urlMain": "https://nyaa.si/",
"username_claimed": "blue"
},
"Open Collective": {
"errorMsg": "Oops! Page not found",
"errorType": "message",
"url": "https://opencollective.com/{}",
"urlMain": "https://opencollective.com/",
"username_claimed": "pylapp"
},
"OpenStreetMap": {
"errorType": "status_code",
"regexCheck": "^[^.]*?$",
@ -1515,6 +1566,13 @@
"urlMain": "https://ourdjtalk.com/",
"username_claimed": "steve"
},
"Outgress": {
"errorMsg": "Outgress - Error",
"errorType": "message",
"url": "https://outgress.com/agents/{}",
"urlMain": "https://outgress.com/",
"username_claimed": "pylapp"
},
"PCGamer": {
"errorMsg": "The specified member cannot be found. Please enter a member's entire name.",
"errorType": "message",
@ -1576,12 +1634,31 @@
"urlMain": "https://www.pinkbike.com/",
"username_claimed": "blue"
},
"pixelfed.social": {
"errorType": "status_code",
"url": "https://pixelfed.social/{}/",
"urlMain": "https://pixelfed.social",
"username_claimed": "pylapp"
},
"PlayStore": {
"errorType": "status_code",
"url": "https://play.google.com/store/apps/developer?id={}",
"urlMain": "https://play.google.com/store",
"username_claimed": "Facebook"
},
"Playstrategy": {
"errorType": "status_code",
"url": "https://playstrategy.org/@/{}",
"urlMain": "https://playstrategy.org",
"username_claimed": "oruro"
},
"Plurk": {
"errorMsg": "User Not Found!",
"errorType": "message",
"url": "https://www.plurk.com/{}",
"urlMain": "https://www.plurk.com/",
"username_claimed": "plurkoffice"
},
"PocketStars": {
"errorMsg": "Join Your Favorite Adult Stars",
"errorType": "message",
@ -1629,6 +1706,20 @@
"urlMain": "https://www.producthunt.com/",
"username_claimed": "jenny"
},
"programming.dev": {
"errorMsg": "Error!",
"errorType": "message",
"url": "https://programming.dev/u/{}",
"urlMain": "https://programming.dev",
"username_claimed": "pylapp"
},
"Pychess": {
"errorType": "message",
"errorMsg": "404",
"url": "https://www.pychess.org/@/{}",
"urlMain": "https://www.pychess.org",
"username_claimed": "gbtami"
},
"PromoDJ": {
"errorType": "status_code",
"url": "http://promodj.com/{}",
@ -1880,6 +1971,12 @@
"urlMain": "https://soylentnews.org",
"username_claimed": "adam"
},
"SpeakerDeck": {
"errorType": "status_code",
"url": "https://speakerdeck.com/{}",
"urlMain": "https://speakerdeck.com/",
"username_claimed": "pylapp"
},
"Speedrun.com": {
"errorType": "status_code",
"url": "https://speedrun.com/users/{}",
@ -2029,14 +2126,6 @@
"urlMain": "https://www.tnaflix.com/",
"username_claimed": "hacker"
},
"TorrentGalaxy": {
"errorMsg": "<title>TGx:Can't show details</title>",
"errorType": "message",
"regexCheck": "^[A-Za-z0-9]{3,15}$",
"url": "https://torrentgalaxy.to/profile/{}",
"urlMain": "https://torrentgalaxy.to/",
"username_claimed": "GalaxyRG"
},
"TradingView": {
"errorType": "status_code",
"request_method": "GET",
@ -2719,12 +2808,24 @@
"urlMain": "https://www.toster.ru/",
"username_claimed": "adam"
},
"tumblr": {
"errorType": "status_code",
"url": "https://{}.tumblr.com/",
"urlMain": "https://www.tumblr.com/",
"username_claimed": "goku"
},
"uid": {
"errorType": "status_code",
"url": "http://uid.me/{}",
"urlMain": "https://uid.me/",
"username_claimed": "blue"
},
"write.as": {
"errorType": "status_code",
"url": "https://write.as/{}",
"urlMain": "https://write.as",
"username_claimed": "pylapp"
},
"xHamster": {
"errorType": "status_code",
"isNSFW": true,
@ -2745,5 +2846,13 @@
"urlProbe": "https://public.api.bsky.app/xrpc/app.bsky.actor.getProfile?actor={}.bsky.social",
"urlMain": "https://bsky.app/",
"username_claimed": "mcuban"
},
"Platzi": {
"errorType": "status_code",
"errorCode": 404,
"url": "https://platzi.com/p/{}/",
"urlMain": "https://platzi.com/",
"username_claimed": "freddier",
"request_method": "GET"
}
}

View File

@ -169,14 +169,14 @@ def multiple_usernames(username):
def sherlock(
username: str,
site_data: dict,
site_data: dict[str, dict[str, str]],
query_notify: QueryNotify,
tor: bool = False,
unique_tor: bool = False,
dump_response: bool = False,
proxy: Optional[str] = None,
timeout: int = 60,
):
) -> dict[str, dict[str, str | QueryResult]]:
"""Run Sherlock Analysis.
Checks for existence of username on various social media sites.
@ -507,7 +507,7 @@ def sherlock(
print("+++++++++++++++++++++")
# Notify caller about results of query.
result = QueryResult(
result: QueryResult = QueryResult(
username=username,
site_name=social_network,
site_url_user=url,
@ -727,6 +727,14 @@ def main():
help="Disable creation of a txt file",
)
parser.add_argument(
"--ignore-exclusions",
action="store_true",
dest="ignore_exclusions",
default=False,
help="Ignore upstream exclusions (may return more false positives)",
)
args = parser.parse_args()
# If the user presses CTRL-C, exit gracefully without throwing errors
@ -784,7 +792,8 @@ def main():
try:
if args.local:
sites = SitesInformation(
os.path.join(os.path.dirname(__file__), "resources/data.json")
os.path.join(os.path.dirname(__file__), "resources/data.json"),
honor_exclusions=False,
)
else:
json_file_location = args.json_file
@ -804,7 +813,11 @@ def main():
head_commit_sha = pull_request_json["head"]["sha"]
json_file_location = f"https://raw.githubusercontent.com/sherlock-project/sherlock/{head_commit_sha}/sherlock_project/resources/data.json"
sites = SitesInformation(json_file_location)
sites = SitesInformation(
data_file_path=json_file_location,
honor_exclusions=not args.ignore_exclusions,
do_not_exclude=args.site_list,
)
except Exception as error:
print(f"ERROR: {error}")
sys.exit(1)

View File

@ -7,6 +7,10 @@ import json
import requests
import secrets
MANIFEST_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json"
EXCLUSIONS_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/refs/heads/exclusions/false_positive_exclusions.txt"
class SiteInformation:
def __init__(self, name, url_home, url_username_format, username_claimed,
information, is_nsfw, username_unclaimed=secrets.token_urlsafe(10)):
@ -67,12 +71,17 @@ class SiteInformation:
Return Value:
Nicely formatted string to get information about this object.
"""
return f"{self.name} ({self.url_home})"
class SitesInformation:
def __init__(self, data_file_path=None):
def __init__(
self,
data_file_path: str|None = None,
honor_exclusions: bool = True,
do_not_exclude: list[str] = [],
):
"""Create Sites Information Object.
Contains information about all supported websites.
@ -110,7 +119,7 @@ class SitesInformation:
# The default data file is the live data.json which is in the GitHub repo. The reason why we are using
# this instead of the local one is so that the user has the most up-to-date data. This prevents
# users from creating issue about false positives which has already been fixed or having outdated data
data_file_path = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json"
data_file_path = MANIFEST_URL
# Ensure that specified data file has correct extension.
if not data_file_path.lower().endswith(".json"):
@ -152,9 +161,31 @@ class SitesInformation:
raise FileNotFoundError(f"Problem while attempting to access "
f"data file '{data_file_path}'."
)
site_data.pop('$schema', None)
if honor_exclusions:
try:
response = requests.get(url=EXCLUSIONS_URL)
if response.status_code == 200:
exclusions = response.text.splitlines()
exclusions = [exclusion.strip() for exclusion in exclusions]
for site in do_not_exclude:
if site in exclusions:
exclusions.remove(site)
for exclusion in exclusions:
try:
site_data.pop(exclusion, None)
except KeyError:
pass
except Exception:
# If there was any problem loading the exclusions, just continue without them
print("Warning: Could not load exclusions, continuing without them.")
honor_exclusions = False
self.sites = {}
# Add all site information from the json file to internal site list.
@ -194,7 +225,7 @@ class SitesInformation:
for site in self.sites:
if self.sites[site].is_nsfw and site.casefold() not in do_not_remove:
continue
sites[site] = self.sites[site]
sites[site] = self.sites[site]
self.sites = sites
def site_name_list(self):
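Given the constructor signature added above, a minimal usage sketch; "ExampleSite" is a placeholder name, not a real manifest entry:

```python
# Usage sketch for the new SitesInformation parameters.
from sherlock_project.sites import SitesInformation

# Default: fetch the upstream exclusions list and drop those targets.
sites = SitesInformation()

# Honor exclusions but keep one target anyway (what the --site list passes via do_not_exclude).
sites_keep_one = SitesInformation(do_not_exclude=["ExampleSite"])

# Skip the exclusions fetch entirely (what --ignore-exclusions and --local set up in main()).
sites_all = SitesInformation(honor_exclusions=False)
```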

View File

@ -4,6 +4,11 @@ import urllib
import pytest
from sherlock_project.sites import SitesInformation
def fetch_local_manifest(honor_exclusions: bool = True) -> dict[str, dict[str, str]]:
sites_obj = SitesInformation(data_file_path=os.path.join(os.path.dirname(__file__), "../sherlock_project/resources/data.json"), honor_exclusions=honor_exclusions)
sites_iterable: dict[str, dict[str, str]] = {site.name: site.information for site in sites_obj}
return sites_iterable
@pytest.fixture()
def sites_obj():
sites_obj = SitesInformation(data_file_path=os.path.join(os.path.dirname(__file__), "../sherlock_project/resources/data.json"))
@ -11,9 +16,7 @@ def sites_obj():
@pytest.fixture(scope="session")
def sites_info():
sites_obj = SitesInformation(data_file_path=os.path.join(os.path.dirname(__file__), "../sherlock_project/resources/data.json"))
sites_iterable = {site.name: site.information for site in sites_obj}
yield sites_iterable
yield fetch_local_manifest()
@pytest.fixture(scope="session")
def remote_schema():
@ -21,3 +24,28 @@ def remote_schema():
with urllib.request.urlopen(schema_url) as remoteschema:
schemadat = json.load(remoteschema)
yield schemadat
def pytest_addoption(parser):
parser.addoption(
"--chunked-sites",
action="store",
default=None,
help="For tests utilizing chunked sites, include only the (comma-separated) site(s) specified.",
)
def pytest_generate_tests(metafunc):
if "chunked_sites" in metafunc.fixturenames:
sites_info = fetch_local_manifest(honor_exclusions=False)
# Ingest and apply site selections
site_filter: str | None = metafunc.config.getoption("--chunked-sites")
if site_filter:
selected_sites: list[str] = [site.strip() for site in site_filter.split(",")]
sites_info = {
site: data for site, data in sites_info.items()
if site in selected_sites
}
params = [{name: data} for name, data in sites_info.items()]
ids = list(sites_info.keys())
metafunc.parametrize("chunked_sites", params, ids=ids)

View File

@ -7,7 +7,7 @@ def test_validate_manifest_against_local_schema():
"""Ensures that the manifest matches the local schema, for situations where the schema is being changed."""
json_relative: str = '../sherlock_project/resources/data.json'
schema_relative: str = '../sherlock_project/resources/data.schema.json'
json_path: str = os.path.join(os.path.dirname(__file__), json_relative)
schema_path: str = os.path.join(os.path.dirname(__file__), schema_relative)

View File

@ -0,0 +1,99 @@
import pytest
import re
import rstr
from sherlock_project.sherlock import sherlock
from sherlock_project.notify import QueryNotify
from sherlock_project.result import QueryResult, QueryStatus
FALSE_POSITIVE_ATTEMPTS: int = 2 # Since the usernames are randomly generated, it's POSSIBLE that a real username can be hit
FALSE_POSITIVE_QUANTIFIER_UPPER_BOUND: int = 15 # If a pattern uses quantifiers such as `+` `*` or `{n,}`, limit the upper bound (0 to disable)
FALSE_POSITIVE_DEFAULT_PATTERN: str = r'^[a-zA-Z0-9]{7,20}$' # Used in absence of a regexCheck entry
def set_pattern_upper_bound(pattern: str, upper_bound: int = FALSE_POSITIVE_QUANTIFIER_UPPER_BOUND) -> str:
"""Set upper bound for regex patterns that use quantifiers such as `+` `*` or `{n,}`."""
def replace_upper_bound(match: re.Match) -> str:  # type: ignore
lower_bound: int = int(match.group(1)) if match.group(1) else 0  # type: ignore
# Bind a new local name: assigning to upper_bound here would shadow the enclosing
# variable and raise UnboundLocalError when a `{n,}` quantifier is rewritten.
bound: int = upper_bound if lower_bound < upper_bound else lower_bound
return f'{{{lower_bound},{bound}}}'
pattern = re.sub(r'(?<!\\)\{(\d+),\}', replace_upper_bound, pattern) # {n,}
pattern = re.sub(r'(?<!\\)\+', f'{{1,{upper_bound}}}', pattern) # +
pattern = re.sub(r'(?<!\\)\*', f'{{0,{upper_bound}}}', pattern) # *
return pattern
def false_positive_check(sites_info: dict[str, dict[str, str]], site: str, pattern: str) -> QueryStatus:
"""Check if a site is likely to produce false positives."""
status: QueryStatus = QueryStatus.UNKNOWN
for _ in range(FALSE_POSITIVE_ATTEMPTS):
query_notify: QueryNotify = QueryNotify()
username: str = rstr.xeger(pattern)
result: QueryResult | str = sherlock(
username=username,
site_data=sites_info,
query_notify=query_notify,
)[site]['status']
if not hasattr(result, 'status'):
raise TypeError(f"Result for site {site} does not have 'status' attribute. Actual result: {result}")
if type(result.status) is not QueryStatus: # type: ignore
raise TypeError(f"Result status for site {site} is not of type QueryStatus. Actual type: {type(result.status)}") # type: ignore
status = result.status # type: ignore
if status in (QueryStatus.AVAILABLE, QueryStatus.WAF):
return status
return status
def false_negative_check(sites_info: dict[str, dict[str, str]], site: str) -> QueryStatus:
"""Check if a site is likely to produce false negatives."""
status: QueryStatus = QueryStatus.UNKNOWN
query_notify: QueryNotify = QueryNotify()
result: QueryResult | str = sherlock(
username=sites_info[site]['username_claimed'],
site_data=sites_info,
query_notify=query_notify,
)[site]['status']
if not hasattr(result, 'status'):
raise TypeError(f"Result for site {site} does not have 'status' attribute. Actual result: {result}")
if type(result.status) is not QueryStatus: # type: ignore
raise TypeError(f"Result status for site {site} is not of type QueryStatus. Actual type: {type(result.status)}") # type: ignore
status = result.status # type: ignore
return status
@pytest.mark.validate_targets
@pytest.mark.online
class Test_All_Targets:
@pytest.mark.validate_targets_fp
def test_false_pos(self, chunked_sites: dict[str, dict[str, str]]):
"""Iterate through all sites in the manifest to discover possible false-positive inducting targets."""
pattern: str
for site in chunked_sites:
try:
pattern = chunked_sites[site]['regexCheck']
except KeyError:
pattern = FALSE_POSITIVE_DEFAULT_PATTERN
if FALSE_POSITIVE_QUANTIFIER_UPPER_BOUND > 0:
pattern = set_pattern_upper_bound(pattern)
result: QueryStatus = false_positive_check(chunked_sites, site, pattern)
assert result is QueryStatus.AVAILABLE, f"{site} produced false positive with pattern {pattern}, result was {result}"
@pytest.mark.validate_targets_fn
def test_false_neg(self, chunked_sites: dict[str, dict[str, str]]):
"""Iterate through all sites in the manifest to discover possible false-negative inducting targets."""
for site in chunked_sites:
result: QueryStatus = false_negative_check(chunked_sites, site)
assert result is QueryStatus.CLAIMED, f"{site} produced false negative, result was {result}"
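To make the quantifier bounding concrete, a few invented patterns and what `set_pattern_upper_bound` is intended to return with the default upper bound of 15:

```python
# Invented patterns only; results assume the default FALSE_POSITIVE_QUANTIFIER_UPPER_BOUND of 15.
print(set_pattern_upper_bound(r'^[a-zA-Z0-9_]+$'))  # -> ^[a-zA-Z0-9_]{1,15}$
print(set_pattern_upper_bound(r'^[a-z]{3,}$'))      # -> ^[a-z]{3,15}$
print(set_pattern_upper_bound(r'^\w{5,40}$'))       # -> ^\w{5,40}$  (already bounded, left unchanged)
```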

View File

@ -7,8 +7,6 @@ envlist =
py312
py311
py310
py39
py38
[testenv]
description = Attempt to build and install the package
@ -16,6 +14,7 @@ deps =
coverage
jsonschema
pytest
rstr
allowlist_externals = coverage
commands =
coverage run --source=sherlock_project --module pytest -v
@ -37,7 +36,7 @@ commands =
[gh-actions]
python =
3.13: py313
3.12: py312
3.11: py311
3.10: py310
3.9: py39