Merge branch 'master' into Issue#2442

This commit is contained in:
Paul Pfeister 2025-09-20 20:12:21 -04:00
commit 37b30602fd
No known key found for this signature in database
GPG Key ID: 70D33A96CBD7A994
17 changed files with 625 additions and 49 deletions

.github/workflows/exclusions.yml
View File

@ -0,0 +1,89 @@
name: Exclusions Updater

on:
  schedule:
    #- cron: '0 5 * * 0' # Runs at 05:00 every Sunday
    - cron: '0 5 * * *' # Runs at 05:00 every day
  workflow_dispatch:

jobs:
  update-exclusions:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v5

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.13'

      - name: Install Poetry
        uses: abatilo/actions-poetry@v4
        with:
          poetry-version: 'latest'

      - name: Install dependencies
        run: |
          poetry install --no-interaction --with dev

      - name: Run false positive tests
        run: |
          $(poetry env activate)
          pytest -q --tb no -m validate_targets_fp -n 20 | tee fp_test_results.txt
          deactivate

      - name: Parse false positive detections by desired categories
        run: |
          grep -oP '(?<=test_false_pos\[)[^\]]+(?=\].*result was Claimed)' fp_test_results.txt \
            | sort -u > false_positive_exclusions.txt
          grep -oP '(?<=test_false_pos\[)[^\]]+(?=\].*result was WAF)' fp_test_results.txt \
            | sort -u > waf_hits.txt

      - name: Detect if exclusions list changed
        id: detect_changes
        run: |
          git fetch origin exclusions || true
          if git show origin/exclusions:false_positive_exclusions.txt >/dev/null 2>&1; then
            # If the exclusions branch and file exist, compare
            if git diff --quiet origin/exclusions -- false_positive_exclusions.txt; then
              echo "exclusions_changed=false" >> "$GITHUB_OUTPUT"
            else
              echo "exclusions_changed=true" >> "$GITHUB_OUTPUT"
            fi
          else
            # If the exclusions branch or file do not exist, treat as changed
            echo "exclusions_changed=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Quantify and display results
        run: |
          FP_COUNT=$(wc -l < false_positive_exclusions.txt | xargs)
          WAF_COUNT=$(wc -l < waf_hits.txt | xargs)
          echo ">>> Found $FP_COUNT false positives and $WAF_COUNT WAF hits."
          echo ">>> False positive exclusions:" && cat false_positive_exclusions.txt
          echo ">>> WAF hits:" && cat waf_hits.txt

      - name: Commit and push exclusions list
        if: steps.detect_changes.outputs.exclusions_changed == 'true'
        run: |
          git config user.name "Paul Pfeister (automation)"
          git config user.email "code@pfeister.dev"
          mv false_positive_exclusions.txt false_positive_exclusions.txt.tmp
          git add -f false_positive_exclusions.txt.tmp # -f required to override .gitignore
          git stash push -m "stash false positive exclusion list" -- false_positive_exclusions.txt.tmp
          git fetch origin exclusions || true # Allows creation of branch if deleted
          git checkout -B exclusions origin/exclusions || (git checkout --orphan exclusions && git rm -rf .)
          git stash pop || true
          mv false_positive_exclusions.txt.tmp false_positive_exclusions.txt
          git rm -f false_positive_exclusions.txt.tmp || true
          git add false_positive_exclusions.txt
          git commit -m "auto: update exclusions list" || echo "No changes to commit"
          git push origin exclusions
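Note on the parsing step: the `grep -oP` patterns above pull the site name out of each failing `test_false_pos[...]` line in the captured pytest output. A minimal Python sketch of the same extraction, using a hypothetical failure line (the test file name and site name are made up for illustration):

```python
import re

# Hypothetical pytest failure line of the kind fp_test_results.txt would contain
line = ("FAILED tests/test_probe.py::Test_All_Targets::test_false_pos[ExampleSite] "
        "- AssertionError: ExampleSite produced false positive with pattern ^[a-zA-Z0-9]{7,20}$, result was Claimed")

# Same lookbehind/lookahead used by the workflow's grep step
site = re.search(r'(?<=test_false_pos\[)[^\]]+(?=\].*result was Claimed)', line)
print(site.group(0))  # ExampleSite
```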

View File

@ -49,10 +49,10 @@ jobs:
          macos-latest,
        ]
        python-version: [
-          '3.9',
          '3.10',
          '3.11',
          '3.12',
+          '3.13',
        ]
    steps:
      - uses: actions/checkout@v4

View File

@ -0,0 +1,99 @@
name: Modified Target Validation

on:
  pull_request:
    branches:
      - master
    paths:
      - "sherlock_project/resources/data.json"

jobs:
  validate-modified-targets:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v5
        with:
          ref: ${{ github.event.pull_request.head.sha }}
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.13'

      - name: Install Poetry
        uses: abatilo/actions-poetry@v4
        with:
          poetry-version: 'latest'

      - name: Install dependencies
        run: |
          poetry install --no-interaction --with dev

      - name: Discover modified targets
        id: discover-modified
        run: |
          # Fetch the upstream branch
          git fetch origin ${{ github.base_ref }} --depth=1

          # Discover changes
          git show origin/${{ github.base_ref }}:sherlock_project/resources/data.json > data.json.base
          cp sherlock_project/resources/data.json data.json.head
          CHANGED=$(
          python - <<'EOF'
          import json
          with open("data.json.base") as f: base = json.load(f)
          with open("data.json.head") as f: head = json.load(f)
          changed = []
          for k, v in head.items():
              if k not in base or base[k] != v:
                  changed.append(k)
          print(",".join(sorted(changed)))
          EOF
          )

          # Preserve changelist
          echo -e ">>> Changed targets: \n$(echo $CHANGED | tr ',' '\n')"
          echo "changed_targets=$CHANGED" >> "$GITHUB_OUTPUT"

      - name: Validate modified targets
        if: steps.discover-modified.outputs.changed_targets != ''
        continue-on-error: true
        run: |
          $(poetry env activate)
          pytest -q --tb no -rA -m validate_targets -n 20 --chunked-sites "${{ steps.discover-modified.outputs.changed_targets }}" --junitxml=validation_results.xml
          deactivate

      - name: Prepare validation summary
        if: steps.discover-modified.outputs.changed_targets != ''
        id: prepare-summary
        run: |
          $(poetry env activate)
          summary=$(
            python devel/summarize_site_validation.py validation_results.xml || echo "Failed to generate summary of test results"
          )
          deactivate
          echo "$summary" > validation_summary.md

      - name: Announce validation results
        if: steps.discover-modified.outputs.changed_targets != ''
        uses: actions/github-script@v8
        with:
          script: |
            const fs = require('fs');
            const body = fs.readFileSync('validation_summary.md', 'utf8');
            github.rest.issues.createComment({
              issue_number: context.payload.pull_request.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: body,
            });

      - name: This step shows as run when no modifications are found
        if: steps.discover-modified.outputs.changed_targets == ''
        run: |
          echo "No modified targets found"
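The change-discovery logic above only flags manifest entries that were added or modified relative to the base branch. A small self-contained sketch of that comparison with made-up manifests:

```python
# Hypothetical base/head manifests; only SiteB (modified) and SiteC (new) are flagged
base = {"SiteA": {"url": "https://a.example/{}"}, "SiteB": {"url": "https://b.example/{}"}}
head = {"SiteA": {"url": "https://a.example/{}"}, "SiteB": {"url": "https://b.example/u/{}"}, "SiteC": {"url": "https://c.example/{}"}}

changed = [k for k, v in head.items() if k not in base or base[k] != v]
print(",".join(sorted(changed)))  # SiteB,SiteC
```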

View File

@ -2,7 +2,7 @@
# 1. Update the version tag in the Dockerfile to match the version in sherlock/__init__.py
# 2. Update the VCS_REF tag to match the tagged version's FULL commit hash
# 3. Build image with BOTH latest and version tags
-#    i.e. `docker build -t sherlock/sherlock:0.15.0 -t sherlock/sherlock:latest .`
+#    i.e. `docker build -t sherlock/sherlock:0.16.0 -t sherlock/sherlock:latest .`

FROM python:3.12-slim-bullseye as build
WORKDIR /sherlock

View File

@ -0,0 +1,72 @@
#!/usr/bin/env python

# This module summarizes the results of site validation tests queued by
# workflow validate_modified_targets for presentation in Issue comments.

from defusedxml import ElementTree as ET
import sys
from pathlib import Path


def summarize_junit_xml(xml_path: Path) -> str:
    tree = ET.parse(xml_path)
    root = tree.getroot()
    suite = root.find('testsuite')

    pass_message: str = ":heavy_check_mark: &nbsp; Pass"
    fail_message: str = ":x: &nbsp; Fail"

    if suite is None:
        raise ValueError("Invalid JUnit XML: No testsuite found")

    summary_lines: list[str] = []
    summary_lines.append("#### Automatic validation of changes\n")
    summary_lines.append("| Target | F+ Check | F- Check |")
    summary_lines.append("|---|---|---|")

    failures = int(suite.get('failures', 0))
    errors_detected: bool = False

    results: dict[str, dict[str, str]] = {}
    for testcase in suite.findall('testcase'):
        test_name = testcase.get('name').split('[')[0]
        site_name = testcase.get('name').split('[')[1].rstrip(']')
        failure = testcase.find('failure')
        error = testcase.find('error')

        if site_name not in results:
            results[site_name] = {}

        if test_name == "test_false_neg":
            results[site_name]['F- Check'] = pass_message if failure is None and error is None else fail_message
        elif test_name == "test_false_pos":
            results[site_name]['F+ Check'] = pass_message if failure is None and error is None else fail_message

        if error is not None:
            errors_detected = True

    for result in results:
        summary_lines.append(f"| {result} | {results[result].get('F+ Check', 'Error!')} | {results[result].get('F- Check', 'Error!')} |")

    if failures > 0:
        summary_lines.append("\n___\n" +
                             "\nFailures were detected on at least one updated target. Commits containing accuracy failures" +
                             " will often not be merged (unless a rationale is provided, such as false negatives due to regional differences).")
    if errors_detected:
        summary_lines.append("\n___\n" +
                             "\n**Errors were detected during validation. Please review the workflow logs.**")

    return "\n".join(summary_lines)


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: summarize_site_validation.py <junit-xml-file>")
        sys.exit(1)

    xml_path: Path = Path(sys.argv[1])
    if not xml_path.is_file():
        print(f"Error: File '{xml_path}' does not exist.")
        sys.exit(1)

    summary: str = summarize_junit_xml(xml_path)
    print(summary)
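For reference, a rough local exercise of the summarizer (the sample XML, file name, and import path are illustrative; in CI the script is invoked as `python devel/summarize_site_validation.py <junit-xml-file>`):

```python
from pathlib import Path
from summarize_site_validation import summarize_junit_xml  # assumes the script's directory is on sys.path

# Minimal hand-written JUnit XML with one passing and one failing check
sample = """<testsuites>
  <testsuite failures="1">
    <testcase name="test_false_pos[ExampleSite]"/>
    <testcase name="test_false_neg[ExampleSite]"><failure message="false negative"/></testcase>
  </testsuite>
</testsuites>"""

xml_path = Path("sample_results.xml")
xml_path.write_text(sample)
print(summarize_junit_xml(xml_path))  # prints the Markdown table that gets posted to the PR
```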

View File

@ -1,6 +1,6 @@
-<p align=center>
+<p align="center">
  <br>
-  <a href="https://sherlock-project.github.io/" target="_blank"><img src="images/sherlock-logo.png"/></a>
+  <a href="https://sherlock-project.github.io/" target="_blank"><img src="images/sherlock-logo.png" alt="sherlock"/></a>
  <br>
  <span>Hunt down social media accounts by username across <a href="https://sherlockproject.xyz/sites">400+ social networks</a></span>
  <br>
@ -15,8 +15,7 @@
</p>
<p align="center">
-  <img width="70%" height="70%" src="images/demo.png"/>
+  <img width="70%" height="70%" src="images/demo.png" alt="demo"/>
-  </a>
</p>
@ -112,17 +111,17 @@ $ echo '{"usernames":["user123"]}' | apify call -so netmilk/sherlock
    "https://www.1337x.to/user/user123/",
    ...
  ]
-}]s
+}]
```

-Read more about the [Sherlock Actor](../.actor/README.md), including how to use it programmaticaly via the Apify [API](https://apify.com/netmilk/sherlock/api?fpr=sherlock), [CLI](https://docs.apify.com/cli/?fpr=sherlock) and [JS/TS and Python SDKs](https://docs.apify.com/sdk?fpr=sherlock).
+Read more about the [Sherlock Actor](../.actor/README.md), including how to use it programmatically via the Apify [API](https://apify.com/netmilk/sherlock/api?fpr=sherlock), [CLI](https://docs.apify.com/cli/?fpr=sherlock) and [JS/TS and Python SDKs](https://docs.apify.com/sdk?fpr=sherlock).

## Credits

Thank you to everyone who has contributed to Sherlock! ❤️

<a href="https://github.com/sherlock-project/sherlock/graphs/contributors">
-  <img src="https://contrib.rocks/image?&columns=25&max=10000&&repo=sherlock-project/sherlock" noZoom />
+  <img src="https://contrib.rocks/image?&columns=25&max=10000&&repo=sherlock-project/sherlock" alt="contributors"/>
</a>

## Star history

View File

@ -1982,3 +1982,16 @@ __2025-02-16 :__ Unsure if any way to view profiles exists now
   "username_claimed": "t3dotgg"
}
```
+
+## TorrentGalaxy
+
+__2025-07-06 :__ Site appears to have gone offline in March and hasn't come back
+
+```json
+"TorrentGalaxy": {
+  "errorMsg": "<title>TGx:Can't show details</title>",
+  "errorType": "message",
+  "regexCheck": "^[A-Za-z0-9]{3,15}$",
+  "url": "https://torrentgalaxy.to/profile/{}",
+  "urlMain": "https://torrentgalaxy.to/",
+  "username_claimed": "GalaxyRG"
+},
+```

View File

@ -8,8 +8,7 @@ source = "init"
[tool.poetry]
name = "sherlock-project"
-# single source of truth for version is __init__.py
-version = "0"
+version = "0.16.0"
description = "Hunt down social media accounts by username across social networks"
license = "MIT"
authors = [
@ -50,12 +49,20 @@ stem = "^1.8.0"
torrequest = "^0.1.0"
pandas = "^2.2.1"
openpyxl = "^3.0.10"
+tomli = "^2.2.1"

[tool.poetry.extras]
tor = ["torrequest"]

[tool.poetry.group.dev.dependencies]
jsonschema = "^4.0.0"
+rstr = "^3.2.2"
+pytest = "^8.4.2"
+pytest-xdist = "^3.8.0"
+
+[tool.poetry.group.ci.dependencies]
+defusedxml = "^0.7.1"

[tool.poetry.scripts]
sherlock = 'sherlock_project.sherlock:main'

View File

@ -1,4 +1,7 @@
[pytest]
-addopts = --strict-markers
+addopts = --strict-markers -m "not validate_targets"
markers =
    online: mark tests are requiring internet access.
+    validate_targets: mark tests for sweeping manifest validation (sends many requests).
+    validate_targets_fp: validate_targets, false positive tests only.
+    validate_targets_fn: validate_targets, false negative tests only.
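With the default `addopts` now deselecting `validate_targets`, the sweeping checks have to be requested explicitly. A hedged sketch of local invocations through pytest's Python API (worker counts are arbitrary; an explicit `-m` on the command line overrides the one from `addopts`, which is also how the workflows invoke these suites from the CLI):

```python
import pytest

# False-positive sweep only (pytest-xdist provides -n)
pytest.main(["-m", "validate_targets_fp", "-n", "4", "-q"])

# False-negative sweep only
pytest.main(["-m", "validate_targets_fn", "-q"])

# Plain run: addopts adds -m "not validate_targets", so the sweeps stay deselected
pytest.main(["-q"])
```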

View File

@ -5,11 +5,26 @@ networks.
"""

+from importlib.metadata import version as pkg_version, PackageNotFoundError
+import pathlib
+import tomli
+
+def get_version() -> str:
+    """Fetch the version number of the installed package."""
+    try:
+        return pkg_version("sherlock_project")
+    except PackageNotFoundError:
+        pyproject_path: pathlib.Path = pathlib.Path(__file__).resolve().parent.parent / "pyproject.toml"
+        with pyproject_path.open("rb") as f:
+            pyproject_data = tomli.load(f)
+        return pyproject_data["tool"]["poetry"]["version"]
+
# This variable is only used to check for ImportErrors induced by users running as script rather than as module or package
import_error_test_var = None

__shortname__ = "Sherlock"
__longname__ = "Sherlock: Find Usernames Across Social Networks"
-__version__ = "0.15.0"
+__version__ = get_version()

forge_api_latest_release = "https://api.github.com/repos/sherlock-project/sherlock/releases/latest"

View File

@ -258,6 +258,12 @@
    "urlMain": "https://www.blipfoto.com/",
    "username_claimed": "blue"
  },
+  "Blitz Tactics": {
+    "errorType": "status_code",
+    "url": "https://blitztactics.com/{}",
+    "urlMain": "https://blitztactics.com/",
+    "username_claimed": "Lance5500"
+  },
  "Blogger": {
    "errorType": "status_code",
    "regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$",
@ -265,6 +271,13 @@
    "urlMain": "https://www.blogger.com/",
    "username_claimed": "blue"
  },
+  "Bluesky": {
+    "errorType": "status_code",
+    "url": "https://bsky.app/profile/{}.bsky.social",
+    "urlProbe": "https://public.api.bsky.app/xrpc/app.bsky.actor.getProfile?actor={}.bsky.social",
+    "urlMain": "https://bsky.app/",
+    "username_claimed": "mcuban"
+  },
  "BoardGameGeek": {
    "errorType": "message",
    "regexCheck": "^[a-zA-Z0-9_]*$",
@ -365,6 +378,12 @@
    "urlMain": "https://career.habr.com/",
    "username_claimed": "blue"
  },
+  "CashApp": {
+    "errorType": "status_code",
+    "url": "https://cash.app/${}",
+    "urlMain": "https://cash.app",
+    "username_claimed": "hotdiggitydog"
+  },
  "Championat": {
    "errorType": "status_code",
    "url": "https://www.championat.com/user/{}",
@ -603,7 +622,7 @@
    "urlMain": "https://forums.digitalspy.com/",
    "username_claimed": "blue",
    "regexCheck": "^\\w{3,20}$"
  },
  "Discogs": {
    "errorType": "status_code",
    "url": "https://www.discogs.com/user/{}",
@ -789,13 +808,12 @@
    "urlMain": "https://fosstodon.org/",
    "username_claimed": "blue"
  },
-  "Freelance.habr": {
-    "errorMsg": "<div class=\"icon_user_locked\"></div>",
-    "errorType": "message",
-    "regexCheck": "^((?!\\.).)*$",
-    "url": "https://freelance.habr.com/freelancers/{}",
-    "urlMain": "https://freelance.habr.com/",
-    "username_claimed": "adam"
+  "Framapiaf": {
+    "errorType": "status_code",
+    "regexCheck": "^[a-zA-Z0-9_]{1,30}$",
+    "url": "https://framapiaf.org/@{}",
+    "urlMain": "https://framapiaf.org",
+    "username_claimed": "pylapp"
  },
  "Freelancer": {
    "errorMsg": "\"users\":{}",
@ -1124,6 +1142,13 @@
    "urlProbe": "https://imginn.com/{}",
    "username_claimed": "instagram"
  },
+  "Instapaper": {
+    "errorType": "status_code",
+    "request_method": "GET",
+    "url": "https://www.instapaper.com/p/{}",
+    "urlMain": "https://www.instapaper.com/",
+    "username_claimed": "john"
+  },
  "Instructables": {
    "errorType": "status_code",
    "url": "https://www.instructables.com/member/{}",
@ -1236,6 +1261,13 @@
    "urlMain": "https://linux.org.ru/",
    "username_claimed": "red"
  },
+  "Laracast": {
+    "errorType": "status_code",
+    "url": "https://laracasts.com/@{}",
+    "urlMain": "https://laracasts.com/",
+    "regexCheck": "^[a-zA-Z0-9_-]{3,}$",
+    "username_claimed": "user1"
+  },
  "Launchpad": {
    "errorType": "status_code",
    "url": "https://launchpad.net/~{}",
@ -1279,7 +1311,6 @@
  },
  "LinkedIn": {
-    "errorType": "status_code",
    "regexCheck": "^[a-zA-Z0-9]{3,100}$",
    "request_method": "GET",
    "url": "https://linkedin.com/in/{}",
@ -1294,6 +1325,12 @@
    "urlMain": "https://linktr.ee/",
    "username_claimed": "anne"
  },
+  "LinuxFR.org": {
+    "errorType": "status_code",
+    "url": "https://linuxfr.org/users/{}",
+    "urlMain": "https://linuxfr.org/",
+    "username_claimed": "pylapp"
+  },
  "Listed": {
    "errorType": "response_url",
    "errorUrl": "https://listed.to/@{}",
@ -1334,6 +1371,13 @@
    "urlMain": "https://forums.mmorpg.com/",
    "username_claimed": "goku"
  },
+  "Mamot": {
+    "errorType": "status_code",
+    "regexCheck": "^[a-zA-Z0-9_]{1,30}$",
+    "url": "https://mamot.fr/@{}",
+    "urlMain": "https://mamot.fr/",
+    "username_claimed": "anciensEnssat"
+  },
  "Medium": {
    "errorMsg": "<body",
    "errorType": "message",
@ -1349,8 +1393,8 @@
    "username_claimed": "blue"
  },
  "Minecraft": {
-    "errorCode": 204,
-    "errorType": "status_code",
+    "errorMsg": "Couldn't find any profile with name",
+    "errorType": "message",
    "url": "https://api.mojang.com/users/profiles/minecraft/{}",
    "urlMain": "https://minecraft.net/",
    "username_claimed": "blue"
@ -1495,6 +1539,13 @@
    "urlMain": "https://nyaa.si/",
    "username_claimed": "blue"
  },
+  "Open Collective": {
+    "errorMsg": "Oops! Page not found",
+    "errorType": "message",
+    "url": "https://opencollective.com/{}",
+    "urlMain": "https://opencollective.com/",
+    "username_claimed": "pylapp"
+  },
  "OpenStreetMap": {
    "errorType": "status_code",
    "regexCheck": "^[^.]*?$",
@ -1515,6 +1566,13 @@
    "urlMain": "https://ourdjtalk.com/",
    "username_claimed": "steve"
  },
+  "Outgress": {
+    "errorMsg": "Outgress - Error",
+    "errorType": "message",
+    "url": "https://outgress.com/agents/{}",
+    "urlMain": "https://outgress.com/",
+    "username_claimed": "pylapp"
+  },
  "PCGamer": {
    "errorMsg": "The specified member cannot be found. Please enter a member's entire name.",
    "errorType": "message",
@ -1576,12 +1634,31 @@
    "urlMain": "https://www.pinkbike.com/",
    "username_claimed": "blue"
  },
+  "pixelfed.social": {
+    "errorType": "status_code",
+    "url": "https://pixelfed.social/{}/",
+    "urlMain": "https://pixelfed.social",
+    "username_claimed": "pylapp"
+  },
  "PlayStore": {
    "errorType": "status_code",
    "url": "https://play.google.com/store/apps/developer?id={}",
    "urlMain": "https://play.google.com/store",
    "username_claimed": "Facebook"
  },
+  "Playstrategy": {
+    "errorType": "status_code",
+    "url": "https://playstrategy.org/@/{}",
+    "urlMain": "https://playstrategy.org",
+    "username_claimed": "oruro"
+  },
+  "Plurk": {
+    "errorMsg": "User Not Found!",
+    "errorType": "message",
+    "url": "https://www.plurk.com/{}",
+    "urlMain": "https://www.plurk.com/",
+    "username_claimed": "plurkoffice"
+  },
  "PocketStars": {
    "errorMsg": "Join Your Favorite Adult Stars",
    "errorType": "message",
@ -1629,6 +1706,20 @@
    "urlMain": "https://www.producthunt.com/",
    "username_claimed": "jenny"
  },
+  "programming.dev": {
+    "errorMsg": "Error!",
+    "errorType": "message",
+    "url": "https://programming.dev/u/{}",
+    "urlMain": "https://programming.dev",
+    "username_claimed": "pylapp"
+  },
+  "Pychess": {
+    "errorType": "message",
+    "errorMsg": "404",
+    "url": "https://www.pychess.org/@/{}",
+    "urlMain": "https://www.pychess.org",
+    "username_claimed": "gbtami"
+  },
  "PromoDJ": {
    "errorType": "status_code",
    "url": "http://promodj.com/{}",
@ -1880,6 +1971,12 @@
    "urlMain": "https://soylentnews.org",
    "username_claimed": "adam"
  },
+  "SpeakerDeck": {
+    "errorType": "status_code",
+    "url": "https://speakerdeck.com/{}",
+    "urlMain": "https://speakerdeck.com/",
+    "username_claimed": "pylapp"
+  },
  "Speedrun.com": {
    "errorType": "status_code",
    "url": "https://speedrun.com/users/{}",
@ -2029,14 +2126,6 @@
    "urlMain": "https://www.tnaflix.com/",
    "username_claimed": "hacker"
  },
-  "TorrentGalaxy": {
-    "errorMsg": "<title>TGx:Can't show details</title>",
-    "errorType": "message",
-    "regexCheck": "^[A-Za-z0-9]{3,15}$",
-    "url": "https://torrentgalaxy.to/profile/{}",
-    "urlMain": "https://torrentgalaxy.to/",
-    "username_claimed": "GalaxyRG"
-  },
  "TradingView": {
    "errorType": "status_code",
    "request_method": "GET",
@ -2719,12 +2808,24 @@
    "urlMain": "https://www.toster.ru/",
    "username_claimed": "adam"
  },
+  "tumblr": {
+    "errorType": "status_code",
+    "url": "https://{}.tumblr.com/",
+    "urlMain": "https://www.tumblr.com/",
+    "username_claimed": "goku"
+  },
  "uid": {
    "errorType": "status_code",
    "url": "http://uid.me/{}",
    "urlMain": "https://uid.me/",
    "username_claimed": "blue"
  },
+  "write.as": {
+    "errorType": "status_code",
+    "url": "https://write.as/{}",
+    "urlMain": "https://write.as",
+    "username_claimed": "pylapp"
+  },
  "xHamster": {
    "errorType": "status_code",
    "isNSFW": true,
@ -2745,5 +2846,13 @@
    "urlProbe": "https://public.api.bsky.app/xrpc/app.bsky.actor.getProfile?actor={}.bsky.social",
    "urlMain": "https://bsky.app/",
    "username_claimed": "mcuban"
-  }
+  },
+  "Platzi": {
+    "errorType": "status_code",
+    "errorCode": 404,
+    "url": "https://platzi.com/p/{}/",
+    "urlMain": "https://platzi.com/",
+    "username_claimed": "freddier",
+    "request_method": "GET"
+  }
}

View File

@ -169,14 +169,14 @@ def multiple_usernames(username):
def sherlock(
    username: str,
-    site_data: dict,
+    site_data: dict[str, dict[str, str]],
    query_notify: QueryNotify,
    tor: bool = False,
    unique_tor: bool = False,
    dump_response: bool = False,
    proxy: Optional[str] = None,
    timeout: int = 60,
-):
+) -> dict[str, dict[str, str | QueryResult]]:
    """Run Sherlock Analysis.

    Checks for existence of username on various social media sites.
@ -507,7 +507,7 @@ def sherlock(
                print("+++++++++++++++++++++")

            # Notify caller about results of query.
-            result = QueryResult(
+            result: QueryResult = QueryResult(
                username=username,
                site_name=social_network,
                site_url_user=url,
@ -727,6 +727,14 @@ def main():
        help="Disable creation of a txt file",
    )
    parser.add_argument(
+        "--ignore-exclusions",
+        action="store_true",
+        dest="ignore_exclusions",
+        default=False,
+        help="Ignore upstream exclusions (may return more false positives)",
+    )

    args = parser.parse_args()

    # If the user presses CTRL-C, exit gracefully without throwing errors
@ -784,7 +792,8 @@ def main():
    try:
        if args.local:
            sites = SitesInformation(
-                os.path.join(os.path.dirname(__file__), "resources/data.json")
+                os.path.join(os.path.dirname(__file__), "resources/data.json"),
+                honor_exclusions=False,
            )
        else:
            json_file_location = args.json_file
@ -804,7 +813,11 @@ def main():
            head_commit_sha = pull_request_json["head"]["sha"]
            json_file_location = f"https://raw.githubusercontent.com/sherlock-project/sherlock/{head_commit_sha}/sherlock_project/resources/data.json"

-        sites = SitesInformation(json_file_location)
+        sites = SitesInformation(
+            data_file_path=json_file_location,
+            honor_exclusions=not args.ignore_exclusions,
+            do_not_exclude=args.site_list,
+        )
    except Exception as error:
        print(f"ERROR: {error}")
        sys.exit(1)
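For context, a minimal sketch of driving the updated `sherlock()` signature from Python (network access required; the "GitHub" key assumes that site is present in the manifest):

```python
from sherlock_project.sherlock import sherlock
from sherlock_project.notify import QueryNotify
from sherlock_project.sites import SitesInformation

# honor_exclusions mirrors the new CLI behaviour (--ignore-exclusions flips it off)
sites = SitesInformation(honor_exclusions=True)
site_data = {site.name: site.information for site in sites}

results = sherlock("user123", site_data, QueryNotify())
print(results["GitHub"]["status"])  # QueryResult for the queried username on GitHub
```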

View File

@ -7,6 +7,10 @@ import json
import requests
import secrets

+MANIFEST_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json"
+EXCLUSIONS_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/refs/heads/exclusions/false_positive_exclusions.txt"
+
class SiteInformation:
    def __init__(self, name, url_home, url_username_format, username_claimed,
                 information, is_nsfw, username_unclaimed=secrets.token_urlsafe(10)):
@ -67,12 +71,17 @@ class SiteInformation:
        Return Value:
        Nicely formatted string to get information about this object.
        """

        return f"{self.name} ({self.url_home})"


class SitesInformation:
-    def __init__(self, data_file_path=None):
+    def __init__(
+        self,
+        data_file_path: str|None = None,
+        honor_exclusions: bool = True,
+        do_not_exclude: list[str] = [],
+    ):
        """Create Sites Information Object.

        Contains information about all supported websites.
@ -110,7 +119,7 @@ class SitesInformation:
            # The default data file is the live data.json which is in the GitHub repo. The reason why we are using
            # this instead of the local one is so that the user has the most up-to-date data. This prevents
            # users from creating issue about false positives which has already been fixed or having outdated data
-            data_file_path = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json"
+            data_file_path = MANIFEST_URL

        # Ensure that specified data file has correct extension.
        if not data_file_path.lower().endswith(".json"):
@ -152,9 +161,31 @@ class SitesInformation:
            raise FileNotFoundError(f"Problem while attempting to access "
                                    f"data file '{data_file_path}'."
                                    )

        site_data.pop('$schema', None)

+        if honor_exclusions:
+            try:
+                response = requests.get(url=EXCLUSIONS_URL)
+                if response.status_code == 200:
+                    exclusions = response.text.splitlines()
+                    exclusions = [exclusion.strip() for exclusion in exclusions]
+                    for site in do_not_exclude:
+                        if site in exclusions:
+                            exclusions.remove(site)
+                    for exclusion in exclusions:
+                        try:
+                            site_data.pop(exclusion, None)
+                        except KeyError:
+                            pass
+            except Exception:
+                # If there was any problem loading the exclusions, just continue without them
+                print("Warning: Could not load exclusions, continuing without them.")
+                honor_exclusions = False
+
        self.sites = {}

        # Add all site information from the json file to internal site list.
@ -194,7 +225,7 @@
        for site in self.sites:
            if self.sites[site].is_nsfw and site.casefold() not in do_not_remove:
                continue
            sites[site] = self.sites[site]
        self.sites = sites

    def site_name_list(self):
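A short sketch of the new constructor options (the path and site name below are illustrative): exclusions are honored by default and fetched from the `exclusions` branch, while `do_not_exclude` keeps explicitly requested sites even if they appear in that list.

```python
import os
from sherlock_project.sites import SitesInformation

sites = SitesInformation(
    data_file_path=os.path.join("sherlock_project", "resources", "data.json"),  # local manifest; omit to pull the live one
    honor_exclusions=True,            # drop sites listed on the exclusions branch
    do_not_exclude=["ExampleSite"],   # hypothetical site to keep regardless of the exclusions list
)
print(len(sites.site_name_list()))    # number of sites left after exclusions are applied
```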

View File

@ -4,6 +4,11 @@ import urllib
import pytest

from sherlock_project.sites import SitesInformation

+def fetch_local_manifest(honor_exclusions: bool = True) -> dict[str, dict[str, str]]:
+    sites_obj = SitesInformation(data_file_path=os.path.join(os.path.dirname(__file__), "../sherlock_project/resources/data.json"), honor_exclusions=honor_exclusions)
+    sites_iterable: dict[str, dict[str, str]] = {site.name: site.information for site in sites_obj}
+    return sites_iterable
+
@pytest.fixture()
def sites_obj():
    sites_obj = SitesInformation(data_file_path=os.path.join(os.path.dirname(__file__), "../sherlock_project/resources/data.json"))
@ -11,9 +16,7 @@ def sites_obj():
@pytest.fixture(scope="session")
def sites_info():
-    sites_obj = SitesInformation(data_file_path=os.path.join(os.path.dirname(__file__), "../sherlock_project/resources/data.json"))
-    sites_iterable = {site.name: site.information for site in sites_obj}
-    yield sites_iterable
+    yield fetch_local_manifest()

@pytest.fixture(scope="session")
def remote_schema():
@ -21,3 +24,28 @@ def remote_schema():
    with urllib.request.urlopen(schema_url) as remoteschema:
        schemadat = json.load(remoteschema)
    yield schemadat

+def pytest_addoption(parser):
+    parser.addoption(
+        "--chunked-sites",
+        action="store",
+        default=None,
+        help="For tests utilizing chunked sites, include only the (comma-separated) site(s) specified.",
+    )
+
+def pytest_generate_tests(metafunc):
+    if "chunked_sites" in metafunc.fixturenames:
+        sites_info = fetch_local_manifest(honor_exclusions=False)
+
+        # Ingest and apply site selections
+        site_filter: str | None = metafunc.config.getoption("--chunked-sites")
+        if site_filter:
+            selected_sites: list[str] = [site.strip() for site in site_filter.split(",")]
+            sites_info = {
+                site: data for site, data in sites_info.items()
+                if site in selected_sites
+            }
+
+        params = [{name: data} for name, data in sites_info.items()]
+        ids = list(sites_info.keys())
+        metafunc.parametrize("chunked_sites", params, ids=ids)

View File

@ -7,7 +7,7 @@ def test_validate_manifest_against_local_schema():
    """Ensures that the manifest matches the local schema, for situations where the schema is being changed."""
    json_relative: str = '../sherlock_project/resources/data.json'
    schema_relative: str = '../sherlock_project/resources/data.schema.json'

    json_path: str = os.path.join(os.path.dirname(__file__), json_relative)
    schema_path: str = os.path.join(os.path.dirname(__file__), schema_relative)

View File

@ -0,0 +1,99 @@
import pytest
import re
import rstr

from sherlock_project.sherlock import sherlock
from sherlock_project.notify import QueryNotify
from sherlock_project.result import QueryResult, QueryStatus

FALSE_POSITIVE_ATTEMPTS: int = 2  # Since the usernames are randomly generated, it's POSSIBLE that a real username can be hit
FALSE_POSITIVE_QUANTIFIER_UPPER_BOUND: int = 15  # If a pattern uses quantifiers such as `+` `*` or `{n,}`, limit the upper bound (0 to disable)
FALSE_POSITIVE_DEFAULT_PATTERN: str = r'^[a-zA-Z0-9]{7,20}$'  # Used in absence of a regexCheck entry


def set_pattern_upper_bound(pattern: str, upper_bound: int = FALSE_POSITIVE_QUANTIFIER_UPPER_BOUND) -> str:
    """Set upper bound for regex patterns that use quantifiers such as `+` `*` or `{n,}`."""
    def replace_upper_bound(match: re.Match) -> str:  # type: ignore
        lower_bound: int = int(match.group(1)) if match.group(1) else 0  # type: ignore
        # Use a separate name so the enclosing upper_bound is not shadowed (avoids an unbound local)
        bound: int = upper_bound if lower_bound < upper_bound else lower_bound
        return f'{{{lower_bound},{bound}}}'

    pattern = re.sub(r'(?<!\\)\{(\d+),\}', replace_upper_bound, pattern)  # {n,}
    pattern = re.sub(r'(?<!\\)\+', f'{{1,{upper_bound}}}', pattern)  # +
    pattern = re.sub(r'(?<!\\)\*', f'{{0,{upper_bound}}}', pattern)  # *
    return pattern


def false_positive_check(sites_info: dict[str, dict[str, str]], site: str, pattern: str) -> QueryStatus:
    """Check if a site is likely to produce false positives."""
    status: QueryStatus = QueryStatus.UNKNOWN
    for _ in range(FALSE_POSITIVE_ATTEMPTS):
        query_notify: QueryNotify = QueryNotify()
        username: str = rstr.xeger(pattern)
        result: QueryResult | str = sherlock(
            username=username,
            site_data=sites_info,
            query_notify=query_notify,
        )[site]['status']
        if not hasattr(result, 'status'):
            raise TypeError(f"Result for site {site} does not have 'status' attribute. Actual result: {result}")
        if type(result.status) is not QueryStatus:  # type: ignore
            raise TypeError(f"Result status for site {site} is not of type QueryStatus. Actual type: {type(result.status)}")  # type: ignore
        status = result.status  # type: ignore
        if status in (QueryStatus.AVAILABLE, QueryStatus.WAF):
            return status
    return status


def false_negative_check(sites_info: dict[str, dict[str, str]], site: str) -> QueryStatus:
    """Check if a site is likely to produce false negatives."""
    status: QueryStatus = QueryStatus.UNKNOWN
    query_notify: QueryNotify = QueryNotify()
    result: QueryResult | str = sherlock(
        username=sites_info[site]['username_claimed'],
        site_data=sites_info,
        query_notify=query_notify,
    )[site]['status']
    if not hasattr(result, 'status'):
        raise TypeError(f"Result for site {site} does not have 'status' attribute. Actual result: {result}")
    if type(result.status) is not QueryStatus:  # type: ignore
        raise TypeError(f"Result status for site {site} is not of type QueryStatus. Actual type: {type(result.status)}")  # type: ignore
    status = result.status  # type: ignore
    return status


@pytest.mark.validate_targets
@pytest.mark.online
class Test_All_Targets:
    @pytest.mark.validate_targets_fp
    def test_false_pos(self, chunked_sites: dict[str, dict[str, str]]):
        """Iterate through all sites in the manifest to discover possible false-positive inducing targets."""
        pattern: str
        for site in chunked_sites:
            try:
                pattern = chunked_sites[site]['regexCheck']
            except KeyError:
                pattern = FALSE_POSITIVE_DEFAULT_PATTERN
            if FALSE_POSITIVE_QUANTIFIER_UPPER_BOUND > 0:
                pattern = set_pattern_upper_bound(pattern)
            result: QueryStatus = false_positive_check(chunked_sites, site, pattern)
            assert result is QueryStatus.AVAILABLE, f"{site} produced false positive with pattern {pattern}, result was {result}"

    @pytest.mark.validate_targets_fn
    def test_false_neg(self, chunked_sites: dict[str, dict[str, str]]):
        """Iterate through all sites in the manifest to discover possible false-negative inducing targets."""
        for site in chunked_sites:
            result: QueryStatus = false_negative_check(chunked_sites, site)
            assert result is QueryStatus.CLAIMED, f"{site} produced false negative, result was {result}"
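As background on the probing strategy: candidate usernames are generated from each site's `regexCheck` (or the default pattern) with `rstr`, so a "claimed" result for a random string is treated as a false positive. A tiny illustration of that generation step (output varies per run):

```python
import re
import rstr

# Default pattern used when a site has no regexCheck entry
pattern = r'^[a-zA-Z0-9]{7,20}$'

username = rstr.xeger(pattern)   # e.g. 'Qm3xT0pZr'
assert re.fullmatch(r'[a-zA-Z0-9]{7,20}', username)
print(username)
```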

View File

@ -7,8 +7,6 @@ envlist =
    py312
    py311
    py310
-    py39
-    py38

[testenv]
description = Attempt to build and install the package
@ -16,6 +14,7 @@ deps =
    coverage
    jsonschema
    pytest
+    rstr
allowlist_externals = coverage
commands =
    coverage run --source=sherlock_project --module pytest -v
@ -37,7 +36,7 @@
[gh-actions]
python =
+    3.13: py313
    3.12: py312
    3.11: py311
    3.10: py310
-    3.9: py39