chore: add error message to the codesandbox

This commit is contained in:
manjushsh 2025-10-05 15:22:37 +05:30
commit 738df6c362
19 changed files with 941 additions and 289 deletions

2
.github/CODEOWNERS vendored
View File

@ -1,5 +1,5 @@
### REPOSITORY
/.github/CODEOWNERS @sdushantha
/.github/CODEOWNERS @sdushantha @ppfeister
/.github/FUNDING.yml @sdushantha
/LICENSE @sdushantha

89
.github/workflows/exclusions.yml vendored Normal file
View File

@ -0,0 +1,89 @@
name: Exclusions Updater

# Runs the false positive test sweep on a schedule (or on demand) and publishes
# the resulting list of targets to false_positive_exclusions.txt on the
# `exclusions` branch.
on:
  schedule:
    # - cron: '0 5 * * 0'  # Runs at 05:00 every Sunday
    - cron: '0 5 * * *'  # Runs at 05:00 every day
  workflow_dispatch:

jobs:
  update-exclusions:
    runs-on: ubuntu-latest
    # This job pushes to the `exclusions` branch, so the token needs write
    # access to repository contents even where the default token is read-only.
    permissions:
      contents: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v5

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.13'

      - name: Install Poetry
        uses: abatilo/actions-poetry@v4
        with:
          poetry-version: 'latest'

      - name: Install dependencies
        run: |
          poetry install --no-interaction --with dev

      - name: Run false positive tests
        # Test failures are the data being collected, so pytest's exit status
        # must not fail the step: under the default `bash -e` shell (no
        # pipefail) the pipeline reports tee's status.
        # `poetry run` replaces manual venv activation and matches the other
        # workflows in this repository.
        run: |
          poetry run pytest -q --tb no -m validate_targets_fp -n 20 | tee fp_test_results.txt

      - name: Parse false positive detections by desired categories
        # Pulls the parametrized id out of lines mentioning
        # `test_false_pos[<id>] ... result was Claimed` / `... result was WAF`.
        run: |
          grep -oP '(?<=test_false_pos\[)[^\]]+(?=\].*result was Claimed)' fp_test_results.txt \
            | sort -u > false_positive_exclusions.txt
          grep -oP '(?<=test_false_pos\[)[^\]]+(?=\].*result was WAF)' fp_test_results.txt \
            | sort -u > waf_hits.txt

      - name: Detect if exclusions list changed
        id: detect_changes
        run: |
          git fetch origin exclusions || true
          if git show origin/exclusions:false_positive_exclusions.txt >/dev/null 2>&1; then
            # The exclusions branch and file exist: compare the published
            # content byte-for-byte with the freshly generated list. The
            # generated file is git-ignored and untracked on this checkout, so
            # a direct content comparison is used rather than `git diff`,
            # whose result depends on the state of the index.
            if git show origin/exclusions:false_positive_exclusions.txt \
              | cmp -s - false_positive_exclusions.txt; then
              echo "exclusions_changed=false" >> "$GITHUB_OUTPUT"
            else
              echo "exclusions_changed=true" >> "$GITHUB_OUTPUT"
            fi
          else
            # If the exclusions branch or file do not exist, treat as changed
            echo "exclusions_changed=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Quantify and display results
        run: |
          FP_COUNT=$(wc -l < false_positive_exclusions.txt | xargs)
          WAF_COUNT=$(wc -l < waf_hits.txt | xargs)
          echo ">>> Found $FP_COUNT false positives and $WAF_COUNT WAF hits."
          echo ">>> False positive exclusions:" && cat false_positive_exclusions.txt
          echo ">>> WAF hits:" && cat waf_hits.txt

      - name: Commit and push exclusions list
        if: steps.detect_changes.outputs.exclusions_changed == 'true'
        # The generated list is carried across the branch switch through a
        # stash under a temporary name, then restored to its real name on the
        # `exclusions` branch before committing.
        run: |
          git config user.name "Paul Pfeister (automation)"
          git config user.email "code@pfeister.dev"

          mv false_positive_exclusions.txt false_positive_exclusions.txt.tmp

          git add -f false_positive_exclusions.txt.tmp  # -f required to override .gitignore
          git stash push -m "stash false positive exclusion list" -- false_positive_exclusions.txt.tmp

          git fetch origin exclusions || true  # Allows creation of branch if deleted
          git checkout -B exclusions origin/exclusions || (git checkout --orphan exclusions && git rm -rf .)

          git stash pop || true

          mv false_positive_exclusions.txt.tmp false_positive_exclusions.txt

          git rm -f false_positive_exclusions.txt.tmp || true
          git add false_positive_exclusions.txt
          git commit -m "auto: update exclusions list" || echo "No changes to commit"
          git push origin exclusions

View File

@ -11,6 +11,7 @@ on:
- '**/*.py'
- '**/*.ini'
- '**/*.toml'
- 'Dockerfile'
push:
branches:
- master
@ -21,11 +22,13 @@ on:
- '**/*.py'
- '**/*.ini'
- '**/*.toml'
- 'Dockerfile'
jobs:
tox-lint:
# Linting is ran through tox to ensure that the same linter is used by local runners
runs-on: ubuntu-latest
# Linting is run through tox to ensure that the same linter
# is used by local runners
steps:
- uses: actions/checkout@v4
- name: Set up linting environment
@ -41,7 +44,8 @@ jobs:
tox-matrix:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false # We want to know what specicic versions it fails on
# We want to know what specific versions it fails on
fail-fast: false
matrix:
os: [
ubuntu-latest,
@ -49,10 +53,10 @@ jobs:
macos-latest,
]
python-version: [
'3.9',
'3.10',
'3.11',
'3.12',
'3.13',
]
steps:
- uses: actions/checkout@v4
@ -67,3 +71,22 @@ jobs:
pip install tox-gh-actions
- name: Run tox
run: tox
docker-build-test:
  # Builds the Docker image from the checked-out tree and verifies that the
  # resulting container starts and can report its version.
  runs-on: ubuntu-latest
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3

    - name: Get version from pyproject.toml
      id: get-version
      # Anchored to the start of the line so that keys which merely end in
      # "version = " (e.g. `python-version = ...`) cannot be picked up first.
      run: |
        VERSION=$(grep -m1 '^version = ' pyproject.toml | cut -d'"' -f2)
        echo "version=$VERSION" >> "$GITHUB_OUTPUT"

    - name: Build Docker image
      # The step output is handed over through the environment rather than
      # being spliced into the script text, so its content is never parsed as
      # shell syntax.
      env:
        VERSION_TAG: ${{ steps.get-version.outputs.version }}
      run: |
        docker build \
          --build-arg VERSION_TAG="$VERSION_TAG" \
          -t sherlock-test:latest .

    - name: Test Docker image runs
      run: docker run --rm sherlock-test:latest --version

View File

@ -0,0 +1,100 @@
name: Modified Target Validation

# SECURITY: `pull_request_target` runs in the context of the base repository
# with a token that can write to pull requests. Only the base branch's code is
# checked out and executed; the pull request contributes nothing but the
# contents of data.json, and every value derived from that file (site names in
# particular) is untrusted. Such values must reach shell scripts through `env`
# and never through `${{ }}` interpolation inside `run:`.
on:
  pull_request_target:
    branches:
      - master
    paths:
      - "sherlock_project/resources/data.json"

jobs:
  validate-modified-targets:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v5
        with:
          # Base branch only: the test code that runs below is the trusted copy
          ref: ${{ github.base_ref }}
          fetch-depth: 1

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.13'

      - name: Install Poetry
        uses: abatilo/actions-poetry@v4
        with:
          poetry-version: 'latest'

      - name: Install dependencies
        run: |
          poetry install --no-interaction --with dev

      - name: Drop in place updated manifest from base
        # Keeps a copy of the base manifest, then overwrites the manifest in
        # the working tree with the pull request's version of that single file.
        env:
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          cp sherlock_project/resources/data.json data.json.base
          git fetch origin "pull/${PR_NUMBER}/head:pr" --depth=1
          git show pr:sherlock_project/resources/data.json > sherlock_project/resources/data.json
          cp sherlock_project/resources/data.json data.json.head

      - name: Discover modified targets
        id: discover-modified
        # Emits a comma-separated, sorted list of manifest keys that are new in
        # the pull request or whose entry differs from the base manifest.
        run: |
          CHANGED=$(
            python - <<'EOF'
          import json

          with open("data.json.base") as f: base = json.load(f)
          with open("data.json.head") as f: head = json.load(f)

          changed = []
          for k, v in head.items():
              if k not in base or base[k] != v:
                  changed.append(k)

          print(",".join(sorted(changed)))
          EOF
          )

          # Preserve changelist
          # (quoted so that untrusted names are neither glob-expanded nor word-split)
          echo ">>> Changed targets:"
          printf '%s\n' "$CHANGED" | tr ',' '\n'
          echo "changed_targets=$CHANGED" >> "$GITHUB_OUTPUT"

      - name: Validate modified targets
        if: steps.discover-modified.outputs.changed_targets != ''
        continue-on-error: true
        # changed_targets is built from PR-controlled site names; passing it
        # through the environment keeps it from being parsed as shell syntax.
        env:
          CHANGED_TARGETS: ${{ steps.discover-modified.outputs.changed_targets }}
        run: |
          poetry run pytest -q --tb no -rA -m validate_targets -n 20 \
            --chunked-sites "$CHANGED_TARGETS" \
            --junitxml=validation_results.xml

      - name: Prepare validation summary
        if: steps.discover-modified.outputs.changed_targets != ''
        id: prepare-summary
        # If the summarizer exits non-zero, the fallback message is appended to
        # whatever it printed and becomes part of the comment body.
        run: |
          summary=$(
            poetry run python devel/summarize_site_validation.py validation_results.xml || echo "Failed to generate summary of test results"
          )
          echo "$summary" > validation_summary.md

      - name: Announce validation results
        if: steps.discover-modified.outputs.changed_targets != ''
        uses: actions/github-script@v8
        with:
          script: |
            const fs = require('fs');
            const body = fs.readFileSync('validation_summary.md', 'utf8');
            await github.rest.issues.createComment({
              issue_number: context.payload.pull_request.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: body,
            });

      - name: This step shows as ran when no modifications are found
        if: steps.discover-modified.outputs.changed_targets == ''
        run: |
          echo "No modified targets found"

View File

@ -2,9 +2,9 @@
# 1. Update the version tag in the Dockerfile to match the version in sherlock/__init__.py
# 2. Update the VCS_REF tag to match the tagged version's FULL commit hash
# 3. Build image with BOTH latest and version tags
# i.e. `docker build -t sherlock/sherlock:0.15.0 -t sherlock/sherlock:latest .`
# i.e. `docker build -t sherlock/sherlock:0.16.0 -t sherlock/sherlock:latest .`
FROM python:3.12-slim-bullseye as build
FROM python:3.12-slim-bullseye AS build
WORKDIR /sherlock
RUN pip3 install --no-cache-dir --upgrade pip

View File

@ -0,0 +1,72 @@
#!/usr/bin/env python
# This module summarizes the results of site validation tests queued by
# workflow validate_modified_targets for presentation in Issue comments.
from defusedxml import ElementTree as ET
import sys
from pathlib import Path
def summarize_junit_xml(xml_path: Path) -> str:
    """Render a JUnit XML report of site validation tests as a Markdown summary.

    The report is expected to contain test cases named ``test_false_pos[<site>]``
    and ``test_false_neg[<site>]``. Each site gets one table row with a pass/fail
    cell per check; a check with no matching test case is shown as ``Error!``.

    Args:
        xml_path: Path to the JUnit XML file to summarize.

    Returns:
        Markdown text: a heading, a results table, and, where applicable,
        trailing notices about failures and errors.

    Raises:
        ValueError: If the document contains no ``testsuite`` element.
    """
    pass_message: str = ":heavy_check_mark: &nbsp; Pass"
    fail_message: str = ":x: &nbsp; Fail"
    # Maps a test function name to the table column it reports into.
    column_by_test: dict[str, str] = {
        "test_false_pos": "F+ Check",
        "test_false_neg": "F- Check",
    }

    root = ET.parse(xml_path).getroot()
    # Accept both a <testsuites> wrapper and a bare <testsuite> root element.
    suite = root if root.tag == "testsuite" else root.find("testsuite")
    if suite is None:
        raise ValueError("Invalid JUnit XML: No testsuite found")

    summary_lines: list[str] = [
        "#### Automatic validation of changes\n",
        "| Target | F+ Check | F- Check |",
        "|---|---|---|",
    ]

    failures = int(suite.get("failures", 0))
    errors_detected: bool = False
    results: dict[str, dict[str, str]] = {}

    for testcase in suite.findall("testcase"):
        has_failure: bool = testcase.find("failure") is not None
        has_error: bool = testcase.find("error") is not None
        if has_error:
            errors_detected = True

        # Split on the FIRST "[" only and drop exactly one closing "]", so site
        # names that themselves contain brackets are kept intact.
        test_name, bracket, remainder = (testcase.get("name") or "").partition("[")
        if not bracket:
            # Not a parametrized per-site test: there is no row to attribute it
            # to, but any error it raised has already been recorded above.
            continue
        site_name = remainder[:-1] if remainder.endswith("]") else remainder

        site_results = results.setdefault(site_name, {})
        column = column_by_test.get(test_name)
        if column is not None:
            site_results[column] = fail_message if (has_failure or has_error) else pass_message

    for site_name, checks in results.items():
        # A literal "|" would otherwise be read as a column separator.
        target_cell = site_name.replace("|", "\\|")
        summary_lines.append(
            f"| {target_cell} | {checks.get('F+ Check', 'Error!')} | {checks.get('F- Check', 'Error!')} |"
        )

    if failures > 0:
        summary_lines.append("\n___\n" +
            "\nFailures were detected on at least one updated target. Commits containing accuracy failures" +
            " will often not be merged (unless a rationale is provided, such as false negatives due to regional differences).")

    if errors_detected:
        summary_lines.append("\n___\n" +
            "\n**Errors were detected during validation. Please review the workflow logs.**")

    return "\n".join(summary_lines)
if __name__ == "__main__":
    # Command-line entry point: takes exactly one argument, the path to a JUnit
    # XML report, and prints the Markdown summary to standard output.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: summarize_site_validation.py <junit-xml-file>")
        sys.exit(1)

    xml_path: Path = Path(cli_args[0])
    if xml_path.is_file():
        summary: str = summarize_junit_xml(xml_path)
        print(summary)
    else:
        print(f"Error: File '{xml_path}' does not exist.")
        sys.exit(1)

View File

@ -1,6 +1,6 @@
<p align=center>
<p align="center">
<br>
<a href="https://sherlock-project.github.io/" target="_blank"><img src="images/sherlock-logo.png"/></a>
<a href="https://sherlock-project.github.io/" target="_blank"><img src="images/sherlock-logo.png" alt="sherlock"/></a>
<br>
<span>Hunt down social media accounts by username across <a href="https://sherlockproject.xyz/sites">400+ social networks</a></span>
<br>
@ -15,8 +15,7 @@
</p>
<p align="center">
<img width="70%" height="70%" src="images/demo.png"/>
</a>
<img width="70%" height="70%" src="images/demo.png" alt="demo"/>
</p>
@ -115,14 +114,14 @@ $ echo '{"usernames":["user123"]}' | apify call -so netmilk/sherlock
}]
```
Read more about the [Sherlock Actor](../.actor/README.md), including how to use it programmaticaly via the Apify [API](https://apify.com/netmilk/sherlock/api?fpr=sherlock), [CLI](https://docs.apify.com/cli/?fpr=sherlock) and [JS/TS and Python SDKs](https://docs.apify.com/sdk?fpr=sherlock).
Read more about the [Sherlock Actor](../.actor/README.md), including how to use it programmatically via the Apify [API](https://apify.com/netmilk/sherlock/api?fpr=sherlock), [CLI](https://docs.apify.com/cli/?fpr=sherlock) and [JS/TS and Python SDKs](https://docs.apify.com/sdk?fpr=sherlock).
## Credits
Thank you to everyone who has contributed to Sherlock! ❤️
<a href="https://github.com/sherlock-project/sherlock/graphs/contributors">
<img src="https://contrib.rocks/image?&columns=25&max=10000&&repo=sherlock-project/sherlock" noZoom />
<img src="https://contrib.rocks/image?&columns=25&max=10000&&repo=sherlock-project/sherlock" alt="contributors"/>
</a>
## Star history

View File

@ -1982,3 +1982,16 @@ __2025-02-16 :__ Unsure if any way to view profiles exists now
"username_claimed": "t3dotgg"
}
```
## TorrentGalaxy
__2025-07-06 :__ Site appears to have gone offline in March and hasn't come back
```json
"TorrentGalaxy": {
"errorMsg": "<title>TGx:Can't show details</title>",
"errorType": "message",
"regexCheck": "^[A-Za-z0-9]{3,15}$",
"url": "https://torrentgalaxy.to/profile/{}",
"urlMain": "https://torrentgalaxy.to/",
"username_claimed": "GalaxyRG"
},
```

View File

@ -8,8 +8,7 @@ source = "init"
[tool.poetry]
name = "sherlock-project"
# single source of truth for version is __init__.py
version = "0"
version = "0.16.0"
description = "Hunt down social media accounts by username across social networks"
license = "MIT"
authors = [
@ -47,15 +46,19 @@ PySocks = "^1.7.0"
requests = "^2.22.0"
requests-futures = "^1.0.0"
stem = "^1.8.0"
torrequest = "^0.1.0"
pandas = "^2.2.1"
openpyxl = "^3.0.10"
[tool.poetry.extras]
tor = ["torrequest"]
tomli = "^2.2.1"
[tool.poetry.group.dev.dependencies]
jsonschema = "^4.0.0"
rstr = "^3.2.2"
pytest = "^8.4.2"
pytest-xdist = "^3.8.0"
[tool.poetry.group.ci.dependencies]
defusedxml = "^0.7.1"
[tool.poetry.scripts]
sherlock = 'sherlock_project.sherlock:main'

View File

@ -1,4 +1,7 @@
[pytest]
addopts = --strict-markers
addopts = --strict-markers -m "not validate_targets"
markers =
online: mark tests as requiring internet access.
validate_targets: mark tests for sweeping manifest validation (sends many requests).
validate_targets_fp: validate_targets, false positive tests only.
validate_targets_fn: validate_targets, false negative tests only.

View File

@ -5,11 +5,26 @@ networks.
"""
from importlib.metadata import version as pkg_version, PackageNotFoundError
import pathlib
import tomli
def get_version() -> str:
    """Return the Sherlock version string.

    The metadata of the installed ``sherlock_project`` distribution is used when
    available. If no such distribution is installed, the version is read from
    ``[tool.poetry] version`` in the ``pyproject.toml`` located one directory
    above the directory containing this module.
    """
    try:
        installed_version: str = pkg_version("sherlock_project")
    except PackageNotFoundError:
        # Not installed as a distribution: fall back to the project manifest.
        project_root: pathlib.Path = pathlib.Path(__file__).resolve().parents[1]
        with (project_root / "pyproject.toml").open("rb") as manifest:
            return tomli.load(manifest)["tool"]["poetry"]["version"]
    return installed_version
# This variable is only used to check for ImportErrors induced by users running as script rather than as module or package
import_error_test_var = None
__shortname__ = "Sherlock"
__longname__ = "Sherlock: Find Usernames Across Social Networks"
__version__ = "0.15.0"
__version__ = get_version()
forge_api_latest_release = "https://api.github.com/repos/sherlock-project/sherlock/releases/latest"

View File

@ -79,13 +79,13 @@
"username_claimed": "pink"
},
"AllMyLinks": {
"errorMsg": "Not Found",
"errorType": "message",
"regexCheck": "^[a-z0-9][a-z0-9-]{2,32}$",
"url": "https://allmylinks.com/{}",
"urlMain": "https://allmylinks.com/",
"username_claimed": "blue"
},
"errorMsg": "Page not found",
"errorType": "message",
"regexCheck": "^[a-z0-9][a-z0-9-]{2,32}$",
"url": "https://allmylinks.com/{}",
"urlMain": "https://allmylinks.com/",
"username_claimed": "blue"
},
"AniWorld": {
"errorMsg": "Dieses Profil ist nicht verf\u00fcgbar",
"errorType": "message",
@ -115,12 +115,20 @@
"username_claimed": "lio24d"
},
"Apple Discussions": {
"errorMsg": "The page you tried was not found. You may have used an outdated link or may have typed the address (URL) incorrectly.",
"errorMsg": "Looking for something in Apple Support Communities?",
"errorType": "message",
"url": "https://discussions.apple.com/profile/{}",
"urlMain": "https://discussions.apple.com",
"username_claimed": "jason"
},
"Aparat": {
"errorType": "status_code",
"request_method": "GET",
"url": "https://www.aparat.com/{}/",
"urlMain": "https://www.aparat.com/",
"urlProbe": "https://www.aparat.com/api/fa/v1/user/user/information/username/{}",
"username_claimed": "jadi"
},
"Archive of Our Own": {
"errorType": "status_code",
"regexCheck": "^[^.]*?$",
@ -250,6 +258,12 @@
"urlMain": "https://www.blipfoto.com/",
"username_claimed": "blue"
},
"Blitz Tactics": {
"errorType": "status_code",
"url": "https://blitztactics.com/{}",
"urlMain": "https://blitztactics.com/",
"username_claimed": "Lance5500"
},
"Blogger": {
"errorType": "status_code",
"regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$",
@ -257,13 +271,12 @@
"urlMain": "https://www.blogger.com/",
"username_claimed": "blue"
},
"BoardGameGeek": {
"errorType": "message",
"regexCheck": "^[a-zA-Z0-9_]*$",
"errorMsg": "User not found",
"url": "https://boardgamegeek.com/user/{}",
"urlMain": "https://boardgamegeek.com",
"username_claimed": "blue"
"Bluesky": {
"errorType": "status_code",
"url": "https://bsky.app/profile/{}.bsky.social",
"urlProbe": "https://public.api.bsky.app/xrpc/app.bsky.actor.getProfile?actor={}.bsky.social",
"urlMain": "https://bsky.app/",
"username_claimed": "mcuban"
},
"BongaCams": {
"errorType": "status_code",
@ -278,6 +291,14 @@
"urlMain": "https://www.bookcrossing.com/",
"username_claimed": "blue"
},
"BoardGameGeek": {
"errorMsg": "\"isValid\":true",
"errorType": "message",
"url": "https://boardgamegeek.com/user/{}",
"urlMain": "https://boardgamegeek.com/",
"urlProbe": "https://api.geekdo.com/api/accounts/validate/username?username={}",
"username_claimed": "blue"
},
"BraveCommunity": {
"errorType": "status_code",
"url": "https://community.brave.com/u/{}/",
@ -357,6 +378,12 @@
"urlMain": "https://career.habr.com/",
"username_claimed": "blue"
},
"CashApp": {
"errorType": "status_code",
"url": "https://cash.app/${}",
"urlMain": "https://cash.app",
"username_claimed": "hotdiggitydog"
},
"Championat": {
"errorType": "status_code",
"url": "https://www.championat.com/user/{}",
@ -479,7 +506,8 @@
"username_claimed": "hacker"
},
"Code Sandbox": {
"errorType": "status_code",
"errorType": "message",
"errorMsg": "Whoops, page not found",
"url": "https://codesandbox.io/u/{}",
"urlMain": "https://codesandbox.io",
"username_claimed": "icyjoseph"
@ -551,8 +579,7 @@
"username_claimed": "brown"
},
"CyberDefenders": {
"errorMsg": "<title>Blue Team Training for SOC analysts and DFIR - CyberDefenders</title>",
"errorType": "message",
"errorType": "status_code",
"regexCheck": "^[^\\/:*?\"<>|@]{3,50}$",
"request_method": "GET",
"url": "https://cyberdefenders.org/p/{}",
@ -579,6 +606,12 @@
"urlMain": "https://www.dailymotion.com/",
"username_claimed": "blue"
},
"dcinside": {
"errorType": "status_code",
"url": "https://gallog.dcinside.com/{}",
"urlMain": "https://www.dcinside.com/",
"username_claimed": "anrbrb"
},
"Dealabs": {
"errorMsg": "La page que vous essayez",
"errorType": "message",
@ -587,20 +620,21 @@
"urlMain": "https://www.dealabs.com/",
"username_claimed": "blue"
},
"DeviantART": {
"errorType": "status_code",
"regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$",
"url": "https://{}.deviantart.com",
"urlMain": "https://deviantart.com",
"username_claimed": "blue"
},
"DeviantArt": {
"errorType": "message",
"errorMsg": "Llama Not Found",
"regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$",
"url": "https://www.deviantart.com/{}",
"urlMain": "https://www.deviantart.com/",
"username_claimed": "blue"
},
"DigitalSpy": {
"errorMsg": "The page you were looking for could not be found.",
"errorType": "message",
"url": "https://forums.digitalspy.com/profile/{}",
"urlMain": "https://forums.digitalspy.com/",
"username_claimed": "blue",
"regexCheck": "^\\w{3,20}$"
"errorMsg": "The page you were looking for could not be found.",
"errorType": "message",
"url": "https://forums.digitalspy.com/profile/{}",
"urlMain": "https://forums.digitalspy.com/",
"username_claimed": "blue",
"regexCheck": "^\\w{3,20}$"
},
"Discogs": {
"errorType": "status_code",
@ -786,13 +820,12 @@
"urlMain": "https://fosstodon.org/",
"username_claimed": "blue"
},
"Freelance.habr": {
"errorMsg": "<div class=\"icon_user_locked\"></div>",
"errorType": "message",
"regexCheck": "^((?!\\.).)*$",
"url": "https://freelance.habr.com/freelancers/{}",
"urlMain": "https://freelance.habr.com/",
"username_claimed": "adam"
"Framapiaf": {
"errorType": "status_code",
"regexCheck": "^[a-zA-Z0-9_]{1,30}$",
"url": "https://framapiaf.org/@{}",
"urlMain": "https://framapiaf.org",
"username_claimed": "pylapp"
},
"Freelancer": {
"errorMsg": "\"users\":{}",
@ -1129,6 +1162,13 @@
"urlProbe": "https://imginn.com/{}",
"username_claimed": "instagram"
},
"Instapaper": {
"errorType": "status_code",
"request_method": "GET",
"url": "https://www.instapaper.com/p/{}",
"urlMain": "https://www.instapaper.com/",
"username_claimed": "john"
},
"Instructables": {
"errorType": "status_code",
"url": "https://www.instructables.com/member/{}",
@ -1241,6 +1281,13 @@
"urlMain": "https://linux.org.ru/",
"username_claimed": "red"
},
"Laracast": {
"errorType":"status_code",
"url": "https://laracasts.com/@{}",
"urlMain": "https://laracasts.com/",
"regexCheck": "^[a-zA-Z0-9_-]{3,}$",
"username_claimed": "user1"
},
"Launchpad": {
"errorType": "status_code",
"url": "https://launchpad.net/~{}",
@ -1298,6 +1345,12 @@
"urlMain": "https://linktr.ee/",
"username_claimed": "anne"
},
"LinuxFR.org": {
"errorType": "status_code",
"url": "https://linuxfr.org/users/{}",
"urlMain": "https://linuxfr.org/",
"username_claimed": "pylapp"
},
"Listed": {
"errorType": "response_url",
"errorUrl": "https://listed.to/@{}",
@ -1338,6 +1391,13 @@
"urlMain": "https://forums.mmorpg.com/",
"username_claimed": "goku"
},
"Mamot": {
"errorType": "status_code",
"regexCheck": "^[a-zA-Z0-9_]{1,30}$",
"url": "https://mamot.fr/@{}",
"urlMain": "https://mamot.fr/",
"username_claimed": "anciensEnssat"
},
"Medium": {
"errorMsg": "<body",
"errorType": "message",
@ -1353,8 +1413,8 @@
"username_claimed": "blue"
},
"Minecraft": {
"errorCode": 204,
"errorType": "status_code",
"errorMsg": "Couldn't find any profile with name",
"errorType": "message",
"url": "https://api.mojang.com/users/profiles/minecraft/{}",
"urlMain": "https://minecraft.net/",
"username_claimed": "blue"
@ -1400,12 +1460,12 @@
"username_claimed": "blue"
},
"Mydramalist": {
"errorMsg": "Sign in - MyDramaList",
"errorType": "message",
"url": "https://www.mydramalist.com/profile/{}",
"urlMain": "https://mydramalist.com",
"username_claimed": "elhadidy12398"
},
"errorMsg": "The requested page was not found",
"errorType": "message",
"url": "https://www.mydramalist.com/profile/{}",
"urlMain": "https://mydramalist.com",
"username_claimed": "elhadidy12398"
},
"Myspace": {
"errorType": "status_code",
"url": "https://myspace.com/{}",
@ -1419,6 +1479,13 @@
"urlMain": "https://www.native-instruments.com/forum/",
"username_claimed": "jambert"
},
"namuwiki": {
"__comment__": "This is a Korean site and it's expected to return false negatives in certain other regions.",
"errorType": "status_code",
"url": "https://namu.wiki/w/%EC%82%AC%EC%9A%A9%EC%9E%90:{}",
"urlMain": "https://namu.wiki/",
"username_claimed": "namu"
},
"NationStates Nation": {
"errorMsg": "Was this your nation? It may have ceased to exist due to inactivity, but can rise again!",
"errorType": "message",
@ -1499,6 +1566,13 @@
"urlMain": "https://nyaa.si/",
"username_claimed": "blue"
},
"Open Collective": {
"errorMsg": "Oops! Page not found",
"errorType": "message",
"url": "https://opencollective.com/{}",
"urlMain": "https://opencollective.com/",
"username_claimed": "pylapp"
},
"OpenStreetMap": {
"errorType": "status_code",
"regexCheck": "^[^.]*?$",
@ -1519,6 +1593,13 @@
"urlMain": "https://ourdjtalk.com/",
"username_claimed": "steve"
},
"Outgress": {
"errorMsg": "Outgress - Error",
"errorType": "message",
"url": "https://outgress.com/agents/{}",
"urlMain": "https://outgress.com/",
"username_claimed": "pylapp"
},
"PCGamer": {
"errorMsg": "The specified member cannot be found. Please enter a member's entire name.",
"errorType": "message",
@ -1580,12 +1661,31 @@
"urlMain": "https://www.pinkbike.com/",
"username_claimed": "blue"
},
"pixelfed.social": {
"errorType": "status_code",
"url": "https://pixelfed.social/{}/",
"urlMain": "https://pixelfed.social",
"username_claimed": "pylapp"
},
"PlayStore": {
"errorType": "status_code",
"url": "https://play.google.com/store/apps/developer?id={}",
"urlMain": "https://play.google.com/store",
"username_claimed": "Facebook"
},
"Playstrategy": {
"errorType": "status_code",
"url": "https://playstrategy.org/@/{}",
"urlMain": "https://playstrategy.org",
"username_claimed": "oruro"
},
"Plurk": {
"errorMsg": "User Not Found!",
"errorType": "message",
"url": "https://www.plurk.com/{}",
"urlMain": "https://www.plurk.com/",
"username_claimed": "plurkoffice"
},
"PocketStars": {
"errorMsg": "Join Your Favorite Adult Stars",
"errorType": "message",
@ -1633,6 +1733,20 @@
"urlMain": "https://www.producthunt.com/",
"username_claimed": "jenny"
},
"programming.dev": {
"errorMsg": "Error!",
"errorType": "message",
"url": "https://programming.dev/u/{}",
"urlMain": "https://programming.dev",
"username_claimed": "pylapp"
},
"Pychess": {
"errorType": "message",
"errorMsg": "404",
"url": "https://www.pychess.org/@/{}",
"urlMain": "https://www.pychess.org",
"username_claimed": "gbtami"
},
"PromoDJ": {
"errorType": "status_code",
"url": "http://promodj.com/{}",
@ -1722,8 +1836,7 @@
"username_claimed": "blue"
},
"Roblox": {
"errorMsg": "Page cannot be found or no longer exists",
"errorType": "message",
"errorType": "status_code",
"url": "https://www.roblox.com/user.aspx?username={}",
"urlMain": "https://www.roblox.com/",
"username_claimed": "bluewolfekiller"
@ -1831,7 +1944,7 @@
},
"SlideShare": {
"errorType": "message",
"errorMsg": "<title>Username available</title>",
"errorMsg": "<title>Page no longer exists</title>",
"url": "https://slideshare.net/{}",
"urlMain": "https://slideshare.net/",
"username_claimed": "blue"
@ -1865,6 +1978,13 @@
"urlMain": "https://www.snapchat.com",
"username_claimed": "teamsnapchat"
},
"SOOP": {
"errorType": "status_code",
"url": "https://www.sooplive.co.kr/station/{}",
"urlMain": "https://www.sooplive.co.kr/",
"urlProbe": "https://api-channel.sooplive.co.kr/v1.1/channel/{}/station",
"username_claimed": "udkn"
},
"SoundCloud": {
"errorType": "status_code",
"url": "https://soundcloud.com/{}",
@ -1884,6 +2004,12 @@
"urlMain": "https://soylentnews.org",
"username_claimed": "adam"
},
"SpeakerDeck": {
"errorType": "status_code",
"url": "https://speakerdeck.com/{}",
"urlMain": "https://speakerdeck.com/",
"username_claimed": "pylapp"
},
"Speedrun.com": {
"errorType": "status_code",
"url": "https://speedrun.com/users/{}",
@ -2025,6 +2151,12 @@
"urlMain": "https://themeforest.net/",
"username_claimed": "user"
},
"tistory": {
"errorType": "status_code",
"url": "https://{}.tistory.com/",
"urlMain": "https://www.tistory.com/",
"username_claimed": "notice"
},
"TnAFlix": {
"errorType": "status_code",
"isNSFW": true,
@ -2032,14 +2164,6 @@
"urlMain": "https://www.tnaflix.com/",
"username_claimed": "hacker"
},
"TorrentGalaxy": {
"errorMsg": "<title>TGx:Can't show details</title>",
"errorType": "message",
"regexCheck": "^[A-Za-z0-9]{3,15}$",
"url": "https://torrentgalaxy.to/profile/{}",
"urlMain": "https://torrentgalaxy.to/",
"username_claimed": "GalaxyRG"
},
"TradingView": {
"errorType": "status_code",
"request_method": "GET",
@ -2706,7 +2830,7 @@
"username_claimed": "green"
},
"threads": {
"errorMsg": "<title>Threads</title>",
"errorMsg": "<title>Threads • Log in</title>",
"errorType": "message",
"headers": {
"Sec-Fetch-Mode": "navigate"
@ -2721,12 +2845,24 @@
"urlMain": "https://www.toster.ru/",
"username_claimed": "adam"
},
"tumblr": {
"errorType": "status_code",
"url": "https://{}.tumblr.com/",
"urlMain": "https://www.tumblr.com/",
"username_claimed": "goku"
},
"uid": {
"errorType": "status_code",
"url": "http://uid.me/{}",
"urlMain": "https://uid.me/",
"username_claimed": "blue"
},
"write.as": {
"errorType": "status_code",
"url": "https://write.as/{}",
"urlMain": "https://write.as",
"username_claimed": "pylapp"
},
"xHamster": {
"errorType": "status_code",
"isNSFW": true,
@ -2747,5 +2883,13 @@
"urlProbe": "https://public.api.bsky.app/xrpc/app.bsky.actor.getProfile?actor={}.bsky.social",
"urlMain": "https://bsky.app/",
"username_claimed": "mcuban"
},
"Platzi": {
"errorType": "status_code",
"errorCode": 404,
"url": "https://platzi.com/p/{}/",
"urlMain": "https://platzi.com/",
"username_claimed": "freddier",
"request_method": "GET"
}
}

View File

@ -1,80 +1,149 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "Sherlock Target Manifest",
"description": "Social media targets to probe for the existence of known usernames",
"type": "object",
"properties": {
"$schema": { "type": "string" }
},
"patternProperties": {
"^(?!\\$).*?$": {
"type": "object",
"description": "Target name and associated information (key should be human readable name)",
"required": [ "url", "urlMain", "errorType", "username_claimed" ],
"properties": {
"url": { "type": "string" },
"urlMain": { "type": "string" },
"urlProbe": { "type": "string" },
"username_claimed": { "type": "string" },
"regexCheck": { "type": "string" },
"isNSFW": { "type": "boolean" },
"headers": { "type": "object" },
"request_payload": { "type": "object" },
"__comment__": {
"type": "string",
"description": "Used to clarify important target information if (and only if) a commit message would not suffice.\nThis key should not be parsed anywhere within Sherlock."
},
"tags": {
"oneOf": [
{ "$ref": "#/$defs/tag" },
{ "type": "array", "items": { "$ref": "#/$defs/tag" } }
]
},
"request_method": {
"type": "string",
"enum": [ "GET", "POST", "HEAD", "PUT" ]
},
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "Sherlock Target Manifest",
"description": "Social media targets to probe for the existence of known usernames",
"type": "object",
"properties": {
"$schema": { "type": "string" }
},
"patternProperties": {
"^(?!\\$).*?$": {
"type": "object",
"description": "Target name and associated information (key should be human readable name)",
"required": ["url", "urlMain", "errorType", "username_claimed"],
"properties": {
"url": { "type": "string" },
"urlMain": { "type": "string" },
"urlProbe": { "type": "string" },
"username_claimed": { "type": "string" },
"regexCheck": { "type": "string" },
"isNSFW": { "type": "boolean" },
"headers": { "type": "object" },
"request_payload": { "type": "object" },
"__comment__": {
"type": "string",
"description": "Used to clarify important target information if (and only if) a commit message would not suffice.\nThis key should not be parsed anywhere within Sherlock."
},
"tags": {
"oneOf": [
{ "$ref": "#/$defs/tag" },
{ "type": "array", "items": { "$ref": "#/$defs/tag" } }
]
},
"request_method": {
"type": "string",
"enum": ["GET", "POST", "HEAD", "PUT"]
},
"errorType": {
"oneOf": [
{
"type": "string",
"enum": ["message", "response_url", "status_code"]
},
{
"type": "array",
"items": {
"type": "string",
"enum": ["message", "response_url", "status_code"]
}
}
]
},
"errorMsg": {
"oneOf": [
{ "type": "string" },
{ "type": "array", "items": { "type": "string" } }
]
},
"errorCode": {
"oneOf": [
{ "type": "integer" },
{ "type": "array", "items": { "type": "integer" } }
]
},
"errorUrl": { "type": "string" },
"response_url": { "type": "string" }
},
"dependencies": {
"errorMsg": {
"oneOf": [
{ "properties": { "errorType": { "const": "message" } } },
{
"properties": {
"errorType": {
"type": "string",
"enum": [ "message", "response_url", "status_code" ]
},
"errorMsg": {
"oneOf": [
{ "type": "string" },
{ "type": "array", "items": { "type": "string" } }
]
},
"errorCode": {
"oneOf": [
{ "type": "integer" },
{ "type": "array", "items": { "type": "integer" } }
]
},
"errorUrl": { "type": "string" },
"response_url": { "type": "string" }
},
"dependencies": {
"errorMsg": {
"properties" : { "errorType": { "const": "message" } }
},
"errorUrl": {
"properties": { "errorType": { "const": "response_url" } }
},
"errorCode": {
"properties": { "errorType": { "const": "status_code" } }
"type": "array",
"contains": { "const": "message" }
}
},
"if": { "properties": { "errorType": { "const": "message" } } },
"then": { "required": [ "errorMsg" ] },
"else": {
"if": { "properties": { "errorType": { "const": "response_url" } } },
"then": { "required": [ "errorUrl" ] }
},
"additionalProperties": false
}
}
]
},
"errorUrl": {
"oneOf": [
{ "properties": { "errorType": { "const": "response_url" } } },
{
"properties": {
"errorType": {
"type": "array",
"contains": { "const": "response_url" }
}
}
}
]
},
"errorCode": {
"oneOf": [
{ "properties": { "errorType": { "const": "status_code" } } },
{
"properties": {
"errorType": {
"type": "array",
"contains": { "const": "status_code" }
}
}
}
]
}
},
"additionalProperties": false,
"$defs": {
"tag": { "type": "string", "enum": [ "adult", "gaming" ] }
},
"allOf": [
{
"if": {
"anyOf": [
{ "properties": { "errorType": { "const": "message" } } },
{
"properties": {
"errorType": {
"type": "array",
"contains": { "const": "message" }
}
}
}
]
},
"then": { "required": ["errorMsg"] }
},
{
"if": {
"anyOf": [
{ "properties": { "errorType": { "const": "response_url" } } },
{
"properties": {
"errorType": {
"type": "array",
"contains": { "const": "response_url" }
}
}
}
]
},
"then": { "required": ["errorUrl"] }
}
],
"additionalProperties": false
}
},
"additionalProperties": false,
"$defs": {
"tag": { "type": "string", "enum": ["adult", "gaming"] }
}
}

View File

@ -169,14 +169,12 @@ def multiple_usernames(username):
def sherlock(
username: str,
site_data: dict,
site_data: dict[str, dict[str, str]],
query_notify: QueryNotify,
tor: bool = False,
unique_tor: bool = False,
dump_response: bool = False,
proxy: Optional[str] = None,
timeout: int = 60,
):
) -> dict[str, dict[str, str | QueryResult]]:
"""Run Sherlock Analysis.
Checks for existence of username on various social media sites.
@ -188,8 +186,6 @@ def sherlock(
query_notify -- Object with base type of QueryNotify().
This will be used to notify the caller about
query results.
tor -- Boolean indicating whether to use a tor circuit for the requests.
unique_tor -- Boolean indicating whether to use a new tor circuit for each request.
proxy -- String indicating the proxy URL
timeout -- Time in seconds to wait before timing out request.
Default is 60 seconds.
@ -210,32 +206,9 @@ def sherlock(
# Notify caller that we are starting the query.
query_notify.start(username)
# Create session based on request methodology
if tor or unique_tor:
try:
from torrequest import TorRequest # noqa: E402
except ImportError:
print("Important!")
print("> --tor and --unique-tor are now DEPRECATED, and may be removed in a future release of Sherlock.")
print("> If you've installed Sherlock via pip, you can include the optional dependency via `pip install 'sherlock-project[tor]'`.")
print("> Other packages should refer to their documentation, or install it separately with `pip install torrequest`.\n")
sys.exit(query_notify.finish())
print("Important!")
print("> --tor and --unique-tor are now DEPRECATED, and may be removed in a future release of Sherlock.")
# Requests using Tor obfuscation
try:
underlying_request = TorRequest()
except OSError:
print("Tor not found in system path. Unable to continue.\n")
sys.exit(query_notify.finish())
underlying_session = underlying_request.session
else:
# Normal requests
underlying_session = requests.session()
underlying_request = requests.Request()
# Normal requests
underlying_session = requests.session()
# Limit number of workers to 20.
# This is probably vastly overkill.
@ -359,15 +332,10 @@ def sherlock(
# Store future in data for access later
net_info["request_future"] = future
# Reset identify for tor (if needed)
if unique_tor:
underlying_request.reset_identity()
# Add this site's results into final dictionary with all the other results.
results_total[social_network] = results_site
# Open the file containing account links
# Core logic: If tor requests, make them here. If multi-threaded requests, wait for responses
for social_network, net_info in site_data.items():
# Retrieve results again
results_site = results_total.get(social_network)
@ -381,6 +349,8 @@ def sherlock(
# Get the expected error type
error_type = net_info["errorType"]
if isinstance(error_type, str):
error_type: list[str] = [error_type]
# Retrieve future and ensure it has finished
future = net_info["request_future"]
@ -425,58 +395,60 @@ def sherlock(
elif any(hitMsg in r.text for hitMsg in WAFHitMsgs):
query_status = QueryStatus.WAF
elif error_type == "message":
# error_flag True denotes no error found in the HTML
# error_flag False denotes error found in the HTML
error_flag = True
errors = net_info.get("errorMsg")
# errors will hold the error message
# it can be string or list
# by isinstance method we can detect that
# and handle the case for strings as normal procedure
# and if its list we can iterate the errors
if isinstance(errors, str):
# Checks if the error message is in the HTML
# if error is present we will set flag to False
if errors in r.text:
error_flag = False
else:
# If it's list, it will iterate all the error message
for error in errors:
if error in r.text:
error_flag = False
break
if error_flag:
query_status = QueryStatus.CLAIMED
else:
query_status = QueryStatus.AVAILABLE
elif error_type == "status_code":
error_codes = net_info.get("errorCode")
query_status = QueryStatus.CLAIMED
# Type consistency, allowing for both singlets and lists in manifest
if isinstance(error_codes, int):
error_codes = [error_codes]
if error_codes is not None and r.status_code in error_codes:
query_status = QueryStatus.AVAILABLE
elif r.status_code >= 300 or r.status_code < 200:
query_status = QueryStatus.AVAILABLE
elif error_type == "response_url":
# For this detection method, we have turned off the redirect.
# So, there is no need to check the response URL: it will always
# match the request. Instead, we will ensure that the response
# code indicates that the request was successful (i.e. no 404, or
# forward to some odd redirect).
if 200 <= r.status_code < 300:
query_status = QueryStatus.CLAIMED
else:
query_status = QueryStatus.AVAILABLE
else:
# It should be impossible to ever get here...
raise ValueError(
f"Unknown Error Type '{error_type}' for " f"site '{social_network}'"
)
if any(errtype not in ["message", "status_code", "response_url"] for errtype in error_type):
error_context = f"Unknown error type '{error_type}' for {social_network}"
query_status = QueryStatus.UNKNOWN
else:
if "message" in error_type:
# error_flag True denotes no error found in the HTML
# error_flag False denotes error found in the HTML
error_flag = True
errors = net_info.get("errorMsg")
# errors will hold the error message
# it can be string or list
# by isinstance method we can detect that
# and handle the case for strings as normal procedure
# and if its list we can iterate the errors
if isinstance(errors, str):
# Checks if the error message is in the HTML
# if error is present we will set flag to False
if errors in r.text:
error_flag = False
else:
# If it's list, it will iterate all the error message
for error in errors:
if error in r.text:
error_flag = False
break
if error_flag:
query_status = QueryStatus.CLAIMED
else:
query_status = QueryStatus.AVAILABLE
if "status_code" in error_type and query_status is not QueryStatus.AVAILABLE:
error_codes = net_info.get("errorCode")
query_status = QueryStatus.CLAIMED
# Type consistency, allowing for both singlets and lists in manifest
if isinstance(error_codes, int):
error_codes = [error_codes]
if error_codes is not None and r.status_code in error_codes:
query_status = QueryStatus.AVAILABLE
elif r.status_code >= 300 or r.status_code < 200:
query_status = QueryStatus.AVAILABLE
if "response_url" in error_type and query_status is not QueryStatus.AVAILABLE:
# For this detection method, we have turned off the redirect.
# So, there is no need to check the response URL: it will always
# match the request. Instead, we will ensure that the response
# code indicates that the request was successful (i.e. no 404, or
# forward to some odd redirect).
if 200 <= r.status_code < 300:
query_status = QueryStatus.CLAIMED
else:
query_status = QueryStatus.AVAILABLE
if dump_response:
print("+++++++++++++++++++++")
@ -507,7 +479,7 @@ def sherlock(
print("+++++++++++++++++++++")
# Notify caller about results of query.
result = QueryResult(
result: QueryResult = QueryResult(
username=username,
site_name=social_network,
site_url_user=url,
@ -596,22 +568,6 @@ def main():
dest="output",
help="If using single username, the output of the result will be saved to this file.",
)
parser.add_argument(
"--tor",
"-t",
action="store_true",
dest="tor",
default=False,
help="Make requests over Tor; increases runtime; requires Tor to be installed and in system path.",
)
parser.add_argument(
"--unique-tor",
"-u",
action="store_true",
dest="unique_tor",
default=False,
help="Make requests over Tor with new Tor circuit after each request; increases runtime; requires Tor to be installed and in system path.",
)
parser.add_argument(
"--csv",
action="store_true",
@ -719,12 +675,30 @@ def main():
help="Include checking of NSFW sites from default list.",
)
# TODO deprecated in favor of --txt, retained for workflow compatibility, to be removed
# in future release
parser.add_argument(
"--no-txt",
action="store_true",
dest="no_txt",
default=False,
help="Disable creation of a txt file",
help="Disable creation of a txt file - WILL BE DEPRECATED",
)
parser.add_argument(
"--txt",
action="store_true",
dest="output_txt",
default=False,
help="Enable creation of a txt file",
)
parser.add_argument(
"--ignore-exclusions",
action="store_true",
dest="ignore_exclusions",
default=False,
help="Ignore upstream exclusions (may return more false positives)",
)
args = parser.parse_args()
@ -734,7 +708,7 @@ def main():
# Check for newer version of Sherlock. If it exists, let the user know about it
try:
latest_release_raw = requests.get(forge_api_latest_release).text
latest_release_raw = requests.get(forge_api_latest_release, timeout=10).text
latest_release_json = json_loads(latest_release_raw)
latest_remote_tag = latest_release_json["tag_name"]
@ -747,22 +721,10 @@ def main():
except Exception as error:
print(f"A problem occurred while checking for an update: {error}")
# Argument check
# TODO regex check on args.proxy
if args.tor and (args.proxy is not None):
raise Exception("Tor and Proxy cannot be set at the same time.")
# Make prompts
if args.proxy is not None:
print("Using the proxy: " + args.proxy)
if args.tor or args.unique_tor:
print("Using Tor to make requests")
print(
"Warning: some websites might refuse connecting over Tor, so note that using this option might increase connection errors."
)
if args.no_color:
# Disable color output.
init(strip=True, convert=False)
@ -784,7 +746,8 @@ def main():
try:
if args.local:
sites = SitesInformation(
os.path.join(os.path.dirname(__file__), "resources/data.json")
os.path.join(os.path.dirname(__file__), "resources/data.json"),
honor_exclusions=False,
)
else:
json_file_location = args.json_file
@ -793,7 +756,7 @@ def main():
if args.json_file.isnumeric():
pull_number = args.json_file
pull_url = f"https://api.github.com/repos/sherlock-project/sherlock/pulls/{pull_number}"
pull_request_raw = requests.get(pull_url).text
pull_request_raw = requests.get(pull_url, timeout=10).text
pull_request_json = json_loads(pull_request_raw)
# Check if it's a valid pull request
@ -804,7 +767,11 @@ def main():
head_commit_sha = pull_request_json["head"]["sha"]
json_file_location = f"https://raw.githubusercontent.com/sherlock-project/sherlock/{head_commit_sha}/sherlock_project/resources/data.json"
sites = SitesInformation(json_file_location)
sites = SitesInformation(
data_file_path=json_file_location,
honor_exclusions=not args.ignore_exclusions,
do_not_exclude=args.site_list,
)
except Exception as error:
print(f"ERROR: {error}")
sys.exit(1)
@ -858,8 +825,6 @@ def main():
username,
site_data,
query_notify,
tor=args.tor,
unique_tor=args.unique_tor,
dump_response=args.dump_response,
proxy=args.proxy,
timeout=args.timeout,
@ -875,7 +840,7 @@ def main():
else:
result_file = f"{username}.txt"
if not args.no_txt:
if args.output_txt:
with open(result_file, "w", encoding="utf-8") as file:
exists_counter = 0
for website_name in results:

View File

@ -7,6 +7,10 @@ import json
import requests
import secrets
MANIFEST_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json"
EXCLUSIONS_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/refs/heads/exclusions/false_positive_exclusions.txt"
class SiteInformation:
def __init__(self, name, url_home, url_username_format, username_claimed,
information, is_nsfw, username_unclaimed=secrets.token_urlsafe(10)):
@ -67,12 +71,17 @@ class SiteInformation:
Return Value:
Nicely formatted string to get information about this object.
"""
return f"{self.name} ({self.url_home})"
class SitesInformation:
def __init__(self, data_file_path=None):
def __init__(
self,
data_file_path: str|None = None,
honor_exclusions: bool = True,
do_not_exclude: list[str] = [],
):
"""Create Sites Information Object.
Contains information about all supported websites.
@ -110,7 +119,7 @@ class SitesInformation:
# The default data file is the live data.json which is in the GitHub repo. The reason why we are using
# this instead of the local one is so that the user has the most up-to-date data. This prevents
# users from creating issue about false positives which has already been fixed or having outdated data
data_file_path = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json"
data_file_path = MANIFEST_URL
# Ensure that specified data file has correct extension.
if not data_file_path.lower().endswith(".json"):
@ -120,7 +129,7 @@ class SitesInformation:
if data_file_path.lower().startswith("http"):
# Reference is to a URL.
try:
response = requests.get(url=data_file_path)
response = requests.get(url=data_file_path, timeout=30)
except Exception as error:
raise FileNotFoundError(
f"Problem while attempting to access data file URL '{data_file_path}': {error}"
@ -152,9 +161,31 @@ class SitesInformation:
raise FileNotFoundError(f"Problem while attempting to access "
f"data file '{data_file_path}'."
)
site_data.pop('$schema', None)
if honor_exclusions:
try:
response = requests.get(url=EXCLUSIONS_URL, timeout=10)
if response.status_code == 200:
exclusions = response.text.splitlines()
exclusions = [exclusion.strip() for exclusion in exclusions]
for site in do_not_exclude:
if site in exclusions:
exclusions.remove(site)
for exclusion in exclusions:
try:
site_data.pop(exclusion, None)
except KeyError:
pass
except Exception:
# If there was any problem loading the exclusions, just continue without them
print("Warning: Could not load exclusions, continuing without them.")
honor_exclusions = False
self.sites = {}
# Add all site information from the json file to internal site list.
@ -194,7 +225,7 @@ class SitesInformation:
for site in self.sites:
if self.sites[site].is_nsfw and site.casefold() not in do_not_remove:
continue
sites[site] = self.sites[site]
sites[site] = self.sites[site]
self.sites = sites
def site_name_list(self):

View File

@ -4,6 +4,11 @@ import urllib
import pytest
from sherlock_project.sites import SitesInformation
def fetch_local_manifest(honor_exclusions: bool = True) -> dict[str, dict[str, str]]:
    """Load the repository's local data.json and index it by site name.

    Args:
        honor_exclusions: Forwarded unchanged to ``SitesInformation``.

    Returns:
        A dict mapping each site's ``name`` to its ``information`` mapping.
    """
    # The manifest path is resolved relative to this file, not the working directory.
    manifest_path = os.path.join(
        os.path.dirname(__file__),
        "../sherlock_project/resources/data.json",
    )
    loaded_sites = SitesInformation(
        data_file_path=manifest_path,
        honor_exclusions=honor_exclusions,
    )

    manifest: dict[str, dict[str, str]] = {}
    for entry in loaded_sites:
        manifest[entry.name] = entry.information
    return manifest
@pytest.fixture()
def sites_obj():
sites_obj = SitesInformation(data_file_path=os.path.join(os.path.dirname(__file__), "../sherlock_project/resources/data.json"))
@ -11,9 +16,7 @@ def sites_obj():
@pytest.fixture(scope="session")
def sites_info():
sites_obj = SitesInformation(data_file_path=os.path.join(os.path.dirname(__file__), "../sherlock_project/resources/data.json"))
sites_iterable = {site.name: site.information for site in sites_obj}
yield sites_iterable
yield fetch_local_manifest()
@pytest.fixture(scope="session")
def remote_schema():
@ -21,3 +24,28 @@ def remote_schema():
with urllib.request.urlopen(schema_url) as remoteschema:
schemadat = json.load(remoteschema)
yield schemadat
def pytest_addoption(parser):
    """Register the ``--chunked-sites`` command-line option.

    The option stores a single string value and defaults to ``None`` when it
    is not supplied on the command line.
    """
    option_settings = {
        "action": "store",
        "default": None,
        "help": "For tests utilizing chunked sites, include only the (comma-separated) site(s) specified.",
    }
    parser.addoption("--chunked-sites", **option_settings)
def pytest_generate_tests(metafunc):
    """Parametrize ``chunked_sites`` with one single-site dict per manifest entry.

    Does nothing for tests that do not request the ``chunked_sites`` fixture.
    Otherwise the local manifest is loaded with exclusions disabled and, when
    ``--chunked-sites`` is given, narrowed to the comma-separated names in it.
    Each generated parameter is a ``{site_name: site_data}`` dict and its test
    id is the site name.
    """
    if "chunked_sites" not in metafunc.fixturenames:
        return

    manifest = fetch_local_manifest(honor_exclusions=False)

    # Ingest and apply site selections
    requested: str | None = metafunc.config.getoption("--chunked-sites")
    if requested:
        wanted: list[str] = [name.strip() for name in requested.split(",")]
        manifest = {
            name: info for name, info in manifest.items()
            if name in wanted
        }

    site_names = list(manifest.keys())
    single_site_params = [{name: manifest[name]} for name in site_names]
    metafunc.parametrize("chunked_sites", single_site_params, ids=site_names)

View File

@ -7,7 +7,7 @@ def test_validate_manifest_against_local_schema():
"""Ensures that the manifest matches the local schema, for situations where the schema is being changed."""
json_relative: str = '../sherlock_project/resources/data.json'
schema_relative: str = '../sherlock_project/resources/data.schema.json'
json_path: str = os.path.join(os.path.dirname(__file__), json_relative)
schema_path: str = os.path.join(os.path.dirname(__file__), schema_relative)

View File

@ -0,0 +1,99 @@
import pytest
import re
import rstr
from sherlock_project.sherlock import sherlock
from sherlock_project.notify import QueryNotify
from sherlock_project.result import QueryResult, QueryStatus
FALSE_POSITIVE_ATTEMPTS: int = 2  # Since the usernames are randomly generated, it's POSSIBLE that a real username can be hit
FALSE_POSITIVE_QUANTIFIER_UPPER_BOUND: int = 15  # If a pattern uses quantifiers such as `+` `*` or `{n,}`, limit the upper bound (0 to disable)
FALSE_POSITIVE_DEFAULT_PATTERN: str = r'^[a-zA-Z0-9]{7,20}$'  # Used in absence of a regexCheck entry


def set_pattern_upper_bound(pattern: str, upper_bound: int = FALSE_POSITIVE_QUANTIFIER_UPPER_BOUND) -> str:
    """Set upper bound for regex patterns that use quantifiers such as `+` `*` or `{n,}`.

    Args:
        pattern: Regular expression source text to rewrite.
        upper_bound: Maximum repetition count substituted for open-ended
            quantifiers.

    Returns:
        The pattern with every unescaped `{n,}` rewritten to `{n,m}`, every
        unescaped `+` to `{1,upper_bound}` and every unescaped `*` to
        `{0,upper_bound}`. For `{n,}`, m is the larger of n and upper_bound so
        the resulting range is never inverted.

    Note:
        The rewrite is purely textual: a quantifier character is skipped only
        when the character immediately before it is a backslash. `+` or `*`
        inside a character class (e.g. `[+*]`) is therefore rewritten as well.
    """
    def replace_upper_bound(match: re.Match) -> str:
        lower_bound: int = int(match.group(1))
        # Use a distinct local name: assigning to `upper_bound` here would make
        # it local to this nested function and raise UnboundLocalError on read.
        effective_upper: int = max(lower_bound, upper_bound)
        return f'{{{lower_bound},{effective_upper}}}'

    pattern = re.sub(r'(?<!\\)\{(\d+),\}', replace_upper_bound, pattern)  # {n,}
    pattern = re.sub(r'(?<!\\)\+', f'{{1,{upper_bound}}}', pattern)  # +
    pattern = re.sub(r'(?<!\\)\*', f'{{0,{upper_bound}}}', pattern)  # *

    return pattern
def false_positive_check(sites_info: dict[str, dict[str, str]], site: str, pattern: str) -> QueryStatus:
    """Probe a site with random usernames to see whether it reports false positives.

    Up to FALSE_POSITIVE_ATTEMPTS usernames are generated from ``pattern`` and
    queried through ``sherlock``. The loop stops at the first AVAILABLE or WAF
    status; otherwise the status of the last attempt is returned (UNKNOWN when
    no attempt is made).

    Raises:
        TypeError: If the value stored under ``'status'`` for the site has no
            ``status`` attribute, or that attribute is not a QueryStatus.
    """
    outcome: QueryStatus = QueryStatus.UNKNOWN
    attempts_left: int = FALSE_POSITIVE_ATTEMPTS

    while attempts_left > 0:
        attempts_left -= 1

        notifier: QueryNotify = QueryNotify()
        candidate: str = rstr.xeger(pattern)

        site_results = sherlock(
            username=candidate,
            site_data=sites_info,
            query_notify=notifier,
        )
        result: QueryResult | str = site_results[site]['status']

        if not hasattr(result, 'status'):
            raise TypeError(f"Result for site {site} does not have 'status' attribute. Actual result: {result}")
        if type(result.status) is not QueryStatus:  # type: ignore
            raise TypeError(f"Result status for site {site} is not of type QueryStatus. Actual type: {type(result.status)}")  # type: ignore
        outcome = result.status  # type: ignore

        # A definitive "not found" or a WAF block ends the probing early.
        if outcome == QueryStatus.AVAILABLE or outcome == QueryStatus.WAF:
            break

    return outcome
def false_negative_check(sites_info: dict[str, dict[str, str]], site: str) -> QueryStatus:
    """Query a site with its ``username_claimed`` entry and return the resulting status.

    Raises:
        TypeError: If the value stored under ``'status'`` for the site has no
            ``status`` attribute, or that attribute is not a QueryStatus.
    """
    notifier: QueryNotify = QueryNotify()
    claimed_username = sites_info[site]['username_claimed']

    site_results = sherlock(
        username=claimed_username,
        site_data=sites_info,
        query_notify=notifier,
    )
    result: QueryResult | str = site_results[site]['status']

    if not hasattr(result, 'status'):
        raise TypeError(f"Result for site {site} does not have 'status' attribute. Actual result: {result}")
    if type(result.status) is not QueryStatus:  # type: ignore
        raise TypeError(f"Result status for site {site} is not of type QueryStatus. Actual type: {type(result.status)}")  # type: ignore

    return result.status  # type: ignore
@pytest.mark.validate_targets
@pytest.mark.online
class Test_All_Targets:
    """Checks run against each manifest target supplied through ``chunked_sites``."""

    @pytest.mark.validate_targets_fp
    def test_false_pos(self, chunked_sites: dict[str, dict[str, str]]):
        """Flag targets that report a randomly generated username as anything but available."""
        for site, site_info in chunked_sites.items():
            # Fall back to the default pattern when the site defines no regexCheck.
            if 'regexCheck' in site_info:
                pattern: str = site_info['regexCheck']
            else:
                pattern = FALSE_POSITIVE_DEFAULT_PATTERN

            # A bound of 0 disables the quantifier capping.
            if FALSE_POSITIVE_QUANTIFIER_UPPER_BOUND > 0:
                pattern = set_pattern_upper_bound(pattern)

            result: QueryStatus = false_positive_check(chunked_sites, site, pattern)
            assert result is QueryStatus.AVAILABLE, f"{site} produced false positive with pattern {pattern}, result was {result}"

    @pytest.mark.validate_targets_fn
    def test_false_neg(self, chunked_sites: dict[str, dict[str, str]]):
        """Flag targets that fail to report their known claimed username as claimed."""
        for site in chunked_sites.keys():
            result: QueryStatus = false_negative_check(chunked_sites, site)
            assert result is QueryStatus.CLAIMED, f"{site} produced false negative, result was {result}"

View File

@ -7,8 +7,6 @@ envlist =
py312
py311
py310
py39
py38
[testenv]
description = Attempt to build and install the package
@ -16,6 +14,7 @@ deps =
coverage
jsonschema
pytest
rstr
allowlist_externals = coverage
commands =
coverage run --source=sherlock_project --module pytest -v
@ -37,7 +36,7 @@ commands =
[gh-actions]
python =
3.13: py313
3.12: py312
3.11: py311
3.10: py310
3.9: py39