test: prepare false positive detection base

2025-09-14 00:39:35 -04:00 · 2025-09-14 00:39:35 -04:00 · ca094d8264
parent 5113dcfb36
commit ca094d8264
7 changed files with 76 additions and 8 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -56,6 +56,9 @@ tor = ["torrequest"]

 [tool.poetry.group.dev.dependencies]
 jsonschema = "^4.0.0"
+rstr = "^3.2.2"
+pytest = "^8.4.2"
+pytest-xdist = "^3.8.0"

 [tool.poetry.scripts]
 sherlock = 'sherlock_project.sherlock:main'
--- a/pytest.ini
+++ b/pytest.ini
@ -1,4 +1,5 @@
 [pytest]
-addopts = --strict-markers
+addopts = --strict-markers -m "not validate_targets"
 markers =
    online: mark tests are requiring internet access.
+    validate_targets: mark tests for sweeping manifest validation (sends many requests).
--- a/sherlock_project/sherlock.py
+++ b/sherlock_project/sherlock.py
@ -169,14 +169,14 @@ def multiple_usernames(username):

 def sherlock(
    username: str,
-    site_data: dict,
+    site_data: dict[str, dict[str, str]],
    query_notify: QueryNotify,
    tor: bool = False,
    unique_tor: bool = False,
    dump_response: bool = False,
    proxy: Optional[str] = None,
    timeout: int = 60,
-):
+) -> dict[str, dict[str, str | QueryResult]]:
    """Run Sherlock Analysis.

    Checks for existence of username on various social media sites.
@ -507,7 +507,7 @@ def sherlock(
            print("+++++++++++++++++++++")

        # Notify caller about results of query.
-        result = QueryResult(
+        result: QueryResult = QueryResult(
            username=username,
            site_name=social_network,
            site_url_user=url,
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -4,6 +4,11 @@ import urllib
 import pytest
 from sherlock_project.sites import SitesInformation

+def fetch_local_manifest() -> dict[str, dict[str, str]]:
+    """Load the repository's data.json via SitesInformation and map each site name to its information."""
+    manifest_path = os.path.join(os.path.dirname(__file__), "../sherlock_project/resources/data.json")
+    return {entry.name: entry.information for entry in SitesInformation(data_file_path=manifest_path)}
+
@pytest.fixture()
 def sites_obj():
    sites_obj = SitesInformation(data_file_path=os.path.join(os.path.dirname(__file__), "../sherlock_project/resources/data.json"))
@ -11,9 +16,7 @@ def sites_obj():

@pytest.fixture(scope="session")
 def sites_info():
-    sites_obj = SitesInformation(data_file_path=os.path.join(os.path.dirname(__file__), "../sherlock_project/resources/data.json"))
-    sites_iterable = {site.name: site.information for site in sites_obj}
-    yield sites_iterable
+    yield fetch_local_manifest()  # session-scoped: the manifest is loaded once per pytest session

@pytest.fixture(scope="session")
 def remote_schema():
@ -21,3 +24,10 @@ def remote_schema():
    with urllib.request.urlopen(schema_url) as remoteschema:
        schemadat = json.load(remoteschema)
    yield schemadat
+
+def pytest_generate_tests(metafunc):
+    """Parametrize tests requesting ``chunked_sites`` with one single-site dict per manifest entry."""
+    if "chunked_sites" not in metafunc.fixturenames:
+        return
+    manifest = fetch_local_manifest()
+    metafunc.parametrize("chunked_sites", [{name: data} for name, data in manifest.items()], ids=list(manifest))
--- a/tests/test_manifest.py
+++ b/tests/test_manifest.py
@ -7,7 +7,7 @@ def test_validate_manifest_against_local_schema():
    """Ensures that the manifest matches the local schema, for situations where the schema is being changed."""
    json_relative: str = '../sherlock_project/resources/data.json'
    schema_relative: str = '../sherlock_project/resources/data.schema.json'
-    
+
    json_path: str = os.path.join(os.path.dirname(__file__), json_relative)
    schema_path: str = os.path.join(os.path.dirname(__file__), schema_relative)

--- a/tests/test_validate_targets.py
+++ b/tests/test_validate_targets.py
@ -0,0 +1,53 @@
+import pytest
+import rstr
+
+from sherlock_project.sherlock import sherlock
+from sherlock_project.notify import QueryNotify
+from sherlock_project.result import QueryResult, QueryStatus
+
+
+FALSE_POSITIVE_ATTEMPTS: int = 2    # Usernames are randomly generated, so one may coincide with a real account; a single hit is not conclusive
+
+
+def false_positive_check(sites_info: dict[str, dict[str, str]], site: str, pattern: str) -> QueryStatus:
+    """Probe ``site`` with random usernames matching ``pattern``; return the last QueryStatus seen.
+
+    Retries up to FALSE_POSITIVE_ATTEMPTS times, stopping early on AVAILABLE or WAF. Raises TypeError on a malformed result.
+    """
+    status: QueryStatus = QueryStatus.UNKNOWN
+
+    for _ in range(FALSE_POSITIVE_ATTEMPTS):
+        username: str = rstr.xeger(pattern)
+        result: QueryResult | str = sherlock(
+            username=username,
+            site_data=sites_info,
+            query_notify=QueryNotify(),
+        )[site]['status']
+
+        if not hasattr(result, 'status'):
+            raise TypeError(f"Result for site {site} does not have 'status' attribute. Actual result: {result}")
+        if not isinstance(result.status, QueryStatus): # type: ignore
+            raise TypeError(f"Result status for site {site} is not of type QueryStatus. Actual type: {type(result.status)}") # type: ignore
+        status = result.status # type: ignore
+
+        if status in (QueryStatus.AVAILABLE, QueryStatus.WAF):
+            return status
+
+    return status
+
+
+@pytest.mark.validate_targets
+@pytest.mark.online
+class Test_All_Targets:
+    """Live sweep checking that each site reports a random (presumably unregistered) username as available."""
+
+    def test_manifest_false_pos(self, chunked_sites: dict[str, dict[str, str]]):
+        """Ensure each site reports AVAILABLE for a randomly generated username, i.e. yields no false positive."""
+        # Fallback used when a site declares no regexCheck of its own.
+        default_pattern: str = r'^[a-zA-Z0-9._-]{7,20}$'
+        for site, info in chunked_sites.items():
+            pattern: str = info.get('regexCheck', default_pattern)
+            result: QueryStatus = false_positive_check(chunked_sites, site, pattern)
+            # Any status other than AVAILABLE fails the test for this site.
+            assert result is QueryStatus.AVAILABLE, f"{site} produced false positive with pattern {pattern}, result was {result}"
+
--- a/tox.ini
+++ b/tox.ini
@ -16,6 +16,7 @@ deps =
    coverage
    jsonschema
    pytest
+    rstr
 allowlist_externals = coverage
 commands =
    coverage run --source=sherlock_project --module pytest -v