Merge pull request #2549 from sherlock-project/add/instapaper

feat: add instapaper
2025-09-20 18:13:30 -04:00 · 2025-09-20 18:13:30 -04:00 · e5e0da00fe
parent 9f5b7e1846 dc61cdc7a4
commit e5e0da00fe
4 changed files with 126 additions and 8 deletions
--- a/.github/workflows/validate_modified_targets.yml
+++ b/.github/workflows/validate_modified_targets.yml
@ -14,6 +14,7 @@ jobs:
      - name: Checkout repository
        uses: actions/checkout@v5
        with:
+          ref: ${{ github.event.pull_request.head.sha }}
          fetch-depth: 0

      - name: Set up Python
@ -38,13 +39,21 @@ jobs:

          # Discover changes
          git show origin/${{ github.base_ref }}:sherlock_project/resources/data.json > data.json.base
+          cp sherlock_project/resources/data.json data.json.head
+
          CHANGED=$(
-            jq -r --slurpfile base data.json.base --slurpfile head sherlock_project/resources/data.json '
-              [
-                ($head[0] | keys_unsorted[]) as $key
-                | select(($base[0][$key] != $head[0][$key]) or ($base[0][$key] | not))
-                | $key
-              ] | unique | join(",")'
+            python - <<'EOF'
+          import json
+          with open("data.json.base") as f: base = json.load(f)
+          with open("data.json.head") as f: head = json.load(f)
+
+          changed = []
+          for k, v in head.items():
+              if k not in base or base[k] != v:
+                  changed.append(k)
+
+          print(",".join(sorted(changed)))
+          EOF
          )

          # Preserve changelist
@ -53,12 +62,38 @@ jobs:

      - name: Validate modified targets
        if: steps.discover-modified.outputs.changed_targets != ''
+        continue-on-error: true
        run: |
          $(poetry env activate)
-          pytest -q --tb no -rA -m validate_targets -n 20 --chunked-sites "${{ steps.discover-modified.outputs.changed_targets }}"
+          pytest -q --tb no -rA -m validate_targets -n 20 --chunked-sites "${{ steps.discover-modified.outputs.changed_targets }}" --junitxml=validation_results.xml
          deactivate

-      - name: Announce skip if no modified targets
+      - name: Prepare validation summary
+        if: steps.discover-modified.outputs.changed_targets != ''
+        id: prepare-summary
+        run: |
+          $(poetry env activate)
+          summary=$(
+            python devel/summarize_site_validation.py validation_results.xml || echo "Failed to generate summary of test results"
+          )
+          deactivate
+          echo "$summary" > validation_summary.md
+
+      - name: Announce validation results
+        if: steps.discover-modified.outputs.changed_targets != ''
+        uses: actions/github-script@v8
+        with:
+          script: |
+            const fs = require('fs');
+            const body = fs.readFileSync('validation_summary.md', 'utf8');
+            github.rest.issues.createComment({
+              issue_number: context.payload.pull_request.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: body,
+            });
+
+      - name: This step shows as ran when no modifications are found
        if: steps.discover-modified.outputs.changed_targets == ''
        run: |
          echo "No modified targets found"
--- a/devel/summarize_site_validation.py
+++ b/devel/summarize_site_validation.py
@ -0,0 +1,72 @@
+#!/usr/bin/env python
+# This module summarizes the results of site validation tests queued by
+# workflow validate_modified_targets for presentation in Issue comments.
+
+from defusedxml import ElementTree as ET
+import sys
+from pathlib import Path
+
+def summarize_junit_xml(xml_path: Path) -> str:
+    tree = ET.parse(xml_path)
+    root = tree.getroot()
+    suite = root.find('testsuite')
+
+    pass_message: str = ":heavy_check_mark: &nbsp; Pass"
+    fail_message: str = ":x: &nbsp; Fail"
+
+    if suite is None:
+        raise ValueError("Invalid JUnit XML: No testsuite found")
+
+    summary_lines: list[str] = []
+    summary_lines.append("#### Automatic validation of changes\n")
+    summary_lines.append("| Target | F+ Check | F- Check |")
+    summary_lines.append("|---|---|---|")
+
+    failures = int(suite.get('failures', 0))
+    errors_detected: bool = False
+
+    results: dict[str, dict[str, str]] = {}
+
+    for testcase in suite.findall('testcase'):
+        test_name = testcase.get('name').split('[')[0]
+        site_name = testcase.get('name').split('[')[1].rstrip(']')
+        failure = testcase.find('failure')
+        error = testcase.find('error')
+
+        if site_name not in results:
+            results[site_name] = {}
+
+        if test_name == "test_false_neg":
+            results[site_name]['F- Check'] = pass_message if failure is None and error is None else fail_message
+        elif test_name == "test_false_pos":
+            results[site_name]['F+ Check'] = pass_message if failure is None and error is None else fail_message
+
+        if error is not None:
+            errors_detected = True
+
+    for result in results:
+        summary_lines.append(f"| {result} | {results[result].get('F+ Check', 'Error!')} | {results[result].get('F- Check', 'Error!')} |")
+
+    if failures > 0:
+        summary_lines.append("\n___\n" +
+            "\nFailures were detected on at least one updated target. Commits containing accuracy failures" +
+            " will often not be merged (unless a rationale is provided, such as false negatives due to regional differences).")
+
+    if errors_detected:
+        summary_lines.append("\n___\n" +
+            "\n**Errors were detected during validation. Please review the workflow logs.**")
+
+    return "\n".join(summary_lines)
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: summarize_site_validation.py <junit-xml-file>")
+        sys.exit(1)
+
+    xml_path: Path = Path(sys.argv[1])
+    if not xml_path.is_file():
+        print(f"Error: File '{xml_path}' does not exist.")
+        sys.exit(1)
+
+    summary: str = summarize_junit_xml(xml_path)
+    print(summary)
--- a/pyproject.toml
+++ b/pyproject.toml
@ -60,5 +60,9 @@ rstr = "^3.2.2"
 pytest = "^8.4.2"
 pytest-xdist = "^3.8.0"

+
+[tool.poetry.group.ci.dependencies]
+defusedxml = "^0.7.1"
+
 [tool.poetry.scripts]
 sherlock = 'sherlock_project.sherlock:main'
--- a/sherlock_project/resources/data.json
+++ b/sherlock_project/resources/data.json
@ -1142,6 +1142,13 @@
    "urlProbe": "https://imginn.com/{}",
    "username_claimed": "instagram"
  },
+  "Instapaper": {
+    "errorType": "status_code",
+    "request_method": "GET",
+    "url": "https://www.instapaper.com/p/{}",
+    "urlMain": "https://www.instapaper.com/",
+    "username_claimed": "john"
+  },
  "Instructables": {
    "errorType": "status_code",
    "url": "https://www.instructables.com/member/{}",