wip: Add initial pakjoy.py

2023-10-21 00:32:13 +02:00 · 2023-10-21 00:32:13 +02:00 · 238a0fa2d1
parent 193b5a50a7
commit 238a0fa2d1
2 changed files with 202 additions and 0 deletions
--- a/qutebrowser/misc/binparsing.py
+++ b/qutebrowser/misc/binparsing.py
@ -0,0 +1,44 @@
+# SPDX-FileCopyrightText: Florian Bruhin (The-Compiler) <mail@qutebrowser.org>
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+"""Utilities for parsing binary files.
+
+Used by elf.py as well as pakjoy.py.
+"""
+
+import struct
+from typing import Any, IO, Tuple
+
+
+class ParseError(Exception):
+
+    """Signals that a binary file could not be read or decoded as expected."""
+
+
+def unpack(fmt: str, fobj: IO[bytes]) -> Tuple[Any, ...]:
+    """Unpack the given struct format from the given file.
+
+    Requests as many bytes as the format needs from the current position of
+    the file and decodes them.
+
+    Args:
+        fmt: A struct format string.
+        fobj: The binary file object to read from.
+
+    Return:
+        The tuple of unpacked values.
+
+    Raises:
+        ParseError: If reading fails, or if the data which was read can't be
+                    unpacked with the given format.
+    """
+    size = struct.calcsize(fmt)
+    data = safe_read(fobj, size)
+
+    try:
+        return struct.unpack(fmt, data)
+    except struct.error as e:
+        # Chain explicitly so the original struct error stays visible as cause.
+        raise ParseError(e) from e
+
+
+def safe_read(fobj: IO[bytes], size: int) -> bytes:
+    """Read from a file, handling possible exceptions.
+
+    Args:
+        fobj: The binary file object to read from.
+        size: The number of bytes to request.
+
+    Return:
+        The bytes returned by the file object's read().
+
+    Raises:
+        ParseError: If reading raises an OSError or OverflowError.
+    """
+    try:
+        return fobj.read(size)
+    except (OSError, OverflowError) as e:
+        # Chain explicitly so the underlying I/O error stays visible as cause.
+        raise ParseError(e) from e
+
+
+def safe_seek(fobj: IO[bytes], pos: int) -> None:
+    """Seek in a file, handling possible exceptions.
+
+    Args:
+        fobj: The binary file object to seek in.
+        pos: The position passed to the file object's seek().
+
+    Raises:
+        ParseError: If seeking raises an OSError or OverflowError.
+    """
+    try:
+        fobj.seek(pos)
+    except (OSError, OverflowError) as e:
+        # Chain explicitly so the underlying I/O error stays visible as cause.
+        raise ParseError(e) from e
+
--- a/qutebrowser/misc/pakjoy.py
+++ b/qutebrowser/misc/pakjoy.py
@ -0,0 +1,158 @@
+
+# SPDX-FileCopyrightText: Florian Bruhin (The-Compiler) <mail@qutebrowser.org>
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+"""Chromium .pak repacking.
+
+This entire file is a great WORKAROUND for https://bugreports.qt.io/browse/QTBUG-118157
+and the fact we can't just simply disable the hangouts extension:
+https://bugreports.qt.io/browse/QTBUG-118452
+
+It's yet another big hack. If you think this is bad, look at elf.py instead.
+
+The name of this file might or might not be inspired by a certain vegetable,
+as well as the "joy" this bug has caused me.
+
+Useful references:
+
+- https://sweetscape.com/010editor/repository/files/PAK.bt (010 editor <3)
+- https://textslashplain.com/2022/05/03/chromium-internals-pak-files/
+- https://github.com/myfreeer/chrome-pak-customizer
+- https://source.chromium.org/chromium/chromium/src/+/main:tools/grit/pak_util.py
+- https://source.chromium.org/chromium/chromium/src/+/main:tools/grit/grit/format/data_pack.py
+
+This is a "best effort" parser. If it errors out, we skip the workaround rather
+than crashing.
+"""
+
+import dataclasses
+from typing import ClassVar, IO, Optional, Dict, Tuple
+
+from qutebrowser.misc import binparsing
+
+# Byte string which a resource must contain to be considered the hangouts
+# extension manifest.
+HANGOUTS_MARKER = b"// Extension ID: nkeimhogjdpnpccoofpliimaahmaaome"
+# Resource ID which is checked first when looking for the manifest.
+HANGOUTS_ID = 36197  # as found by toofar
+
+# URL pattern which is searched for in the manifest, and the pattern which gets
+# written over it.
+TARGET_URL = b"https://*.google.com/*"
+REPLACEMENT_URL = b"https://*.qb.invalid/*"
+# The replacement is written in place over the target, so both need to have
+# the same length.
+assert len(TARGET_URL) == len(REPLACEMENT_URL)
+
+
+@dataclasses.dataclass
+class Pak5Header:
+
+    """Header fields of a version 5 Chromium .pak file."""
+
+    encoding: int  # uint32
+    resource_count: int  # uint16
+    alias_count: int  # uint16
+
+    # Little-endian struct layout of the fields above, in declaration order.
+    _FORMAT: ClassVar[str] = '<IHH'
+
+    @classmethod
+    def parse(cls, fobj: IO[bytes]) -> 'Pak5Header':
+        """Read the header fields from the file's current position."""
+        fields = binparsing.unpack(cls._FORMAT, fobj)
+        return cls(*fields)
+
+
+@dataclasses.dataclass
+class PakEntry:
+
+    """Description of a single resource entry in a .pak file."""
+
+    resource_id: int  # uint16
+    file_offset: int  # uint32
+    size: int = 0  # not in file
+
+    # Little-endian struct layout of the two fields stored in the file.
+    _FORMAT: ClassVar[str] = '<HI'
+
+    @classmethod
+    def parse(cls, fobj: IO[bytes]) -> 'PakEntry':
+        """Read one entry from the file's current position."""
+        fields = binparsing.unpack(cls._FORMAT, fobj)
+        return cls(*fields)
+
+
+class PakParser:
+
+    """Parser for version 5 Chromium .pak files, locating the hangouts manifest.
+
+    Attributes:
+        fobj: The file object being parsed.
+        manifest_entry: The PakEntry describing the hangouts manifest.
+        manifest: The raw bytes of the hangouts manifest.
+    """
+
+    def __init__(self, fobj: IO[bytes]) -> None:
+        """Parse the .pak file from the given file object.
+
+        Raises:
+            binparsing.ParseError: If the version is unsupported, the file is
+                malformed or unreadable, or no hangouts manifest was found.
+        """
+        version = binparsing.unpack("<I", fobj)[0]
+        if version != 5:
+            raise binparsing.ParseError(f"Unsupported .pak version {version}")
+
+        self.fobj = fobj
+        entries = self._read_header()
+        self.manifest_entry, self.manifest = self._find_manifest(entries)
+
+    def find_patch_offset(self) -> int:
+        """Return the absolute file offset of TARGET_URL inside the manifest.
+
+        Raises:
+            binparsing.ParseError: If the manifest doesn't contain TARGET_URL.
+        """
+        try:
+            return self.manifest_entry.file_offset + self.manifest.index(TARGET_URL)
+        except ValueError as e:
+            raise binparsing.ParseError("Couldn't find URL in manifest") from e
+
+    def _maybe_get_hangouts_manifest(self, entry: PakEntry) -> Optional[bytes]:
+        """Return the data of the given entry if it's the hangouts manifest.
+
+        Returns None if the data doesn't look like JSON or doesn't contain
+        HANGOUTS_MARKER.
+        """
+        # Use the safe helpers so that I/O errors turn into a ParseError, like
+        # everywhere else in the parser.
+        binparsing.safe_seek(self.fobj, entry.file_offset)
+        data = binparsing.safe_read(self.fobj, entry.size)
+
+        if not data.startswith(b"{") or not data.rstrip(b"\n").endswith(b"}"):
+            # not JSON
+            return None
+
+        if HANGOUTS_MARKER not in data:
+            return None
+
+        return data
+
+    def _read_header(self) -> Dict[int, PakEntry]:
+        """Read the header and entry index from the .pak file.
+
+        Return:
+            A dict mapping resource IDs to their entries (with size filled in),
+            not including the sentinel entry.
+        """
+        entries = []
+
+        header = Pak5Header.parse(self.fobj)
+        for _ in range(header.resource_count + 1):  # + 1 due to sentinel at end
+            entries.append(PakEntry.parse(self.fobj))
+
+        # The size isn't stored in the file, it's the distance to the offset of
+        # the following entry (which is why the sentinel is needed).
+        for entry, next_entry in zip(entries, entries[1:]):
+            if entry.resource_id == 0:
+                raise binparsing.ParseError("Unexpected sentinel entry")
+            entry.size = next_entry.file_offset - entry.file_offset
+            if entry.size < 0:
+                # A negative size would make read() return the rest of the file.
+                raise binparsing.ParseError("Entry offsets are not increasing")
+
+        if entries[-1].resource_id != 0:
+            raise binparsing.ParseError("Missing sentinel entry")
+        del entries[-1]
+
+        return {entry.resource_id: entry for entry in entries}
+
+    def _find_manifest(self, entries: Dict[int, PakEntry]) -> Tuple[PakEntry, bytes]:
+        """Find the hangouts manifest among the given entries.
+
+        Return:
+            A (entry, manifest data) tuple.
+
+        Raises:
+            binparsing.ParseError: If no entry contains the hangouts manifest.
+        """
+        if HANGOUTS_ID in entries:
+            suspected_entry = entries[HANGOUTS_ID]
+            manifest = self._maybe_get_hangouts_manifest(suspected_entry)
+            if manifest is not None:
+                return suspected_entry, manifest
+
+        # didn't find it via the previously known ID, let's search them all...
+        # (iterating the dict itself would only give us the resource IDs)
+        for entry in entries.values():
+            manifest = self._maybe_get_hangouts_manifest(entry)
+            if manifest is not None:
+                return entry, manifest
+
+        raise binparsing.ParseError("Couldn't find hangouts manifest")
+
+
+if __name__ == "__main__":
+    # Manual smoke test: patch a copy of the system's .pak file, then parse the
+    # patched copy again and show the manifest that was found.
+    import shutil
+
+    source_pak = "/usr/share/qt6/resources/qtwebengine_resources.pak"
+    scratch_pak = "/tmp/test.pak"
+    shutil.copy(source_pak, scratch_pak)
+
+    with open(scratch_pak, "r+b") as f:
+        parser = PakParser(f)
+        print(parser.manifest_entry)
+        print(parser.manifest)
+        # Overwrite the target URL in place with the replacement.
+        f.seek(parser.find_patch_offset())
+        f.write(REPLACEMENT_URL)
+
+    with open(scratch_pak, "rb") as f:
+        parser = PakParser(f)
+
+    print(parser.manifest_entry)
+    print(parser.manifest)