# examples/markenrecht-monitoring/monitor.py

"""
Markenrecht-Monitoring mit hightrusted CAPTURE.

Liest eine Liste von URLs aus einer YAML-Datei, erstellt für jede URL ein
Capture mit qualifiziertem Zeitstempel und legt die PDFs in einem strukturierten
Archiv-Ordner ab. Optional: Versand einer Zusammenfassung per E-Mail.

Use-Case:
    Du betreibst eine Marke und beobachtest, ob Wettbewerber/Trittbrettfahrer
    deine Marke missbrauchen. Sobald sie es bemerken und löschen, hast du
    bereits ein gerichtsverwertbares Capture.

Voraussetzung:
    pip install hightrusted-capture pyyaml

Lauf:
    export HIGHTRUSTED_API_KEY=ht_live_...
    python monitor.py urls.yaml ./archiv
"""

from __future__ import annotations

import argparse
import logging
import sys
from datetime import datetime, timezone
from pathlib import Path

import yaml
from hightrusted_capture import (
    Client,
    HightrustedError,
    QuotaExceededError,
    UnreachableUrlError,
)

# Set up root logging when the module is loaded: INFO and above, with a
# timestamp and a level name padded to seven characters on every line.
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s %(levelname)-7s %(message)s",
    level=logging.INFO,
)
# Named logger shared by the rest of this script.
log = logging.getLogger("markenrecht-monitor")


def main(urls_file: Path, archive_root: Path) -> int:
    """Capture every URL listed in ``urls_file`` and archive the resulting PDFs.

    A dated sub-directory (UTC, ``YYYY-MM-DD``) is created below
    ``archive_root``. Each successful capture is downloaded into it as a PDF,
    and an ``index.yaml`` summarising successes and failures is written next to
    the PDFs. The index is written even when the run stops early because the
    quota is exhausted, so captures made up to that point stay documented.

    Args:
        urls_file: UTF-8 encoded YAML file with a top-level ``urls`` list. Each
            item is a mapping with a required ``url`` key and an optional
            ``case`` key (defaults to ``"default"``).
        archive_root: Directory below which the dated archive folder is created.

    Returns:
        ``0`` if every URL was captured, ``1`` if at least one entry failed,
        ``2`` if the run was aborted (quota exhausted or unusable configuration).
    """
    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default locale encoding.
    config = yaml.safe_load(urls_file.read_text(encoding="utf-8"))
    urls = config.get("urls") if isinstance(config, dict) else None
    if not isinstance(urls, list):
        # Covers an empty file, a non-mapping document and a missing/invalid key.
        log.error("Keine URL-Liste unter 'urls' in %s gefunden.", urls_file)
        return 2

    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    target_dir = archive_root / today
    target_dir.mkdir(parents=True, exist_ok=True)

    client = Client()  # API key is taken from the environment
    summary: dict[str, list[dict]] = {"ok": [], "failed": []}
    quota_exhausted = False

    for entry in urls:
        if not isinstance(entry, dict) or not entry.get("url"):
            # Record and skip a malformed entry instead of crashing mid-run and
            # losing the index of everything captured so far.
            log.warning("   ✗ Ungültiger Eintrag (kein 'url'): %r", entry)
            bad_case = entry.get("case", "default") if isinstance(entry, dict) else "default"
            summary["failed"].append({"url": None, "case": bad_case, "reason": "invalid_entry"})
            continue

        url = entry["url"]
        case = entry.get("case", "default")
        log.info("→ %s [%s]", url, case)

        # Path separators in the case label would place the PDF outside
        # target_dir (or in a non-existent sub-directory); neutralise them.
        safe_case = str(case).replace("/", "_").replace("\\", "_")

        try:
            capture = client.capture(
                url=url,
                reference=f"monitor:{case}:{today}",
                viewport={"width": 1920, "height": 1080},
            )
            pdf_path = target_dir / f"{safe_case}_{capture['id'][:8]}.pdf"
            client.download_pdf(capture["id"], pdf_path)

            summary["ok"].append({
                "url": url,
                "case": case,
                "capture_id": capture["id"],
                "verify_url": capture["verify_url"],
                "pdf": str(pdf_path),
                "timestamp": capture["timestamp"]["issued_at"],
            })
            log.info("   ✓ %s", pdf_path.name)

        except UnreachableUrlError as e:
            log.warning("   ✗ Quelle nicht erreichbar: %s", e.message)
            summary["failed"].append({"url": url, "case": case, "reason": "unreachable"})

        except QuotaExceededError:
            # Stop processing, but fall through to the index write below so the
            # captures already made are still listed in index.yaml.
            log.error("Monats-Quota erschöpft. Abbruch.")
            summary["failed"].append({"url": url, "case": case, "reason": "quota_exceeded"})
            quota_exhausted = True
            break

        except HightrustedError as e:
            log.error("   ✗ %s (request_id=%s)", e.message, e.request_id)
            summary["failed"].append({"url": url, "case": case, "reason": e.code})

    # Write the audit index; UTF-8 is required because allow_unicode=True emits
    # non-ASCII characters verbatim.
    index_path = target_dir / "index.yaml"
    index_path.write_text(
        yaml.safe_dump(summary, allow_unicode=True, sort_keys=False),
        encoding="utf-8",
    )
    log.info("Index gespeichert: %s", index_path)

    log.info("─" * 60)
    log.info("✓ %d erfolgreich, ✗ %d fehlgeschlagen", len(summary["ok"]), len(summary["failed"]))

    if quota_exhausted:
        return 2
    return 1 if summary["failed"] else 0


if __name__ == "__main__":
    # Use the first non-empty line of the module docstring as the CLI
    # description. __doc__ is None when docstrings are stripped (python -OO),
    # in which case the parser is created without a description.
    doc_lines = (__doc__ or "").strip().splitlines()
    parser = argparse.ArgumentParser(description=doc_lines[0] if doc_lines else None)
    parser.add_argument("urls_file", type=Path, help="YAML mit zu überwachenden URLs")
    parser.add_argument("archive_root", type=Path, help="Wohin die PDFs gespeichert werden")
    args = parser.parse_args()
    # Propagate main()'s status code as the process exit code.
    sys.exit(main(args.urls_file, args.archive_root))