- Markenrecht-Monitoring (Python) — täglich URLs capturen
- Webhook-Receiver (Node.js Express) — capture.ready Events archivieren
- WordPress-Plugin (PHP) — Captures aus dem WP-Backend
108 lines
3.5 KiB
Python
108 lines
3.5 KiB
Python
"""
Markenrecht-Monitoring mit hightrusted CAPTURE.

Liest eine Liste von URLs aus einer YAML-Datei, erstellt für jede URL ein
Capture mit qualifiziertem Zeitstempel und legt die PDFs in einem strukturierten
Archiv-Ordner ab. Optional: Versand einer Zusammenfassung per E-Mail.

Use-Case:
    Du betreibst eine Marke und beobachtest, ob Wettbewerber/Trittbrettfahrer
    deine Marke missbrauchen. Sobald sie es bemerken und löschen, hast du
    bereits ein gerichtsverwertbares Capture.

Voraussetzung:
    pip install hightrusted-capture pyyaml

Lauf:
    export HIGHTRUSTED_API_KEY=ht_live_...
    python monitor.py urls.yaml ./archiv
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import logging
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
from hightrusted_capture import (
|
|
Client,
|
|
HightrustedError,
|
|
QuotaExceededError,
|
|
UnreachableUrlError,
|
|
)
|
|
|
|
# Named logger used by this script for all progress and error messages.
log = logging.getLogger("markenrecht-monitor")

# Root logging configuration: INFO and above, each record prefixed with a
# second-resolution timestamp and the level name left-padded to 7 characters.
logging.basicConfig(
    format="%(asctime)s %(levelname)-7s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)
|
|
|
|
|
|
def main(urls_file: Path, archive_root: Path) -> int:
    """Capture every URL listed in *urls_file* and archive the resulting PDFs.

    For each entry a capture is requested through the API client, its PDF is
    downloaded into ``archive_root/<UTC date>/`` and the outcome is recorded.
    An ``index.yaml`` summarising successes and failures is written to the
    same folder. The index is also written when the run is cut short by a
    ``QuotaExceededError``, so captures already taken in that run stay listed.

    Args:
        urls_file: YAML file (read as UTF-8) with a top-level ``urls`` list.
            Each item needs a ``url`` key and may carry a ``case`` label,
            which defaults to ``"default"``.
        archive_root: Directory under which the dated archive folder is
            created if it does not exist yet.

    Returns:
        ``0`` if every URL was captured, ``1`` if at least one capture
        failed, ``2`` if the run was aborted because the quota was exceeded.
    """
    # Explicit UTF-8: the locale default may be a legacy code page.
    config = yaml.safe_load(urls_file.read_text(encoding="utf-8"))
    urls: list[dict] = config["urls"]

    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    target_dir = archive_root / today
    target_dir.mkdir(parents=True, exist_ok=True)

    client = Client()  # API key is taken from the environment
    summary: dict[str, list[dict]] = {"ok": [], "failed": []}
    quota_exhausted = False

    for entry in urls:
        url = entry["url"]
        case = entry.get("case", "default")
        log.info("→ %s [%s]", url, case)

        try:
            capture = client.capture(
                url=url,
                reference=f"monitor:{case}:{today}",
                viewport={"width": 1920, "height": 1080},
            )
            capture_id = capture["id"]
            pdf_path = target_dir / f"{case}_{capture_id[:8]}.pdf"
            client.download_pdf(capture_id, pdf_path)

            summary["ok"].append({
                "url": url,
                "case": case,
                "capture_id": capture_id,
                "verify_url": capture["verify_url"],
                "pdf": str(pdf_path),
                "timestamp": capture["timestamp"]["issued_at"],
            })
            log.info(" ✓ %s", pdf_path.name)

        except UnreachableUrlError as e:
            log.warning(" ✗ Quelle nicht erreichbar: %s", e.message)
            summary["failed"].append({"url": url, "case": case, "reason": "unreachable"})

        except QuotaExceededError:
            log.error("Monats-Quota erschöpft. Abbruch.")
            summary["failed"].append({"url": url, "case": case, "reason": "quota_exceeded"})
            # Stop processing, but fall through to the index write below so
            # the captures already taken in this run are not lost.
            quota_exhausted = True
            break

        except HightrustedError as e:
            log.error(" ✗ %s (request_id=%s)", e.message, e.request_id)
            summary["failed"].append({"url": url, "case": case, "reason": e.code})

    # Write the audit index. UTF-8 is required because allow_unicode=True
    # emits non-ASCII characters verbatim.
    index_path = target_dir / "index.yaml"
    index_path.write_text(
        yaml.safe_dump(summary, allow_unicode=True, sort_keys=False),
        encoding="utf-8",
    )
    log.info("Index gespeichert: %s", index_path)

    log.info("─" * 60)
    log.info("✓ %d erfolgreich, ✗ %d fehlgeschlagen", len(summary["ok"]), len(summary["failed"]))

    if quota_exhausted:
        return 2
    return 0 if not summary["failed"] else 1
|
|
|
|
|
|
if __name__ == "__main__":
    # Use the first non-blank line of the module docstring as the CLI
    # description. __doc__ is None when docstrings are stripped (python -OO),
    # so fall back to no description instead of failing on None.split().
    description = next(
        (line.strip() for line in (__doc__ or "").splitlines() if line.strip()),
        None,
    )
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("urls_file", type=Path, help="YAML mit zu überwachenden URLs")
    parser.add_argument("archive_root", type=Path, help="Wohin die PDFs gespeichert werden")
    args = parser.parse_args()
    # main() returns the process exit status (0, 1 or 2).
    sys.exit(main(args.urls_file, args.archive_root))
|