i did a TON of things

2026-02-24 01:40:38 -06:00
parent 2479e86dfe
commit 4089d82c41
5 changed files with 6858 additions and 4344 deletions

@@ -1,15 +1,30 @@
 import json
+import os
 from yt_dlp import YoutubeDL
 
 url = "https://www.youtube.com/watch?v=lW7vWfRI5oI"
+original_path = "docs/comments_ORIGINAL.json"  # the original, authentic comments from before cutoff (in case they get edited/deleted one day)
 output_path = "docs/comments.json"
+CUTOFF = 1767247567  # Jan 1 2026 UTC at SIX SEVEN
 
-with YoutubeDL({"skip_download": True, "getcomments": True}) as derexXD:
-    info = derexXD.extract_info(url, download=False)
+# Load the frozen original comments (pre-cutoff, never modified)
+original = json.load(open(original_path, encoding="utf-8")) if os.path.exists(original_path) else []
+# ^^ this is a LONG ternary btw
 
-comments = sorted(info.get("comments", []), key=lambda x: x.get("timestamp", 0))
+with YoutubeDL({"skip_download": True, "getcomments": True}) as ydl:
+    scraped = ydl.extract_info(url, download=False).get("comments", [])
 
-with open(output_path, "w") as file:
-    json.dump(comments, file, indent=2)
+new_by_id = {}
+for c in scraped:
+    if c.get("timestamp", 0) >= CUTOFF and c.get("id"):
+        new_by_id[c["id"]] = c
 
-print(f"Done\nExtracted {len(comments)} comments from el video (sorted oldest to newest)")
+# we'll grab comments that are after the cutoff, those are what will be appended
+merged = original + sorted(new_by_id.values(), key=lambda x: x.get("timestamp", 0))
+with open(output_path, "w", encoding="utf-8") as f:
+    json.dump(merged, f, indent=2, ensure_ascii=False)
+print(f"Done\n{len(original)} original comments + {len(new_by_id)} new comments = {len(merged)} total")