added a script to check for dupe comments

scripts/check_dupes.py (new file, 14 lines)
@@ -0,0 +1,14 @@
import json

comments = json.load(open("docs/comments.json", encoding="utf-8"))
seen, dupes = {}, 0

for i, c in enumerate(comments):
    k = (c.get("id", ""), c.get("text", ""), c.get("timestamp", 0))  # dedupe key: id + text + timestamp
    if k in seen:
        dupes += 1
        print(f" Index {i} dupes {seen[k]} | id={k[0]} | ts={k[2]} | text={k[1][:80]}")
    else:
        seen[k] = i

print(f"Found {dupes} duplicate(s)" if dupes else "No duplicates among us.")
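The dedupe key is the full (id, text, timestamp) triple, so two comments are only flagged when all three fields match. A minimal sketch of that behavior on hypothetical toy data (not real entries from docs/comments.json):

    comments = [
        {"id": "abc", "text": "six seven", "timestamp": 1767247567},
        {"id": "abc", "text": "six seven", "timestamp": 1767247567},  # exact repeat -> flagged
        {"id": "abc", "text": "six seven", "timestamp": 1767247568},  # timestamp differs -> kept
    ]
    seen = {}
    for i, c in enumerate(comments):
        k = (c.get("id", ""), c.get("text", ""), c.get("timestamp", 0))
        if k in seen:
            print(f"index {i} dupes index {seen[k]}")  # prints: index 1 dupes index 0
        else:
            seen[k] = i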

scripts/check_dupes.rs (new file, 34 lines)
@@ -0,0 +1,34 @@
use std::{collections::HashMap, fs};
use serde_json::Value;

fn main() {
    let comments: Vec<Value> =
        serde_json::from_str(&fs::read_to_string("docs/comments.json").unwrap()).unwrap();
    let mut seen: HashMap<(String, String, i64), usize> = HashMap::new();
    let mut dupes = 0;
    for (i, c) in comments.iter().enumerate() {
        let k = (
            c["id"].as_str().unwrap_or("").into(),
            c["text"].as_str().unwrap_or("").into(),
            c["timestamp"].as_i64().unwrap_or(0),
        );

        if let Some(&first) = seen.get(&k) {
            dupes += 1;
            println!(
                " Index {} dupes index {} | id={} | ts={} | text={:.80}",
                i, first, k.0, k.2, k.1
            );
        } else {
            seen.insert(k, i);
        }
    }
    println!(
        "{}",
        if dupes > 0 {
            format!("Found {} duplicate(s)", dupes)
        } else {
            format!("No duplicates among {} comments.", comments.len())
        }
    );
}

scripts/extract_comments.py (new file, 30 lines)
@@ -0,0 +1,30 @@
import json
import os
from yt_dlp import YoutubeDL

url = "https://www.youtube.com/watch?v=lW7vWfRI5oI"
original_path = "docs/comments_ORIGINAL.json"  # the original, authentic comments from before the cutoff (in case they get edited/deleted one day)
output_path = "docs/comments.json"
CUTOFF = 1767247567  # 2026-01-01 06:06:07 UTC, at SIX SEVEN

# Load the frozen original comments (pre-cutoff, never modified)
original = json.load(open(original_path, encoding="utf-8")) if os.path.exists(original_path) else []
# ^^ this is a LONG ternary btw

with YoutubeDL({"skip_download": True, "getcomments": True}) as ydl:
    scraped = ydl.extract_info(url, download=False).get("comments", [])

# Keep only scraped comments posted after the cutoff; those are what will be appended
new_by_id = {}
for c in scraped:
    if c.get("timestamp", 0) >= CUTOFF and c.get("id"):
        new_by_id[c["id"]] = c

# Append the new comments, sorted by timestamp, after the frozen originals
merged = original + sorted(new_by_id.values(), key=lambda x: x.get("timestamp", 0))

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(merged, f, indent=2, ensure_ascii=False)


print(f"Done\n{len(original)} original comments + {len(new_by_id)} new comments = {len(merged)} total")
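Because the originals are all pre-cutoff and scraped comments are only kept when their timestamp is at or past CUTOFF, the two halves of the merge cannot overlap, and repeated ids within a single scrape collapse into the new_by_id dict. A toy sketch of the merge semantics (hypothetical ids and timestamps):

    CUTOFF = 1767247567
    original = [{"id": "old1", "timestamp": 1700000000}]  # frozen pre-cutoff comment
    scraped = [
        {"id": "old1", "timestamp": 1700000000},  # pre-cutoff -> dropped (already frozen)
        {"id": "new1", "timestamp": 1767250000},
        {"id": "new1", "timestamp": 1767250000},  # repeated id -> collapsed by the dict
    ]
    new_by_id = {c["id"]: c for c in scraped if c.get("timestamp", 0) >= CUTOFF and c.get("id")}
    merged = original + sorted(new_by_id.values(), key=lambda x: x.get("timestamp", 0))
    print([c["id"] for c in merged])  # prints ['old1', 'new1']

Note that docs/comments.json is rebuilt on every run from the frozen originals plus a fresh scrape, which is presumably what check_dupes.py is there to sanity-check.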