added a script to check for dupe comments

scripts/check_dupes.py (new file, 14 lines)
@@ -0,0 +1,14 @@
import json

comments = json.load(open("docs/comments.json", encoding="utf-8"))
seen, dupes = {}, 0

for i, c in enumerate(comments):
    k = (c.get("id", ""), c.get("text", ""), c.get("timestamp", 0))  # dedupe key: id + text + timestamp
    if k in seen:
        dupes += 1
        print(f" Index {i} dupes {seen[k]} | id={k[0]} | ts={k[2]} | text={k[1][:80]}")
    else:
        seen[k] = i

print(f"Found {dupes} duplicate(s)" if dupes else "No duplicates among us.")
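The dedupe key is the full (id, text, timestamp) triple, so two comments are only flagged when all three fields match. A minimal sketch of that behavior on hypothetical toy data (not real entries from docs/comments.json):

    comments = [
        {"id": "abc", "text": "six seven", "timestamp": 1767247567},
        {"id": "abc", "text": "six seven", "timestamp": 1767247567},  # exact repeat -> flagged
        {"id": "abc", "text": "six seven", "timestamp": 1767247568},  # timestamp differs -> kept
    ]
    seen = {}
    for i, c in enumerate(comments):
        k = (c.get("id", ""), c.get("text", ""), c.get("timestamp", 0))
        if k in seen:
            print(f"index {i} dupes index {seen[k]}")  # prints: index 1 dupes index 0
        else:
            seen[k] = i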

scripts/check_dupes.rs (new file, 34 lines)
@@ -0,0 +1,34 @@
use std::{collections::HashMap, fs};
use serde_json::Value;

fn main() {
    let comments: Vec<Value> =
        serde_json::from_str(&fs::read_to_string("docs/comments.json").unwrap()).unwrap();
    let mut seen: HashMap<(String, String, i64), usize> = HashMap::new();
    let mut dupes = 0;
    for (i, c) in comments.iter().enumerate() {
        let k = (
            c["id"].as_str().unwrap_or("").into(),
            c["text"].as_str().unwrap_or("").into(),
            c["timestamp"].as_i64().unwrap_or(0),
        );

        if let Some(&first) = seen.get(&k) {
            dupes += 1;
            println!(
                " Index {} dupes index {} | id={} | ts={} | text={:.80}",
                i, first, k.0, k.2, k.1
            );
        } else {
            seen.insert(k, i);
        }
    }
    println!(
        "{}",
        if dupes > 0 {
            format!("Found {} duplicate(s)", dupes)
        } else {
            format!("No duplicates among {} comments.", comments.len())
        }
    );
}

scripts/extract_comments.py (new file, 30 lines)
@@ -0,0 +1,30 @@
import json
import os
from yt_dlp import YoutubeDL

url = "https://www.youtube.com/watch?v=lW7vWfRI5oI"
original_path = "docs/comments_ORIGINAL.json"  # the original, authentic comments from before the cutoff (in case they get edited/deleted one day)
output_path = "docs/comments.json"
CUTOFF = 1767247567  # 2026-01-01 06:06:07 UTC, at SIX SEVEN

# Load the frozen original comments (pre-cutoff, never modified)
original = json.load(open(original_path, encoding="utf-8")) if os.path.exists(original_path) else []
# ^^ this is a LONG ternary btw

with YoutubeDL({"skip_download": True, "getcomments": True}) as ydl:
    scraped = ydl.extract_info(url, download=False).get("comments", [])

# Keep only scraped comments posted after the cutoff; those are what will be appended
new_by_id = {}
for c in scraped:
    if c.get("timestamp", 0) >= CUTOFF and c.get("id"):
        new_by_id[c["id"]] = c

# Append the new comments, sorted by timestamp, after the frozen originals
merged = original + sorted(new_by_id.values(), key=lambda x: x.get("timestamp", 0))

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(merged, f, indent=2, ensure_ascii=False)


print(f"Done\n{len(original)} original comments + {len(new_by_id)} new comments = {len(merged)} total")
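Because the originals are all pre-cutoff and scraped comments are only kept when their timestamp is at or past CUTOFF, the two halves of the merge cannot overlap, and repeated ids within a single scrape collapse into the new_by_id dict. A toy sketch of the merge semantics (hypothetical ids and timestamps):

    CUTOFF = 1767247567
    original = [{"id": "old1", "timestamp": 1700000000}]  # frozen pre-cutoff comment
    scraped = [
        {"id": "old1", "timestamp": 1700000000},  # pre-cutoff -> dropped (already frozen)
        {"id": "new1", "timestamp": 1767250000},
        {"id": "new1", "timestamp": 1767250000},  # repeated id -> collapsed by the dict
    ]
    new_by_id = {c["id"]: c for c in scraped if c.get("timestamp", 0) >= CUTOFF and c.get("id")}
    merged = original + sorted(new_by_id.values(), key=lambda x: x.get("timestamp", 0))
    print([c["id"] for c in merged])  # prints ['old1', 'new1']

Note that docs/comments.json is rebuilt on every run from the frozen originals plus a fresh scrape, which is presumably what check_dupes.py is there to sanity-check.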