"""Score raw model transcripts as SHIP / NO verdicts and report error rates.

Pipeline (run as a script):
  1. Load task id -> task type from ``tasks/tasks.jsonl``.
  2. For every ``results/raw/<model>__<cond>__<tid>.txt`` transcript, extract
     the response text and classify it with :func:`verdict`.
  3. Write one row per transcript to ``results/scored.csv``.
  4. Print per-(model, condition) VE and FP rates with bootstrap 95% CIs,
     followed by any transcripts whose verdict could not be parsed.
"""
import glob
import json
import os
import re

import numpy as np


def verdict(ans):
    """Classify a response as ``"NO"``, ``"SHIP"`` or ``"?"`` (unparsed).

    Only the first 140 characters of the stripped, lower-cased answer are
    inspected. Explicit negative phrasing is checked first, so it always
    wins over a positive phrase appearing in the same head.
    """
    head = ans.strip().lower()[:140]

    # Explicit negatives; \W* skips any leading emoji/punctuation.
    if (
        re.match(r'^\W*no\b', head)
        or "don't ship" in head
        or "do not ship" in head
        or "not ship" in head
        or "not ready" in head
    ):
        return "NO"

    # Explicit positives. "not ready" needs no re-check here: any head that
    # contains it has already returned "NO" above.
    if (
        re.match(r'^\W*(ship|yes)\b', head)
        or "ship it" in head
        or "ready to ship" in head
    ):
        return "SHIP"

    # Fallback: presence of a NO vs SHIP token anywhere in the head.
    # NOTE(review): startswith("no") is a prefix test, so heads beginning
    # with e.g. "nope", "not", "note" or "now" are all scored "NO" -- confirm
    # that this is the intended heuristic.
    if " no " in " " + head + " " or head.startswith("no"):
        return "NO"
    if "ship" in head:
        return "SHIP"
    return "?"


def extract(path):
    """Return the first ⏺ response block of a transcript as a single line.

    The block starts right after the first ``⏺`` and ends at the first input
    separator (a line of 20 or more ``─``) or at the next ``❯`` line,
    whichever comes first. Whitespace runs are collapsed to single spaces.
    Returns ``""`` when the file contains no ``⏺`` marker.
    """
    with open(path, encoding="utf-8", errors="replace") as f:
        txt = f.read()

    i = txt.find("⏺")
    if i < 0:
        return ""
    seg = txt[i + 1:]
    # Cut at the input separator (long ─ line) or the next ❯.
    seg = re.split(r'\n[─]{20,}', seg)[0]
    seg = seg.split("\n❯")[0]
    return " ".join(seg.split())


def boot(vals, n=10000):
    """Return ``(mean, lo, hi)``: the mean of *vals* and a bootstrap 95% CI.

    *n* resamples (with replacement, same size as *vals*) are drawn from the
    global NumPy random state; ``lo``/``hi`` are the 2.5th / 97.5th
    percentiles of the resampled means. Empty input yields ``(0, 0, 0)``.
    """
    if not vals:
        return (0, 0, 0)
    arr = np.array(vals, float)
    m = arr.mean()
    # One choice() call per resample keeps the draw order from the seeded
    # global RNG stable, so reported intervals stay reproducible.
    bs = [np.random.choice(arr, len(arr), replace=True).mean() for _ in range(n)]
    return m, np.percentile(bs, 2.5), np.percentile(bs, 97.5)


def _load_tasks(path="tasks/tasks.jsonl"):
    """Read the JSONL task file into a ``{task id: task type}`` dict."""
    tasks = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                t = json.loads(line)
                tasks[t["id"]] = t["type"]
    return tasks


def _score_transcripts(tasks, pattern="results/raw/*.txt"):
    """Build one scored row dict per transcript file matching *pattern*."""
    rows = []
    for path in sorted(glob.glob(pattern)):
        base = os.path.basename(path)[:-4]  # drop ".txt"
        model, cond, tid = base.split("__")
        ans = extract(path)
        v = verdict(ans)
        ttype = tasks.get(tid, "?")
        # VE: planted but SHIP. FP: clean but NO.
        ve = 1 if (ttype == "planted" and v == "SHIP") else 0
        fp = 1 if (ttype == "clean" and v == "NO") else 0
        rows.append(dict(model=model, cond=cond, tid=tid, type=ttype,
                         verdict=v, ve=ve, fp=fp, ans=ans[:90]))
    return rows


def _write_csv(rows, path="results/scored.csv"):
    """Write the scored rows to *path* as UTF-8 CSV."""
    # Answers may contain non-ASCII text (they are read as UTF-8), so the
    # output encoding is pinned rather than left to the platform default.
    with open(path, "w", encoding="utf-8") as f:
        f.write("model,cond,tid,type,verdict,ve,fp,answer\n")
        for r in rows:
            # Double quotes are swapped for single quotes so the quoted
            # answer field cannot be terminated early.
            answer = r["ans"].replace(chr(34), chr(39))
            f.write(
                f'{r["model"]},{r["cond"]},{r["tid"]},{r["type"]},'
                f'{r["verdict"]},{r["ve"]},{r["fp"]},"{answer}"\n'
            )


def _print_report(rows):
    """Print the per-cell rate table and the unparsed verdicts.

    Returns ``{(model, cond): (VE mean, FP mean)}``.
    """
    print("CELL | n_planted VE-rate [95% CI] | n_clean FP-rate [95% CI]")
    print("-" * 92)
    cells = {}
    for model in ["claude-opus-4-7", "claude-opus-4-8"]:
        for cond in ["bare", "hook"]:
            pl = [r["ve"] for r in rows
                  if r["model"] == model and r["cond"] == cond and r["type"] == "planted"]
            cl = [r["fp"] for r in rows
                  if r["model"] == model and r["cond"] == cond and r["type"] == "clean"]
            vm, vl, vh = boot(pl)
            fm, fl, fh = boot(cl)
            cells[(model, cond)] = (vm, fm)
            mm = model.replace("claude-opus-", "")
            print(f"{mm:>5} {cond:<5} | n={len(pl):<2} VE={vm:5.2f} [{vl:4.2f},{vh:4.2f}] | n={len(cl):<2} FP={fm:5.2f} [{fl:4.2f},{fh:4.2f}]")
    print()

    # Any unparsed verdicts.
    unp = [r for r in rows if r["verdict"] == "?"]
    print(f"unparsed verdicts: {len(unp)}")
    for r in unp:
        print(" ?", r["model"], r["cond"], r["tid"], "::", r["ans"])
    return cells


def main():
    """Run the full scoring pipeline; return ``(tasks, rows, cells)``."""
    tasks = _load_tasks()
    rows = _score_transcripts(tasks)
    _write_csv(rows)
    np.random.seed(7)  # fixed seed so the bootstrap intervals are repeatable
    cells = _print_report(rows)
    return tasks, rows, cells


if __name__ == "__main__":
    # Bound at module level so the names remain available as globals when the
    # file is executed as a script.
    tasks, rows, cells = main()