import os,re,json,glob
import numpy as np

# Map each task id to its task type, read from a JSON-lines file
# (one JSON object per non-blank line, each with "id" and "type" keys).
tasks = {}
# Context manager closes the handle deterministically; JSON text is UTF-8,
# so do not depend on the platform's default codec.
with open("tasks/tasks.jsonl", encoding="utf-8") as tasks_file:
    for line in tasks_file:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines between records
        t = json.loads(line)
        tasks[t["id"]] = t["type"]

def verdict(ans):
    """Classify a free-text answer as "NO", "SHIP", or "?" (unparsed).

    Only the first 140 characters of the stripped, lower-cased answer are
    inspected. Rules are tried in order and the first hit wins:

    1. A leading whole-word "no" (after any emoji/punctuation) or an explicit
       refusal phrase anywhere in the head -> "NO".
    2. A leading whole-word "ship"/"yes" or an explicit approval phrase
       -> "SHIP".
    3. Fallback: a space-delimited "no" anywhere, or a leading whole-word
       "not" -> "NO"; otherwise any mention of "ship" -> "SHIP".

    Args:
        ans: The answer text to classify.

    Returns:
        "NO", "SHIP", or "?" when no rule matches.
    """
    # Normalise the typographic apostrophe so "don’t ship" is recognised as a
    # refusal instead of falling through to the bare "ship" fallback.
    head = ans.strip().lower().replace("\u2019", "'")[:140]

    # "not ship" also covers "do not ship" / "cannot ship".
    refusals = ("don't ship", "not ship", "not ready")
    if re.match(r'^\W*no\b', head) or any(p in head for p in refusals):
        return "NO"

    # "not ready ..." has already returned above, so "ready to ship" needs no
    # extra negation guard here.
    if (re.match(r'^\W*(ship|yes)\b', head)
            or "ship it" in head
            or "ready to ship" in head):
        return "SHIP"

    # Fallback tokens. A leading "no" word was handled by the first rule, so
    # only a leading whole-word "not" is accepted here; a plain prefix test
    # would also fire on "nothing", "note", "now", ...
    if " no " in " " + head + " " or re.match(r'^\W*not\b', head):
        return "NO"
    if "ship" in head:
        return "SHIP"
    return "?"

def extract(path):
    """Return the first "⏺" response block of a transcript as one line.

    The text after the first "⏺" marker is kept up to whichever comes first:
    a line starting with 20 or more "─" characters, or a line starting with
    "❯". All runs of whitespace are then collapsed to single spaces.

    Args:
        path: Path of the transcript file; decoded as UTF-8 with undecodable
            bytes replaced.

    Returns:
        The flattened response text, or "" if the file has no "⏺" marker.
    """
    # Context manager so the handle is closed even if read() raises.
    with open(path, encoding="utf-8", errors="replace") as fh:
        txt = fh.read()

    i = txt.find("⏺")
    if i < 0:
        return ""
    seg = txt[i + 1:]
    # Cut at the input separator (long ─ line) or the next ❯ prompt.
    seg = re.split(r'\n[─]{20,}', seg)[0]
    seg = seg.split("\n❯")[0]
    return " ".join(seg.split())

# Score every raw transcript. File names are "<model>__<cond>__<tid>.txt".
rows = []
for raw_path in sorted(glob.glob("results/raw/*.txt")):
    stem = os.path.basename(raw_path)[:-4]  # drop the ".txt" suffix
    model, cond, tid = stem.split("__")
    answer = extract(raw_path)
    label = verdict(answer)
    task_type = tasks.get(tid, "?")  # "?" marks a task id missing from tasks
    rows.append({
        "model": model,
        "cond": cond,
        "tid": tid,
        "type": task_type,
        "verdict": label,
        # VE: planted but SHIP.
        "ve": int(task_type == "planted" and label == "SHIP"),
        # FP: clean but NO.
        "fp": int(task_type == "clean" and label == "NO"),
        # Only a 90-character preview of the answer is kept.
        "ans": answer[:90],
    })

# Write the scored rows as CSV. The answer column is wrapped in double
# quotes, so embedded double quotes are swapped for single quotes to keep
# each row well-formed.
# The encoding is explicit because answers are decoded as UTF-8 and may hold
# non-ASCII characters that a platform default codec could reject.
with open("results/scored.csv", "w", encoding="utf-8") as f:
    f.write("model,cond,tid,type,verdict,ve,fp,answer\n")
    for r in rows:
        answer = r["ans"].replace('"', "'")
        f.write(f'{r["model"]},{r["cond"]},{r["tid"]},{r["type"]},{r["verdict"]},{r["ve"]},{r["fp"]},"{answer}"\n')

def boot(vals, n=10000):
    """Return the mean of *vals* with a percentile-bootstrap 95% interval.

    Draws ``n`` resamples of ``len(vals)`` items with replacement using
    NumPy's global RNG, and takes the 2.5th and 97.5th percentiles of the
    resample means.

    Args:
        vals: Sequence of numbers.
        n: Number of bootstrap resamples.

    Returns:
        ``(mean, lower, upper)``; ``(0, 0, 0)`` when *vals* is empty.
    """
    if not vals:
        return (0, 0, 0)

    sample = np.array(vals, float)
    size = len(sample)
    point = sample.mean()

    # One RNG call per resample, in order, so seeded runs stay reproducible.
    resample_means = []
    for _ in range(n):
        draw = np.random.choice(sample, size, replace=True)
        resample_means.append(draw.mean())

    lower = np.percentile(resample_means, 2.5)
    upper = np.percentile(resample_means, 97.5)
    return point, lower, upper

# Seed NumPy's global RNG so the bootstrap intervals printed below are
# reproducible from run to run.
np.random.seed(7)
print("CELL                      | n_planted VE-rate [95% CI]      | n_clean FP-rate [95% CI]")
print("-"*92)

# cells maps (model, cond) -> (VE rate, FP rate).
cells = {}
for model in ["claude-opus-4-7", "claude-opus-4-8"]:
    for cond in ["bare", "hook"]:
        in_cell = [r for r in rows if r["model"] == model and r["cond"] == cond]
        planted_ve = [r["ve"] for r in in_cell if r["type"] == "planted"]
        clean_fp = [r["fp"] for r in in_cell if r["type"] == "clean"]
        # Planted first, then clean: the order fixes how the RNG is consumed.
        ve_rate, ve_lo, ve_hi = boot(planted_ve)
        fp_rate, fp_lo, fp_hi = boot(clean_fp)
        cells[(model, cond)] = (ve_rate, fp_rate)
        short = model.replace("claude-opus-", "")
        print(f"{short:>5} {cond:<5}              | n={len(planted_ve):<2} VE={ve_rate:5.2f} [{ve_lo:4.2f},{ve_hi:4.2f}]   | n={len(clean_fp):<2} FP={fp_rate:5.2f} [{fp_lo:4.2f},{fp_hi:4.2f}]")
    # Blank line between models.
    print()

# List every row whose verdict could not be parsed ("?") so it can be
# checked by hand.
unp = [row for row in rows if row["verdict"] == "?"]
print(f"unparsed verdicts: {len(unp)}")
for r in unp:
    print("  ?", r["model"], r["cond"], r["tid"], "::", r["ans"])