Files
sephiria_inv_program/sephiria_inv/recognizer.py
Claude ee82b161eb Recognizer v0.3.6: HDR-friendly empty + lower threshold + debug dump
User reports all 34 cells classified as 미인식 with score 0.00 even
when the grid was correctly cropped. Multiple compounding issues:

1. _is_empty required mean<60 (dark) AND std<14. HDR/bright captures
   produce pinkish empty slots with mean ~150-180, so even empty cells
   fell through to template matching. Drop the mean check; uniformity
   alone (std<18 grayscale, std<22 per-channel) is the real signal.

2. Score 0.00 across the board strongly suggests templates list was
   empty (only path that returns exactly 0.0). Track per-bucket load
   counts (slabs_ok/fail, artifacts_ok/fail) and surface them in the
   GUI status bar so a CDN failure is immediately visible. Currently
   no signal at all on download failure.

3. min_score 0.55 was tuned against simulator-clean renders. Real game
   captures have decorative cell borders, stack-count badges in
   corners, HDR shader effects. Lower to 0.35 and inset cell crops by
   16% on each side before matching to skip the decorative frame.

4. Add 디버그 저장 button + dump_debug() that saves screenshot.png,
   bbox_crop.png, cells/<row>-<col>.png, and report.txt with top-3
   matches per cell to %LOCALAPPDATA%/sephiria_inv/debug/<timestamp>/.
   Lets us iterate on tuning from real captures without round-tripping
   raw screenshots through chat each time.
2026-05-16 03:18:29 +09:00

319 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Cell-level recognition over the inventory grid.
Pipeline given a cropped inventory image:
1. Slice into 6-col rows per generate_grid_config().
2. Per cell, classify: empty / slab / artifact / unknown.
- "empty" = low std-dev, i.e. uniform pixels at any brightness
- "slab" = best NCC match across all slabs × 4 rotations
- "artifact"= best NCC match across all artifacts (no rotation)
- "unknown" = nothing matched above the confidence floor →
likely a merged "?" slab box, surfaced to the user.
NCC (normalized cross-correlation) is used instead of MAE because it's
invariant to brightness/contrast shifts — the in-game render has subtle
shader effects (bloom, vignette) that MAE penalizes harshly.
Templates are fetched via renderer.fetch_slab_image / fetch_artifact_image
on first call and cached on disk.
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
import numpy as np
from PIL import Image
from .artifacts import ARTIFACTS
from .renderer import fetch_slab_image, fetch_artifact_image
from .slabs import GRID_COLS, SLABS, SLABS_BY_VALUE, generate_grid_config
# ---------- types ----------
@dataclass
class CellResult:
    """Classification outcome for one inventory grid cell."""

    # Cell identifier formatted as "<row>-<col>".
    slot_id: str
    # Row index of the cell within the grid.
    row: int
    # Column index of the cell within its row.
    col: int
    # One of "empty", "slab", "artifact" or "unknown".
    kind: str
    # Matched slab/artifact value; None when nothing was matched.
    value: Optional[str]
    # Slab rotation step (0/1/2/3); always 0 for non-slab results.
    rotation: int
    # NCC similarity in [-1, 1]; higher means a better match.
    score: float
# ---------- template prep ----------
# Side length (in pixels) of the square that images are resized to before
# being turned into feature vectors: small enough to be fast, big enough to
# discriminate between icons.
_TEMPLATE_SIZE = 64
def _on_dark(img: Image.Image) -> Image.Image:
    """Flatten a template icon to RGB, filling transparency with a dark slot color.

    RGBA images are alpha-composited over an opaque (38, 22, 42) background;
    images in any other mode are simply converted to RGB.
    """
    if img.mode == "RGBA":
        backdrop = Image.new("RGBA", img.size, (38, 22, 42, 255))
        backdrop.alpha_composite(img)
        return backdrop.convert("RGB")
    return img.convert("RGB")
def _to_feat(img: Image.Image) -> np.ndarray:
    """Turn an image into a flat, zero-mean, unit-length float32 feature vector.

    The image is converted to grayscale, resized to
    _TEMPLATE_SIZE x _TEMPLATE_SIZE with bilinear filtering and flattened.
    The dot product of two such vectors is their normalized
    cross-correlation.
    """
    target = (_TEMPLATE_SIZE, _TEMPLATE_SIZE)
    gray = img.convert("L").resize(target, Image.BILINEAR)
    vec = np.asarray(gray, dtype=np.float32).reshape(-1)
    centered = vec - vec.mean()
    magnitude = np.linalg.norm(centered)
    if magnitude < 1e-6:
        # (Near-)uniform image: the centered vector is effectively all zeros,
        # so return it as-is instead of dividing by ~0.
        return centered
    return centered / magnitude
@dataclass
class _Template:
    """One reference icon prepared for matching against cell crops."""

    # Either "slab" or "artifact".
    kind: str
    # Value identifying the slab/artifact this template was built from.
    value: str
    # Rotation step the template was rendered at (only meaningful for slabs).
    rotation: int
    # Feature vector for the icon, as produced by _to_feat.
    feat: np.ndarray
# Module-level template cache shared by every recognition call.
_TEMPLATE_CACHE: List[_Template] = []
_CACHE_BUILT = False
# Whether the cached list was built with include_artifacts=True. Tracked so a
# later call with a different flag is never served the wrong template set.
_CACHE_HAS_ARTIFACTS = False
_LAST_LOAD_STATS: Dict[str, int] = {"slabs_ok": 0, "slabs_fail": 0,
                                    "artifacts_ok": 0, "artifacts_fail": 0}


def _build_templates(*, include_artifacts: bool = True) -> List[_Template]:
    """Build (and cache) the template list. Lazy because download is slow.

    Args:
        include_artifacts: When True, artifact templates are included in
            addition to slab templates.

    Returns:
        The list of templates to match against. When the cache already holds
        a list built with the same ``include_artifacts`` value, the cached
        list object itself is returned. When the cache holds artifacts but
        the caller does not want them, a filtered copy without artifacts is
        returned. When the cache lacks artifacts but the caller wants them,
        the list is rebuilt.

    An empty cache is never treated as built, so a load in which every fetch
    failed is retried on the next call. Per-bucket success/failure counts of
    the most recent build are stored in ``_LAST_LOAD_STATS``.
    """
    global _CACHE_BUILT, _CACHE_HAS_ARTIFACTS
    if _CACHE_BUILT and _TEMPLATE_CACHE:
        if include_artifacts == _CACHE_HAS_ARTIFACTS:
            return _TEMPLATE_CACHE
        if not include_artifacts:
            # The cache is a superset of what was asked for: filter instead
            # of re-fetching.
            return [t for t in _TEMPLATE_CACHE if t.kind != "artifact"]
        # The cache was built without artifacts but they are wanted now:
        # fall through and rebuild.

    out: List[_Template] = []
    s_ok = s_fail = a_ok = a_fail = 0

    # Slabs: 4 rotations for rotatable ones, a single orientation otherwise.
    for s in SLABS:
        img = fetch_slab_image(s.image)
        if img is None:
            # The fetcher returned no image; count it and move on.
            s_fail += 1
            continue
        s_ok += 1
        base = _on_dark(img)
        rotations = (0, 1, 2, 3) if s.rotate else (0,)
        for r in rotations:
            # Pillow rotates counter-clockwise for positive angles, so the
            # negative angle yields r clockwise quarter turns.
            rotated = base if r == 0 else base.rotate(-90 * r, expand=False)
            out.append(_Template("slab", s.value, r, _to_feat(rotated)))

    if include_artifacts:
        # Artifacts are matched in a single orientation only.
        for a in ARTIFACTS:
            img = fetch_artifact_image(a.image)
            if img is None:
                a_fail += 1
                continue
            a_ok += 1
            base = _on_dark(img)
            out.append(_Template("artifact", a.value, 0, _to_feat(base)))

    _LAST_LOAD_STATS.update({"slabs_ok": s_ok, "slabs_fail": s_fail,
                             "artifacts_ok": a_ok, "artifacts_fail": a_fail})
    # Mutate in place so existing references to the cache list stay valid.
    _TEMPLATE_CACHE.clear()
    _TEMPLATE_CACHE.extend(out)
    _CACHE_BUILT = True
    _CACHE_HAS_ARTIFACTS = include_artifacts
    return _TEMPLATE_CACHE
def warm_templates(*, include_artifacts: bool = True) -> int:
    """Eagerly build the template list and report how many templates it holds.

    Intended to be called once (e.g. from the GUI) before recognition so the
    slow icon download does not stall the first recognition pass.
    """
    templates = _build_templates(include_artifacts=include_artifacts)
    return len(templates)
def load_stats() -> Dict[str, int]:
    """Return a copy of the counters recorded by the most recent template build.

    Keys: slabs_ok, slabs_fail, artifacts_ok, artifacts_fail. The copy keeps
    callers from mutating the module-level counters.
    """
    return _LAST_LOAD_STATS.copy()
# ---------- cell classification ----------
def _is_empty(cell: Image.Image) -> bool:
    """Decide whether a cell crop is an empty slot, based on color uniformity.

    No brightness check is made: a slot counts as empty when its pixels are
    nearly uniform, whatever the hue, so bright / HDR captures with pinkish
    slot backgrounds are still detected. Both the grayscale standard
    deviation (< 18) and the mean per-channel RGB standard deviation (< 22)
    must be low.
    """
    gray_std = np.asarray(cell.convert("L"), dtype=np.float32).std()
    pixels = np.asarray(cell.convert("RGB"), dtype=np.float32).reshape(-1, 3)
    mean_channel_std = float(pixels.std(axis=0).mean())
    if not gray_std < 18.0:
        return False
    return bool(mean_channel_std < 22.0)
def _inset(cell: Image.Image, ratio: float = 0.16) -> Image.Image:
    """Crop a margin of ``ratio`` off every side of a cell before matching.

    The in-game slot has chunky frame ornaments and a stack-count badge in a
    corner, while the templates are clean icons. Trimming ~16% per side keeps
    the comparable inner art and removes the badge area in most cases.
    """
    width, height = cell.size
    margin_x, margin_y = int(width * ratio), int(height * ratio)
    box = (margin_x, margin_y, width - margin_x, height - margin_y)
    return cell.crop(box)
def _classify(
    cell: Image.Image,
    templates: List[_Template],
    *,
    min_score: float = 0.35,
) -> Tuple[str, Optional[str], int, float]:
    """Classify one cell crop and return ``(kind, value, rotation, score)``.

    Empty cells short-circuit to ``("empty", None, 0, 1.0)``. Otherwise the
    inset cell is compared against every template; the best match wins if
    its score reaches ``min_score``, else the cell is reported as "unknown"
    together with that best score. An empty template list yields
    ``("unknown", None, 0, 0.0)``.
    """
    if _is_empty(cell):
        return "empty", None, 0, 1.0

    query = _to_feat(_inset(cell))
    if not templates:
        return "unknown", None, 0, 0.0

    # One (N, D) matrix so a single mat-vec product scores every template.
    # The products are NCC values because both sides are mean-subtracted
    # and unit-normalized.
    bank = np.stack([tpl.feat for tpl in templates], axis=0)
    similarity = bank @ query
    winner = int(np.argmax(similarity))
    top_score = float(similarity[winner])

    if top_score < min_score:
        return "unknown", None, 0, top_score
    match = templates[winner]
    return match.kind, match.value, match.rotation, top_score
def _classify_with_top(
    cell: Image.Image,
    templates: List[_Template],
    *,
    top_k: int = 3,
) -> Tuple[str, Optional[str], int, float, List[Tuple[str, str, int, float]]]:
    """Classify a cell like ``_classify`` and also list its best candidates.

    Returns ``(kind, value, rotation, score, top)`` where ``top`` holds up to
    ``top_k`` ``(kind, value, rotation, score)`` tuples in descending score
    order. ``top`` is empty for empty cells and when there are no templates.
    The verdict itself comes from ``_classify`` with its default threshold.
    """
    if _is_empty(cell):
        return "empty", None, 0, 1.0, []
    if not templates:
        return "unknown", None, 0, 0.0, []

    query = _to_feat(_inset(cell))
    similarity = np.stack([tpl.feat for tpl in templates], axis=0) @ query
    # Negate so argsort's ascending order ranks the highest score first.
    ranked = np.argsort(-similarity)[:top_k]

    leaders = []
    for i in ranked:
        tpl = templates[i]
        leaders.append((tpl.kind, tpl.value, tpl.rotation, float(similarity[i])))

    verdict = _classify(cell, templates)
    return (*verdict, leaders)
# ---------- public API ----------
def recognize_image(
    img: Image.Image,
    bbox: Tuple[int, int, int, int],
    *,
    slot_num: int = 34,
    include_artifacts: bool = True,
    min_score: float = 0.35,
) -> List[CellResult]:
    """Cut ``img[bbox]`` into grid cells and classify each of them.

    Args:
        img: Source screenshot.
        bbox: ``(left, top, right, bottom)`` in source-image pixel coords.
        slot_num: Slot count passed to ``generate_grid_config``.
        include_artifacts: Whether artifact templates take part in matching.
        min_score: Minimum NCC score for a template match to be accepted.

    Returns:
        One ``CellResult`` per cell, in grid order; an empty list when the
        grid configuration is empty.
    """
    left, top, right, bottom = bbox
    region = img.crop((left, top, right, bottom)).convert("RGB")

    layout = generate_grid_config(slot_num)
    if not layout:
        return []

    # Every cell has the same integer size: GRID_COLS columns across the
    # bbox, one row per grid entry.
    cell_w = (right - left) // GRID_COLS
    cell_h = (bottom - top) // len(layout)
    templates = _build_templates(include_artifacts=include_artifacts)

    results: List[CellResult] = []
    for entry in layout:
        # entry["rows"] is used as the row index and entry["cols"] as the
        # number of cells in that row.
        r = entry["rows"]
        y0 = r * cell_h
        for c in range(entry["cols"]):
            x0 = c * cell_w
            tile = region.crop((x0, y0, x0 + cell_w, y0 + cell_h))
            kind, value, rot, score = _classify(tile, templates, min_score=min_score)
            results.append(
                CellResult(
                    slot_id=f"{r}-{c}",
                    row=r,
                    col=c,
                    kind=kind,
                    value=value,
                    rotation=rot,
                    score=score,
                )
            )
    return results
def dump_debug(
    img: Image.Image,
    bbox: Tuple[int, int, int, int],
    out_dir: str,
    *,
    slot_num: int = 34,
    include_artifacts: bool = True,
) -> str:
    """Dump everything needed to tune the recognizer from a real capture.

    Writes into ``out_dir`` (created if missing):
      - ``screenshot.png``: the full input image
      - ``bbox_crop.png``: the bbox crop converted to RGB
      - ``cells/<row>-<col>.png``: every cell crop
      - ``report.txt``: header info plus the verdict and top matches per cell

    Args:
        img: Source screenshot.
        bbox: ``(left, top, right, bottom)`` in source-image pixel coords.
        out_dir: Destination directory.
        slot_num: Slot count passed to ``generate_grid_config``.
        include_artifacts: Whether artifact templates take part in matching.

    Returns:
        The path to ``report.txt``. A report is always written, including
        when the grid configuration is empty, so the return value is always
        a report path.
    """
    import os

    os.makedirs(out_dir, exist_ok=True)
    img.save(os.path.join(out_dir, "screenshot.png"))
    L, T, R, B = bbox
    crop = img.crop((L, T, R, B)).convert("RGB")
    crop.save(os.path.join(out_dir, "bbox_crop.png"))

    report = os.path.join(out_dir, "report.txt")
    grid = generate_grid_config(slot_num)
    if not grid:
        # No cells to slice; still leave a report behind so the caller gets
        # the path it was promised.
        lines = [
            f"bbox: {bbox}",
            f"grid: empty (slot_num={slot_num}); no cells to classify",
        ]
    else:
        rows = len(grid)
        cell_w = (R - L) // GRID_COLS
        cell_h = (B - T) // rows
        templates = _build_templates(include_artifacts=include_artifacts)
        stats = load_stats()
        lines = [
            f"bbox: {bbox}",
            f"grid: {len(grid)} rows x {GRID_COLS} cols, slot_num={slot_num}",
            f"cell px: {cell_w} x {cell_h}",
            f"templates loaded: total={len(templates)} stats={stats}",
            "",
        ]
        cells_dir = os.path.join(out_dir, "cells")
        os.makedirs(cells_dir, exist_ok=True)
        for row in grid:
            # row["rows"] is used as the row index and row["cols"] as the
            # number of cells in that row.
            y = row["rows"]
            for x in range(row["cols"]):
                cx0 = x * cell_w
                cy0 = y * cell_h
                cell = crop.crop((cx0, cy0, cx0 + cell_w, cy0 + cell_h))
                cell.save(os.path.join(cells_dir, f"{y}-{x}.png"))
                kind, value, rot, score, top = _classify_with_top(cell, templates)
                top_s = ", ".join(f"{k}:{v}@r{r}={s:.3f}" for k, v, r, s in top)
                lines.append(
                    f" {y}-{x}: kind={kind} value={value} rot={rot} score={score:.3f} | top: {top_s}"
                )

    with open(report, "w", encoding="utf-8") as fh:
        fh.write("\n".join(lines))
    return report
def recognize_file(
    path: str,
    bbox: Tuple[int, int, int, int],
    *,
    slot_num: int = 34,
    include_artifacts: bool = True,
    min_score: float = 0.35,
) -> List[CellResult]:
    """Open the image at ``path`` and run ``recognize_image`` on it.

    Args:
        path: Path of the screenshot file to open.
        bbox: ``(left, top, right, bottom)`` in source-image pixel coords.
        slot_num: Slot count forwarded to ``recognize_image``.
        include_artifacts: Whether artifact templates take part in matching.
        min_score: Minimum NCC score for a match to be accepted. The default
            matches ``recognize_image`` so both entry points classify
            identically when no threshold is given.

    Returns:
        The list of ``CellResult`` produced by ``recognize_image``.
    """
    # The context manager closes the underlying file once recognition is
    # done instead of leaving the handle open until garbage collection.
    with Image.open(path) as img:
        return recognize_image(
            img, bbox,
            slot_num=slot_num,
            include_artifacts=include_artifacts,
            min_score=min_score,
        )
def slab_values_from(results: List[CellResult]) -> List[str]:
    """Collect the values of slab results, in order.

    Artifact, empty and unknown results are skipped, as are slab results
    whose value is missing or empty.
    """
    values: List[str] = []
    for res in results:
        if res.kind == "slab" and res.value:
            values.append(res.value)
    return values