#!/usr/bin/env python3 """从桌面「车辆行驶证」图片 OCR 提取字段,写入证照信息导入模板(仅填 A–E 列)。""" import re import calendar import shutil import sys from collections import Counter from datetime import date from pathlib import Path from typing import List, Optional, Tuple import openpyxl from rapidocr_onnxruntime import RapidOCR IMG_DIR = Path("/Users/sylvawong/Desktop/车辆行驶证") TEMPLATE = Path("/Users/sylvawong/Desktop/证照信息导入模板-1776082388558.xlsx") OUT = Path("/Users/sylvawong/Desktop/证照信息导入模板-OCR导入结果.xlsx") TODAY = date.today() PLATE_CHARS = r"京津沪渝冀晋蒙辽吉黑苏浙皖闽赣鲁豫鄂湘粤桂琼川贵云藏陕甘青宁新" PLATE_RE = re.compile( rf"[{PLATE_CHARS}][A-Z0-9·]{{4,12}}(?:挂)?" ) # 标准 VIN 不含 I、O、Q;OCR 常误识,放宽为 17 位大写字母数字 VIN_RE = re.compile(r"[A-Z0-9]{17}") VIN_BODY = set("ABCDEFGHJKLMNPRSTUVWXYZ0123456789") def normalize_stem(stem: str) -> str: s = re.sub(r"\s+\d+$", "", stem).strip() s = re.sub(r"行驶证年审.*$", "", s).strip() return s def end_of_month(y: int, m: int) -> date: return date(y, m, calendar.monthrange(y, m)[1]) def parse_inspection_dates(text: str) -> List[date]: out: List[date] = [] pat1 = re.compile( r"检验有效期[至到]\s*(\d{4})\s*年\s*(\d{1,2})\s*月(?:\s*(\d{1,2})\s*日)?" ) for m in pat1.finditer(text): y, mo = int(m.group(1)), int(m.group(2)) if not (2000 <= y <= 2040 and 1 <= mo <= 12): continue d = int(m.group(3)) if m.group(3) else None if d and 1 <= d <= 31: try: out.append(date(y, mo, d)) except ValueError: out.append(end_of_month(y, mo)) else: out.append(end_of_month(y, mo)) pat2 = re.compile(r"检验有效期[至到]\s*(\d{4})\s*年(?!\s*\d{1,2}\s*月)") for m in pat2.finditer(text): y = int(m.group(1)) if 2000 <= y <= 2040: out.append(end_of_month(y, 12)) return out def extract_plate_ocr(text: str, filename_plate: Optional[str]) -> Optional[str]: hits = PLATE_RE.findall(text) # 去掉明显非号牌片段 bad = {"中华人民共和国", "中国人民共和国"} hits = [h for h in hits if h not in bad and len(h) <= 12] if not hits: return filename_plate cnt = Counter(hits) if filename_plate and filename_plate in cnt: return filename_plate return cnt.most_common(1)[0][0] def _vin_from_l_window(window: str) -> Optional[str]: """OCR 常把代号与型号粘在一起,用 L 起算滑动取 17 位合法车架字符。""" for i, ch in enumerate(window): if ch != "L": continue chunk = window[i : i + 17] if len(chunk) < 17: continue if not all(c in VIN_BODY for c in chunk): continue if re.search(r"\d", chunk) and re.search(r"[A-Z]", chunk): return chunk return None def extract_vin(text: str) -> Optional[str]: compact = re.sub(r"[\s\r\n ]+", "", text) idx = compact.find("车辆识别代号") if idx < 0: idx = compact.find("VIN") if idx >= 0: window = compact[idx : idx + 260] v = _vin_from_l_window(window) if v: return v for m in VIN_RE.finditer(window): s = m.group(0) if re.search(r"\d", s) and re.search(r"[A-Z]", s) and s[0] == "L": return s for m in VIN_RE.finditer(window): s = m.group(0) if re.search(r"\d", s) and re.search(r"[A-Z]", s): return s v = _vin_from_l_window(compact) if v: return v candidates = [] for m in VIN_RE.finditer(compact): s = m.group(0) if not (re.search(r"\d", s) and re.search(r"[A-Z]", s)): continue if s[0] == "L" and all(c in VIN_BODY for c in s): candidates.append(s) return candidates[0] if candidates else None def extract_register_date(text: str) -> Optional[date]: m = re.search( r"注册日期\s*[\n\r\s]*发证日期\s*[\n\r\s]*(\d{4})-(\d{1,2})-(\d{1,2})", text, ) if m: try: return date(int(m.group(1)), int(m.group(2)), int(m.group(3))) except ValueError: pass m = re.search( r"(?:注册日期|RegisterDate)\D{0,120}?(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})", text, re.DOTALL | re.IGNORECASE, ) if m: try: return date(int(m.group(1)), int(m.group(2)), int(m.group(3))) except ValueError: pass m = re.search( r"注册日期\D{0,60}?(\d{4})[-/.年](\d{1,2})[-/.月](\d{1,2})", text, re.DOTALL, ) if m: try: return date(int(m.group(1)), int(m.group(2)), int(m.group(3))) except ValueError: pass return None def extract_scrap_date(text: str) -> Optional[date]: patterns = [ r"强制报废期止\s*[::]\s*(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})", r"强制报废期止\s*(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})", r"强制报废期止\s*(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日?", r"报废期止\s*(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})", r"强制报废[^0-9]{0,20}(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})", ] for pat in patterns: m = re.search(pat, text) if m: try: return date(int(m.group(1)), int(m.group(2)), int(m.group(3))) except ValueError: continue return None def pick_largest_per_plate(folder: Path): groups = {} for p in folder.iterdir(): if not p.is_file(): continue if p.suffix.lower() not in (".jpg", ".jpeg", ".png", ".webp"): continue key = normalize_stem(p.stem) if not key: continue if key not in groups or p.stat().st_size > groups[key].stat().st_size: groups[key] = p return groups def main() -> int: if not IMG_DIR.is_dir(): print("图片目录不存在:", IMG_DIR, file=sys.stderr) return 1 if not TEMPLATE.is_file(): print("模板不存在:", TEMPLATE, file=sys.stderr) return 1 groups = pick_largest_per_plate(IMG_DIR) ocr = RapidOCR() rows: List[Tuple[str, Optional[str], Optional[date], Optional[date], Optional[date]]] = [] stats = {"no_inspection": 0, "no_vin": 0, "no_register": 0, "no_scrap": 0} for i, (fn_plate, path) in enumerate(sorted(groups.items()), 1): if i % 80 == 0: print(f"OCR {i}/{len(groups)} …", flush=True) try: result, _ = ocr(str(path)) except Exception as e: print("OCR失败", fn_plate, e, file=sys.stderr) continue text = "\n".join([t[1] for t in result]) if result else "" plate = extract_plate_ocr(text, fn_plate) or fn_plate vin = extract_vin(text) reg = extract_register_date(text) scrap = extract_scrap_date(text) ins_dates = parse_inspection_dates(text) inspection_end = max(ins_dates) if ins_dates else None if not inspection_end: stats["no_inspection"] += 1 if not vin: stats["no_vin"] += 1 if not reg: stats["no_register"] += 1 if not scrap: stats["no_scrap"] += 1 rows.append((plate, vin, reg, scrap, inspection_end)) wb = openpyxl.load_workbook(TEMPLATE) ws = wb.active # 从第 2 行写入;若模板已有数据先清空旧数据区 max_row = ws.max_row if max_row > 1: ws.delete_rows(2, max_row - 1) for r, (plate, vin, reg, scrap, insp) in enumerate(rows, start=2): ws.cell(row=r, column=1, value=plate) ws.cell(row=r, column=2, value=vin or "") ws.cell(row=r, column=3, value=reg) ws.cell(row=r, column=4, value=scrap) ws.cell(row=r, column=5, value=insp) wb.save(OUT) shutil.copy2(OUT, TEMPLATE) print("写入:", OUT) print("已同步覆盖模板:", TEMPLATE) print("行数(含表头):", ws.max_row) print("统计(空字段数):", stats) return 0 if __name__ == "__main__": raise SystemExit(main())