Files
ONE-OS/scripts/fill_license_excel_from_images.py
王冕 a27e3b8e43 feat: sync full workspace including web modules, docs, and configurations to Gitea
Optimized the root .gitignore to exclude virtual environments, node modules,
and temp folders to ensure clean and lightweight version tracking.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-09 18:12:25 +08:00

256 lines
8.2 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""从桌面「车辆行驶证」图片 OCR 提取字段,写入证照信息导入模板(仅填 AE 列)。"""
import re
import calendar
import shutil
import sys
from collections import Counter
from datetime import date
from pathlib import Path
from typing import List, Optional, Tuple
import openpyxl
from rapidocr_onnxruntime import RapidOCR
IMG_DIR = Path("/Users/sylvawong/Desktop/车辆行驶证")
TEMPLATE = Path("/Users/sylvawong/Desktop/证照信息导入模板-1776082388558.xlsx")
OUT = Path("/Users/sylvawong/Desktop/证照信息导入模板-OCR导入结果.xlsx")
TODAY = date.today()
PLATE_CHARS = r"京津沪渝冀晋蒙辽吉黑苏浙皖闽赣鲁豫鄂湘粤桂琼川贵云藏陕甘青宁新"
PLATE_RE = re.compile(
rf"[{PLATE_CHARS}][A-Z0-9·]{{4,12}}(?:挂)?"
)
# 标准 VIN 不含 I、O、QOCR 常误识,放宽为 17 位大写字母数字
VIN_RE = re.compile(r"[A-Z0-9]{17}")
VIN_BODY = set("ABCDEFGHJKLMNPRSTUVWXYZ0123456789")
def normalize_stem(stem: str) -> str:
s = re.sub(r"\s+\d+$", "", stem).strip()
s = re.sub(r"行驶证年审.*$", "", s).strip()
return s
def end_of_month(y: int, m: int) -> date:
return date(y, m, calendar.monthrange(y, m)[1])
def parse_inspection_dates(text: str) -> List[date]:
out: List[date] = []
pat1 = re.compile(
r"检验有效期[至到]\s*(\d{4})\s*年\s*(\d{1,2})\s*月(?:\s*(\d{1,2})\s*日)?"
)
for m in pat1.finditer(text):
y, mo = int(m.group(1)), int(m.group(2))
if not (2000 <= y <= 2040 and 1 <= mo <= 12):
continue
d = int(m.group(3)) if m.group(3) else None
if d and 1 <= d <= 31:
try:
out.append(date(y, mo, d))
except ValueError:
out.append(end_of_month(y, mo))
else:
out.append(end_of_month(y, mo))
pat2 = re.compile(r"检验有效期[至到]\s*(\d{4})\s*年(?!\s*\d{1,2}\s*月)")
for m in pat2.finditer(text):
y = int(m.group(1))
if 2000 <= y <= 2040:
out.append(end_of_month(y, 12))
return out
def extract_plate_ocr(text: str, filename_plate: Optional[str]) -> Optional[str]:
hits = PLATE_RE.findall(text)
# 去掉明显非号牌片段
bad = {"中华人民共和国", "中国人民共和国"}
hits = [h for h in hits if h not in bad and len(h) <= 12]
if not hits:
return filename_plate
cnt = Counter(hits)
if filename_plate and filename_plate in cnt:
return filename_plate
return cnt.most_common(1)[0][0]
def _vin_from_l_window(window: str) -> Optional[str]:
"""OCR 常把代号与型号粘在一起,用 L 起算滑动取 17 位合法车架字符。"""
for i, ch in enumerate(window):
if ch != "L":
continue
chunk = window[i : i + 17]
if len(chunk) < 17:
continue
if not all(c in VIN_BODY for c in chunk):
continue
if re.search(r"\d", chunk) and re.search(r"[A-Z]", chunk):
return chunk
return None
def extract_vin(text: str) -> Optional[str]:
compact = re.sub(r"[\s\r\n ]+", "", text)
idx = compact.find("车辆识别代号")
if idx < 0:
idx = compact.find("VIN")
if idx >= 0:
window = compact[idx : idx + 260]
v = _vin_from_l_window(window)
if v:
return v
for m in VIN_RE.finditer(window):
s = m.group(0)
if re.search(r"\d", s) and re.search(r"[A-Z]", s) and s[0] == "L":
return s
for m in VIN_RE.finditer(window):
s = m.group(0)
if re.search(r"\d", s) and re.search(r"[A-Z]", s):
return s
v = _vin_from_l_window(compact)
if v:
return v
candidates = []
for m in VIN_RE.finditer(compact):
s = m.group(0)
if not (re.search(r"\d", s) and re.search(r"[A-Z]", s)):
continue
if s[0] == "L" and all(c in VIN_BODY for c in s):
candidates.append(s)
return candidates[0] if candidates else None
def extract_register_date(text: str) -> Optional[date]:
m = re.search(
r"注册日期\s*[\n\r\s]*发证日期\s*[\n\r\s]*(\d{4})-(\d{1,2})-(\d{1,2})",
text,
)
if m:
try:
return date(int(m.group(1)), int(m.group(2)), int(m.group(3)))
except ValueError:
pass
m = re.search(
r"(?:注册日期|RegisterDate)\D{0,120}?(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})",
text,
re.DOTALL | re.IGNORECASE,
)
if m:
try:
return date(int(m.group(1)), int(m.group(2)), int(m.group(3)))
except ValueError:
pass
m = re.search(
r"注册日期\D{0,60}?(\d{4})[-/.年](\d{1,2})[-/.月](\d{1,2})",
text,
re.DOTALL,
)
if m:
try:
return date(int(m.group(1)), int(m.group(2)), int(m.group(3)))
except ValueError:
pass
return None
def extract_scrap_date(text: str) -> Optional[date]:
patterns = [
r"强制报废期止\s*[:]\s*(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})",
r"强制报废期止\s*(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})",
r"强制报废期止\s*(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日?",
r"报废期止\s*(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})",
r"强制报废[^0-9]{0,20}(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})",
]
for pat in patterns:
m = re.search(pat, text)
if m:
try:
return date(int(m.group(1)), int(m.group(2)), int(m.group(3)))
except ValueError:
continue
return None
def pick_largest_per_plate(folder: Path):
groups = {}
for p in folder.iterdir():
if not p.is_file():
continue
if p.suffix.lower() not in (".jpg", ".jpeg", ".png", ".webp"):
continue
key = normalize_stem(p.stem)
if not key:
continue
if key not in groups or p.stat().st_size > groups[key].stat().st_size:
groups[key] = p
return groups
def main() -> int:
if not IMG_DIR.is_dir():
print("图片目录不存在:", IMG_DIR, file=sys.stderr)
return 1
if not TEMPLATE.is_file():
print("模板不存在:", TEMPLATE, file=sys.stderr)
return 1
groups = pick_largest_per_plate(IMG_DIR)
ocr = RapidOCR()
rows: List[Tuple[str, Optional[str], Optional[date], Optional[date], Optional[date]]] = []
stats = {"no_inspection": 0, "no_vin": 0, "no_register": 0, "no_scrap": 0}
for i, (fn_plate, path) in enumerate(sorted(groups.items()), 1):
if i % 80 == 0:
print(f"OCR {i}/{len(groups)}", flush=True)
try:
result, _ = ocr(str(path))
except Exception as e:
print("OCR失败", fn_plate, e, file=sys.stderr)
continue
text = "\n".join([t[1] for t in result]) if result else ""
plate = extract_plate_ocr(text, fn_plate) or fn_plate
vin = extract_vin(text)
reg = extract_register_date(text)
scrap = extract_scrap_date(text)
ins_dates = parse_inspection_dates(text)
inspection_end = max(ins_dates) if ins_dates else None
if not inspection_end:
stats["no_inspection"] += 1
if not vin:
stats["no_vin"] += 1
if not reg:
stats["no_register"] += 1
if not scrap:
stats["no_scrap"] += 1
rows.append((plate, vin, reg, scrap, inspection_end))
wb = openpyxl.load_workbook(TEMPLATE)
ws = wb.active
# 从第 2 行写入;若模板已有数据先清空旧数据区
max_row = ws.max_row
if max_row > 1:
ws.delete_rows(2, max_row - 1)
for r, (plate, vin, reg, scrap, insp) in enumerate(rows, start=2):
ws.cell(row=r, column=1, value=plate)
ws.cell(row=r, column=2, value=vin or "")
ws.cell(row=r, column=3, value=reg)
ws.cell(row=r, column=4, value=scrap)
ws.cell(row=r, column=5, value=insp)
wb.save(OUT)
shutil.copy2(OUT, TEMPLATE)
print("写入:", OUT)
print("已同步覆盖模板:", TEMPLATE)
print("行数(含表头):", ws.max_row)
print("统计(空字段数):", stats)
return 0
if __name__ == "__main__":
raise SystemExit(main())