Optimized the root .gitignore to exclude virtual environments, node modules, and temp folders to ensure clean and lightweight version tracking. Co-authored-by: Cursor <cursoragent@cursor.com>
256 lines
8.2 KiB
Python
256 lines
8.2 KiB
Python
#!/usr/bin/env python3
|
||
"""从桌面「车辆行驶证」图片 OCR 提取字段,写入证照信息导入模板(仅填 A–E 列)。"""
|
||
import re
|
||
import calendar
|
||
import shutil
|
||
import sys
|
||
from collections import Counter
|
||
from datetime import date
|
||
from pathlib import Path
|
||
from typing import List, Optional, Tuple
|
||
|
||
import openpyxl
|
||
from rapidocr_onnxruntime import RapidOCR
|
||
|
||
IMG_DIR = Path("/Users/sylvawong/Desktop/车辆行驶证")
|
||
TEMPLATE = Path("/Users/sylvawong/Desktop/证照信息导入模板-1776082388558.xlsx")
|
||
OUT = Path("/Users/sylvawong/Desktop/证照信息导入模板-OCR导入结果.xlsx")
|
||
TODAY = date.today()
|
||
|
||
PLATE_CHARS = r"京津沪渝冀晋蒙辽吉黑苏浙皖闽赣鲁豫鄂湘粤桂琼川贵云藏陕甘青宁新"
|
||
PLATE_RE = re.compile(
|
||
rf"[{PLATE_CHARS}][A-Z0-9·]{{4,12}}(?:挂)?"
|
||
)
|
||
# 标准 VIN 不含 I、O、Q;OCR 常误识,放宽为 17 位大写字母数字
|
||
VIN_RE = re.compile(r"[A-Z0-9]{17}")
|
||
VIN_BODY = set("ABCDEFGHJKLMNPRSTUVWXYZ0123456789")
|
||
|
||
|
||
def normalize_stem(stem: str) -> str:
|
||
s = re.sub(r"\s+\d+$", "", stem).strip()
|
||
s = re.sub(r"行驶证年审.*$", "", s).strip()
|
||
return s
|
||
|
||
|
||
def end_of_month(y: int, m: int) -> date:
|
||
return date(y, m, calendar.monthrange(y, m)[1])
|
||
|
||
|
||
def parse_inspection_dates(text: str) -> List[date]:
|
||
out: List[date] = []
|
||
pat1 = re.compile(
|
||
r"检验有效期[至到]\s*(\d{4})\s*年\s*(\d{1,2})\s*月(?:\s*(\d{1,2})\s*日)?"
|
||
)
|
||
for m in pat1.finditer(text):
|
||
y, mo = int(m.group(1)), int(m.group(2))
|
||
if not (2000 <= y <= 2040 and 1 <= mo <= 12):
|
||
continue
|
||
d = int(m.group(3)) if m.group(3) else None
|
||
if d and 1 <= d <= 31:
|
||
try:
|
||
out.append(date(y, mo, d))
|
||
except ValueError:
|
||
out.append(end_of_month(y, mo))
|
||
else:
|
||
out.append(end_of_month(y, mo))
|
||
pat2 = re.compile(r"检验有效期[至到]\s*(\d{4})\s*年(?!\s*\d{1,2}\s*月)")
|
||
for m in pat2.finditer(text):
|
||
y = int(m.group(1))
|
||
if 2000 <= y <= 2040:
|
||
out.append(end_of_month(y, 12))
|
||
return out
|
||
|
||
|
||
def extract_plate_ocr(text: str, filename_plate: Optional[str]) -> Optional[str]:
|
||
hits = PLATE_RE.findall(text)
|
||
# 去掉明显非号牌片段
|
||
bad = {"中华人民共和国", "中国人民共和国"}
|
||
hits = [h for h in hits if h not in bad and len(h) <= 12]
|
||
if not hits:
|
||
return filename_plate
|
||
cnt = Counter(hits)
|
||
if filename_plate and filename_plate in cnt:
|
||
return filename_plate
|
||
return cnt.most_common(1)[0][0]
|
||
|
||
|
||
def _vin_from_l_window(window: str) -> Optional[str]:
|
||
"""OCR 常把代号与型号粘在一起,用 L 起算滑动取 17 位合法车架字符。"""
|
||
for i, ch in enumerate(window):
|
||
if ch != "L":
|
||
continue
|
||
chunk = window[i : i + 17]
|
||
if len(chunk) < 17:
|
||
continue
|
||
if not all(c in VIN_BODY for c in chunk):
|
||
continue
|
||
if re.search(r"\d", chunk) and re.search(r"[A-Z]", chunk):
|
||
return chunk
|
||
return None
|
||
|
||
|
||
def extract_vin(text: str) -> Optional[str]:
|
||
compact = re.sub(r"[\s\r\n ]+", "", text)
|
||
idx = compact.find("车辆识别代号")
|
||
if idx < 0:
|
||
idx = compact.find("VIN")
|
||
if idx >= 0:
|
||
window = compact[idx : idx + 260]
|
||
v = _vin_from_l_window(window)
|
||
if v:
|
||
return v
|
||
for m in VIN_RE.finditer(window):
|
||
s = m.group(0)
|
||
if re.search(r"\d", s) and re.search(r"[A-Z]", s) and s[0] == "L":
|
||
return s
|
||
for m in VIN_RE.finditer(window):
|
||
s = m.group(0)
|
||
if re.search(r"\d", s) and re.search(r"[A-Z]", s):
|
||
return s
|
||
v = _vin_from_l_window(compact)
|
||
if v:
|
||
return v
|
||
candidates = []
|
||
for m in VIN_RE.finditer(compact):
|
||
s = m.group(0)
|
||
if not (re.search(r"\d", s) and re.search(r"[A-Z]", s)):
|
||
continue
|
||
if s[0] == "L" and all(c in VIN_BODY for c in s):
|
||
candidates.append(s)
|
||
return candidates[0] if candidates else None
|
||
|
||
|
||
def extract_register_date(text: str) -> Optional[date]:
|
||
m = re.search(
|
||
r"注册日期\s*[\n\r\s]*发证日期\s*[\n\r\s]*(\d{4})-(\d{1,2})-(\d{1,2})",
|
||
text,
|
||
)
|
||
if m:
|
||
try:
|
||
return date(int(m.group(1)), int(m.group(2)), int(m.group(3)))
|
||
except ValueError:
|
||
pass
|
||
m = re.search(
|
||
r"(?:注册日期|RegisterDate)\D{0,120}?(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})",
|
||
text,
|
||
re.DOTALL | re.IGNORECASE,
|
||
)
|
||
if m:
|
||
try:
|
||
return date(int(m.group(1)), int(m.group(2)), int(m.group(3)))
|
||
except ValueError:
|
||
pass
|
||
m = re.search(
|
||
r"注册日期\D{0,60}?(\d{4})[-/.年](\d{1,2})[-/.月](\d{1,2})",
|
||
text,
|
||
re.DOTALL,
|
||
)
|
||
if m:
|
||
try:
|
||
return date(int(m.group(1)), int(m.group(2)), int(m.group(3)))
|
||
except ValueError:
|
||
pass
|
||
return None
|
||
|
||
|
||
def extract_scrap_date(text: str) -> Optional[date]:
|
||
patterns = [
|
||
r"强制报废期止\s*[::]\s*(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})",
|
||
r"强制报废期止\s*(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})",
|
||
r"强制报废期止\s*(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日?",
|
||
r"报废期止\s*(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})",
|
||
r"强制报废[^0-9]{0,20}(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})",
|
||
]
|
||
for pat in patterns:
|
||
m = re.search(pat, text)
|
||
if m:
|
||
try:
|
||
return date(int(m.group(1)), int(m.group(2)), int(m.group(3)))
|
||
except ValueError:
|
||
continue
|
||
return None
|
||
|
||
|
||
def pick_largest_per_plate(folder: Path):
|
||
groups = {}
|
||
for p in folder.iterdir():
|
||
if not p.is_file():
|
||
continue
|
||
if p.suffix.lower() not in (".jpg", ".jpeg", ".png", ".webp"):
|
||
continue
|
||
key = normalize_stem(p.stem)
|
||
if not key:
|
||
continue
|
||
if key not in groups or p.stat().st_size > groups[key].stat().st_size:
|
||
groups[key] = p
|
||
return groups
|
||
|
||
|
||
def main() -> int:
|
||
if not IMG_DIR.is_dir():
|
||
print("图片目录不存在:", IMG_DIR, file=sys.stderr)
|
||
return 1
|
||
if not TEMPLATE.is_file():
|
||
print("模板不存在:", TEMPLATE, file=sys.stderr)
|
||
return 1
|
||
|
||
groups = pick_largest_per_plate(IMG_DIR)
|
||
ocr = RapidOCR()
|
||
|
||
rows: List[Tuple[str, Optional[str], Optional[date], Optional[date], Optional[date]]] = []
|
||
stats = {"no_inspection": 0, "no_vin": 0, "no_register": 0, "no_scrap": 0}
|
||
|
||
for i, (fn_plate, path) in enumerate(sorted(groups.items()), 1):
|
||
if i % 80 == 0:
|
||
print(f"OCR {i}/{len(groups)} …", flush=True)
|
||
try:
|
||
result, _ = ocr(str(path))
|
||
except Exception as e:
|
||
print("OCR失败", fn_plate, e, file=sys.stderr)
|
||
continue
|
||
text = "\n".join([t[1] for t in result]) if result else ""
|
||
|
||
plate = extract_plate_ocr(text, fn_plate) or fn_plate
|
||
vin = extract_vin(text)
|
||
reg = extract_register_date(text)
|
||
scrap = extract_scrap_date(text)
|
||
ins_dates = parse_inspection_dates(text)
|
||
inspection_end = max(ins_dates) if ins_dates else None
|
||
|
||
if not inspection_end:
|
||
stats["no_inspection"] += 1
|
||
if not vin:
|
||
stats["no_vin"] += 1
|
||
if not reg:
|
||
stats["no_register"] += 1
|
||
if not scrap:
|
||
stats["no_scrap"] += 1
|
||
|
||
rows.append((plate, vin, reg, scrap, inspection_end))
|
||
|
||
wb = openpyxl.load_workbook(TEMPLATE)
|
||
ws = wb.active
|
||
# 从第 2 行写入;若模板已有数据先清空旧数据区
|
||
max_row = ws.max_row
|
||
if max_row > 1:
|
||
ws.delete_rows(2, max_row - 1)
|
||
|
||
for r, (plate, vin, reg, scrap, insp) in enumerate(rows, start=2):
|
||
ws.cell(row=r, column=1, value=plate)
|
||
ws.cell(row=r, column=2, value=vin or "")
|
||
ws.cell(row=r, column=3, value=reg)
|
||
ws.cell(row=r, column=4, value=scrap)
|
||
ws.cell(row=r, column=5, value=insp)
|
||
|
||
wb.save(OUT)
|
||
shutil.copy2(OUT, TEMPLATE)
|
||
print("写入:", OUT)
|
||
print("已同步覆盖模板:", TEMPLATE)
|
||
print("行数(含表头):", ws.max_row)
|
||
print("统计(空字段数):", stats)
|
||
return 0
|
||
|
||
|
||
if __name__ == "__main__":
|
||
raise SystemExit(main())
|