Files
ONE-OS/scripts/generate_vehicle_license_ocr_report.py
王冕 a27e3b8e43 feat: sync full workspace including web modules, docs, and configurations to Gitea
Optimized the root .gitignore to exclude virtual environments, node modules,
and temp folders to ensure clean and lightweight version tracking.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-09 18:12:25 +08:00

336 lines
11 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
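#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Extract inspection dates from vehicle-licence images via OCR and merge them with the vehicle sheet into an Excel report.

For vehicles that have photos: OCR the images first; if no "检验有效期" (inspection validity) date is recognised, fall back to the sheet's "行驶证检验有效期" column to fill the third output column.
"""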
from __future__ import annotations
import calendar
import os
import re
import subprocess
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import date
from typing import Dict, List, Optional, Tuple
import fitz # pymupdf
import pandas as pd
from PIL import Image, ImageEnhance
# Configuration paths (adjust as needed)
# Input workbook; main() reads its "车辆信息" sheet.
EXCEL_IN = "/Users/sylvawong/Downloads/车辆信息-1776270214730.xlsx"
# Directory whose files build_plate_files() groups by plate.
PHOTO_DIR = "/Users/sylvawong/Desktop/证件信息梳理/行驶证"
# Output workbook written by main().
EXCEL_OUT = "/Users/sylvawong/Desktop/CURSOR/ONE-OS/车辆行驶证_OCR核对.xlsx"
# Input-sheet column used as the fallback date when OCR finds nothing.
EXCEL_INSPECTION_COL = "行驶证检验有效期"
# Absolute path of the tesseract executable invoked by ocr_image().
TESSERACT = "/opt/homebrew/bin/tesseract"
# Value passed to tesseract's "-l" option.
OCR_LANG = "chi_sim+eng"
# Images whose longer side exceeds this many pixels are downscaled before OCR.
MAX_IMAGE_SIDE = 2200
# Size of the thread pool that runs per-plate OCR in main().
MAX_WORKERS = 3
# At most this many leading pages of a PDF are rendered and OCR'd.
MAX_PDF_PAGES = 3
# Zoom factor given to fitz.Matrix when rendering PDF pages.
PDF_ZOOM = 2.5
def extract_plate_from_filename(basename_no_ext: str) -> Optional[str]:
    """Derive a licence-plate string from a photo's base name (no extension).

    The name is cleaned in this order: a trailing copy counter such as
    " (1)" is removed, everything from the first "行驶证" onwards is dropped,
    trailing "-", "_", "·" and spaces are stripped, and only the text before
    the first "-" is kept.

    The remaining candidate is accepted only if it starts with one CJK
    ideograph (U+4E00-U+9FA5) followed by 1-14 characters that are uppercase
    A-Z, digits, CJK ideographs or "·", and if it is at least 6 characters
    long after trailing separators are stripped.

    Args:
        basename_no_ext: File name without directory and extension.

    Returns:
        The plate string, or None when the name does not yield one.
    """
    # Drop a trailing duplicate-file counter, e.g. "粤B12345 (1)".
    stem = re.sub(r"\s*\(\d+\)\s*$", "", basename_no_ext)
    # str.split returns the whole string when the marker is absent, so no
    # membership test is needed first.
    stem = stem.split("行驶证")[0].rstrip("-_· ")
    if not stem:
        return None

    # The plate is always the first "-"-separated field. The earlier version
    # tested whether the second field looked like a 17-character VIN, but both
    # outcomes selected this same field, so that test had no effect.
    candidate = stem.split("-", 1)[0]

    matched = re.match(r"^([\u4e00-\u9fa5][A-Z0-9\u4e00-\u9fa5·]{1,14})$", candidate)
    if matched is None:
        return None

    plate = matched.group(1).rstrip("-_·")
    return plate if len(plate) >= 6 else None
def build_plate_files() -> Dict[str, List[str]]:
    """Group the regular files directly under PHOTO_DIR by licence plate.

    Each directory entry's name (extension removed) is passed to
    extract_plate_from_filename(); entries that are not regular files or
    that yield no plate are ignored.

    Returns:
        Mapping from plate string to the full paths of its files, in the
        order os.listdir() produced them.
    """
    by_plate: Dict[str, List[str]] = {}
    for entry in os.listdir(PHOTO_DIR):
        full_path = os.path.join(PHOTO_DIR, entry)
        if os.path.isfile(full_path):
            stem = os.path.splitext(entry)[0]
            plate = extract_plate_from_filename(stem)
            if plate:
                if plate not in by_plate:
                    by_plate[plate] = []
                by_plate[plate].append(full_path)
    return by_plate
def file_try_order(paths: List[str]) -> List[str]:
    """Return *paths* sorted into the order in which they should be OCR'd.

    Ranking is based on the lower-cased file name: names containing "年审"
    come first, then names containing "行驶", then everything else. Within a
    rank, PDFs sort after non-PDFs, and remaining ties are broken by file
    name. The input list is not modified.
    """

    def sort_key(path: str) -> Tuple[float, str]:
        name = os.path.basename(path).lower()
        # Original author's note: annual-inspection pages usually carry the
        # inspection record, the main licence page comes next, and PDFs go
        # slightly later because they have to be rendered first.
        if "年审" in name:
            rank = 0.0
        elif "行驶" in name:  # also covers names containing "行驶证"
            rank = 1.0
        else:
            rank = 1.5
        pdf_penalty = 0.25 if name.endswith(".pdf") else 0.0
        return (rank + pdf_penalty, name)

    ordered = list(paths)
    ordered.sort(key=sort_key)
    return ordered
def load_image_for_ocr(path: str) -> Image.Image:
    """Open an image file as RGB, downscaled so no side exceeds MAX_IMAGE_SIDE.

    Args:
        path: Path of an image file that PIL can open.

    Returns:
        An RGB image. When the longer side is larger than MAX_IMAGE_SIDE the
        image is resized with LANCZOS resampling, keeping the aspect ratio.

    Raises:
        Whatever ``Image.open`` / ``convert`` raise for unreadable files.
    """
    # convert() produces an independent in-memory copy, so the source file
    # can be closed as soon as the block exits instead of waiting for GC.
    with Image.open(path) as source:
        img = source.convert("RGB")

    width, height = img.size
    longest = max(width, height)
    if longest > MAX_IMAGE_SIDE:
        scale = MAX_IMAGE_SIDE / longest
        # Clamp to 1 px so a very elongated image cannot round to a
        # zero-sized target, which resize() rejects.
        target = (max(1, int(width * scale)), max(1, int(height * scale)))
        img = img.resize(target, Image.LANCZOS)
    return img
def ocr_image(img: Image.Image) -> str:
    """Run the tesseract CLI on *img* and return the recognised text.

    The image is converted to greyscale, its contrast is raised by a factor
    of 1.35, and it is written to a temporary PNG that is always deleted
    afterwards. Tesseract is invoked with language OCR_LANG, page
    segmentation mode 6 and ``preserve_interword_spaces=1``.

    Args:
        img: Image to recognise.

    Returns:
        Tesseract's stdout, or "" when tesseract exits with a non-zero
        status or does not finish within 180 seconds.
    """
    gray = ImageEnhance.Contrast(img.convert("L")).enhance(1.35)

    # mkstemp returns an open descriptor; close it right away because only
    # the path is needed (PIL and tesseract open the file themselves).
    fd, tmp_path = tempfile.mkstemp(suffix=".png")
    os.close(fd)
    try:
        gray.save(tmp_path, format="PNG")
        cmd = [
            TESSERACT,
            tmp_path,
            "stdout",
            "-l",
            OCR_LANG,
            "--psm",
            "6",
            "-c",
            "preserve_interword_spaces=1",
        ]
        try:
            proc = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                # Tesseract emits UTF-8; decode explicitly so the result does
                # not depend on the process locale, and never fail on a stray
                # byte.
                encoding="utf-8",
                errors="replace",
                timeout=180,
            )
        except subprocess.TimeoutExpired:
            # Treat a hung run like a failed run so callers can move on to
            # the next page or file.
            return ""
        if proc.returncode != 0:
            return ""
        return proc.stdout or ""
    finally:
        try:
            os.unlink(tmp_path)
        except OSError:
            # Best-effort cleanup; a leftover temp file must not mask the result.
            pass
def ocr_file(path: str) -> str:
    """OCR an image file, returning "" if loading or recognition fails.

    Any exception raised while loading the image or running OCR is
    swallowed on purpose: callers treat an empty string as "nothing found"
    and move on to the next candidate.
    """
    try:
        return ocr_image(load_image_for_ocr(path))
    except Exception:
        return ""
def month_end(y: int, m: int) -> date:
    """Return the last calendar day of month *m* in year *y*."""
    _first_weekday, days_in_month = calendar.monthrange(y, m)
    return date(y, m, days_in_month)
# Patterns are tried in order; earlier entries are more specific. Groups are
# (year, month[, day]). Both ASCII and full-width colons / opening parentheses
# are accepted after the label, since either can appear in OCR output of
# Chinese text.
# Examples: 检验有效期至 2027年06月04日 / 检验有效期至:2026-03-31
_INSPECTION_PATTERNS = tuple(
    re.compile(p)
    for p in (
        r"检验有效期至\s*[::]?\s*(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日",
        r"检验有效期至\s*[::]?\s*(\d{4})年(\d{1,2})月(\d{1,2})日",
        r"检验有效期至\s*[::]?\s*(\d{4})\s*[-/.]\s*(\d{1,2})\s*[-/.]\s*(\d{1,2})",
        r"检验有效期至\s*[::]?\s*(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})",
        r"检验有效期\s*[::]?\s*(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日",
        r"有效期至\s*[::]?\s*(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日",
        r"检验有效期至\s*[::]?\s*(\d{4})\s*年\s*(\d{1,2})\s*月(?!\s*\d{1,2}\s*日)",
        r"检验有效期至\s*[::]?\s*(\d{4})年(\d{1,2})月(?!\d{1,2}日)",
        r"检验有效期至\s*[::]?\s*(\d{4})\s*年\s*(\d{1,2})\s*月\s*",
        r"(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日\s*[((]?\s*检验",
    )
)


def parse_inspection_date(text: str) -> Optional[date]:
    """Extract the inspection-validity date from OCR text.

    Letter "O"/"o" is first replaced by digit "0" throughout the text. The
    patterns in ``_INSPECTION_PATTERNS`` are then tried in order and, for
    each, only its first match in the text is considered:

    * year outside 2000-2100 or month outside 1-12: try the next pattern;
    * day captured and within 1-31: return that date, or try the next
      pattern if the calendar rejects it (e.g. 31 June);
    * no day captured, or day outside 1-31: return the last day of the month.

    Args:
        text: Raw OCR output; may be empty.

    Returns:
        The parsed date, or None when no pattern yields a usable one.
    """
    if not text:
        return None

    normalized = text.translate(str.maketrans("Oo", "00"))

    for pattern in _INSPECTION_PATTERNS:
        found = pattern.search(normalized)
        if found is None:
            continue

        year, month = int(found.group(1)), int(found.group(2))
        if not (2000 <= year <= 2100 and 1 <= month <= 12):
            continue

        # Month-only patterns have two groups; the rest always capture a day.
        if pattern.groups >= 3:
            day = int(found.group(3))
            if 1 <= day <= 31:
                try:
                    return date(year, month, day)
                except ValueError:
                    continue
        return month_end(year, month)

    return None
def ocr_inspection_from_path(path: str) -> Optional[date]:
    """OCR one licence file (image or PDF) and return its inspection date.

    Non-PDF files are OCR'd as a single image. For PDFs, up to MAX_PDF_PAGES
    leading pages are rendered at PDF_ZOOM, downscaled to MAX_IMAGE_SIDE if
    needed, and OCR'd one by one; the first page that yields a date wins.

    A PDF that cannot be opened, or a page that cannot be rendered or
    recognised, is skipped rather than raised. This matches the image path
    (ocr_file() swallows failures) and keeps one bad file from aborting the
    caller's loop over the remaining candidates.

    Args:
        path: File to inspect; a ".pdf" extension (any case) selects the
            PDF branch.

    Returns:
        The date found, or None.
    """
    if os.path.splitext(path)[1].lower() != ".pdf":
        return parse_inspection_date(ocr_file(path))

    try:
        doc = fitz.open(path)
    except Exception:
        return None

    try:
        for page_no in range(min(len(doc), MAX_PDF_PAGES)):
            try:
                page = doc.load_page(page_no)
                pix = page.get_pixmap(
                    matrix=fitz.Matrix(PDF_ZOOM, PDF_ZOOM), alpha=False
                )
                mode = "RGB" if pix.n < 4 else "RGBA"
                img = Image.frombytes(mode, (pix.width, pix.height), pix.samples)
                if mode == "RGBA":
                    img = img.convert("RGB")

                width, height = img.size
                longest = max(width, height)
                if longest > MAX_IMAGE_SIDE:
                    scale = MAX_IMAGE_SIDE / longest
                    img = img.resize(
                        (int(width * scale), int(height * scale)), Image.LANCZOS
                    )

                found = parse_inspection_date(ocr_image(img))
            except Exception:
                # Best effort: a broken page must not hide a later good one.
                continue
            if found:
                return found
    finally:
        doc.close()
    return None
def parse_excel_inspection_date(val) -> Optional[date]:
    """Convert a cell value from the inspection column into a date.

    Args:
        val: Raw cell value as read by pandas.

    Returns:
        The calendar date of ``pd.to_datetime(val, errors="coerce")``, or
        None when the value is None, a float NaN, or coerces to NaT.
    """
    is_missing = val is None or (isinstance(val, float) and pd.isna(val))
    if not is_missing:
        stamp = pd.to_datetime(val, errors="coerce")
        if not pd.isna(stamp):
            return stamp.date()
    return None
def next_inspection_for_plate(paths: List[str]) -> Optional[date]:
    """Return the first inspection date found among a plate's files.

    The files are ranked with file_try_order() and at most the first four
    are OCR'd, stopping at the first one that yields a date.

    Returns:
        The date found, or None when none of the tried files yields one.
    """
    candidates = file_try_order(paths)[:4]
    # The generator is lazy, so OCR stops as soon as one file yields a date.
    dates = (ocr_inspection_from_path(candidate) for candidate in candidates)
    return next((found for found in dates if found), None)
def main() -> None:
    """Build the licence OCR check workbook and print a summary.

    Steps:
      1. Abort if the tesseract binary at TESSERACT does not exist.
      2. Read sheet "车辆信息" of EXCEL_IN; plates come from column "车牌号".
      3. For every row whose plate has files under PHOTO_DIR, OCR them on a
         thread pool of MAX_WORKERS workers.
      4. For each such row use the OCR date, falling back to the row's
         EXCEL_INSPECTION_COL value when OCR found nothing.
      5. Write one row per input row to sheet "行驶证OCR" of EXCEL_OUT and
         print counts.

    Raises:
        SystemExit: tesseract is not installed at the configured path.
    """
    if not os.path.isfile(TESSERACT):
        raise SystemExit(f"未找到 tesseract: {TESSERACT}")

    today = date.today()
    plate_files = build_plate_files()
    df = pd.read_excel(EXCEL_IN, sheet_name="车辆信息", header=0, engine="openpyxl")
    plates = [str(x).strip() if pd.notna(x) else "" for x in df["车牌号"]]
    if EXCEL_INSPECTION_COL in df.columns:
        excel_inspection = df[EXCEL_INSPECTION_COL]
    else:
        # No fallback column in the sheet: behave as if every cell were empty.
        excel_inspection = pd.Series([pd.NA] * len(df))

    # One OCR task per row (not per distinct plate) that has photos.
    tasks: List[Tuple[int, str, List[str]]] = [
        (idx, plate, plate_files[plate])
        for idx, plate in enumerate(plates)
        if plate and plate_files.get(plate)
    ]

    results: Dict[int, Optional[date]] = {}
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        pending = {
            pool.submit(next_inspection_for_plate, files): idx
            for idx, _plate, files in tasks
        }
        for fut in as_completed(pending):
            try:
                results[pending[fut]] = fut.result()
            except Exception:
                # Best effort: a failed OCR job counts as "no date found".
                results[pending[fut]] = None

    col_plate = "车牌号"
    col_photo = "行驶证照片是否存在"
    col_date = "行驶证下次检验日期"
    col_expired = "行驶证是否过期"

    # NOTE(review): the photo-exists and expired markers below are empty
    # strings in this copy of the file, which makes the expired conditional
    # and the `with_photo` count degenerate. They presumably held non-ASCII
    # markers that were lost in transit - confirm against the repository
    # before relying on those columns.
    out_rows = []
    excel_only = 0
    for idx, plate in enumerate(plates):
        if not plate or not plate_files.get(plate):
            # No plate, or no photos for it: emit the plate with blank cells.
            out_rows.append(
                {col_plate: plate, col_photo: "", col_date: "", col_expired: ""}
            )
            continue

        ocr_d = results.get(idx)
        excel_d = parse_excel_inspection_date(excel_inspection.iloc[idx])
        if ocr_d is None and excel_d is not None:
            excel_only += 1

        final_d = ocr_d or excel_d
        if final_d:
            date_str = final_d.strftime("%Y-%m-%d")
            expired = "" if final_d < today else ""
        else:
            date_str = ""
            expired = ""
        out_rows.append(
            {
                col_plate: plate,
                col_photo: "",
                col_date: date_str,
                col_expired: expired,
            }
        )

    # Passing the column list keeps the headers (and the summary below)
    # working even when the input sheet has no rows.
    out_df = pd.DataFrame(
        out_rows, columns=[col_plate, col_photo, col_date, col_expired]
    )

    out_dir = os.path.dirname(EXCEL_OUT)
    if out_dir:  # makedirs("") raises when EXCEL_OUT is a bare file name
        os.makedirs(out_dir, exist_ok=True)
    with pd.ExcelWriter(EXCEL_OUT, engine="openpyxl") as writer:
        out_df.to_excel(writer, index=False, sheet_name="行驶证OCR")

    has_date = out_df[col_date] != ""
    with_photo = out_df[col_photo] == ""
    ocr_ok = sum(1 for idx, _plate, _files in tasks if results.get(idx))
    print(
        f"完成: {EXCEL_OUT}\n"
        f"今日日期: {today.isoformat()}\n"
        f"有照片: {int(with_photo.sum())}\n"
        f"OCR 识别到检验日期: {ocr_ok}\n"
        f"OCR 未识别、第三列由表内「{EXCEL_INSPECTION_COL}」补全: {excel_only}\n"
        f"有检验日期(第三列非空): {int(has_date.sum())}"
    )
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()