feat: sync full workspace including web modules, docs, and configurations to Gitea
Optimized the root .gitignore to exclude virtual environments, node modules, and temp folders to ensure clean and lightweight version tracking. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
335
scripts/generate_vehicle_license_ocr_report.py
Normal file
335
scripts/generate_vehicle_license_ocr_report.py
Normal file
@@ -0,0 +1,335 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""从行驶证影像 OCR 提取检验日期,与车辆表合并导出 Excel。
|
||||
|
||||
有照片的车辆:先对影像做 OCR;若未识别到「检验有效期」,再用表内「行驶证检验有效期」补全第三列。
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import calendar
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from datetime import date
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import fitz # pymupdf
|
||||
import pandas as pd
|
||||
from PIL import Image, ImageEnhance
|
||||
|
||||
# Input / output locations (edit to match the local machine).
EXCEL_IN = "/Users/sylvawong/Downloads/车辆信息-1776270214730.xlsx"
PHOTO_DIR = "/Users/sylvawong/Desktop/证件信息梳理/行驶证"
EXCEL_OUT = "/Users/sylvawong/Desktop/CURSOR/ONE-OS/车辆行驶证_OCR核对.xlsx"
# Column of the input sheet used as a fallback when OCR yields no date.
EXCEL_INSPECTION_COL = "行驶证检验有效期"

# Tesseract executable and the language list passed to it via "-l".
TESSERACT = "/opt/homebrew/bin/tesseract"
OCR_LANG = "chi_sim+eng"
# Images whose longer side exceeds this many pixels are downscaled before OCR.
MAX_IMAGE_SIDE = 2200
# Number of worker threads used to OCR plates concurrently.
MAX_WORKERS = 3
# At most this many leading pages of a PDF are rendered and OCR'd.
MAX_PDF_PAGES = 3
# Zoom factor applied on both axes when rendering a PDF page.
PDF_ZOOM = 2.5
|
||||
|
||||
|
||||
def extract_plate_from_filename(basename_no_ext: str) -> Optional[str]:
    """Derive a licence-plate string from a photo file's base name.

    A trailing " (n)" copy suffix is removed, everything from the first
    "行驶证" onwards is dropped, trailing separators are stripped, and the text
    before the first "-" is taken as the candidate plate.

    Args:
        basename_no_ext: File name without directory and extension.

    Returns:
        The plate when the candidate is one CJK character followed by 1-14
        characters from A-Z, 0-9, CJK or "·", and is at least 6 characters
        long once trailing separators are stripped; otherwise ``None``.
    """
    stem = re.sub(r"\s*\(\d+\)\s*$", "", basename_no_ext)
    if "行驶证" in stem:
        stem = stem.split("行驶证")[0]
    stem = stem.rstrip("-_· ")
    if not stem:
        return None

    # The plate is the first "-"-separated field whether or not a 17-character
    # VIN follows it, so no VIN check is needed to pick the candidate.
    candidate = stem.split("-")[0]

    match = re.match(r"^([\u4e00-\u9fa5][A-Z0-9\u4e00-\u9fa5·]{1,14})$", candidate)
    if match is None:
        return None
    plate = match.group(1).rstrip("-_·")
    return plate if len(plate) >= 6 else None
|
||||
|
||||
|
||||
def build_plate_files() -> Dict[str, List[str]]:
    """Group the regular files in PHOTO_DIR by the plate parsed from their names.

    Returns:
        Mapping of plate -> list of full paths. Directory entries that are not
        regular files, or whose name yields no plate, are left out.
    """
    by_plate: Dict[str, List[str]] = {}
    for entry in os.listdir(PHOTO_DIR):
        full_path = os.path.join(PHOTO_DIR, entry)
        if os.path.isfile(full_path):
            stem = os.path.splitext(entry)[0]
            plate = extract_plate_from_filename(stem)
            if plate:
                if plate not in by_plate:
                    by_plate[plate] = []
                by_plate[plate].append(full_path)
    return by_plate
|
||||
|
||||
|
||||
def file_try_order(paths: List[str]) -> List[str]:
    """Sort candidate files so the most promising ones are OCR'd first.

    Ranking, by lower-cased base name: annual-inspection pages ("年审", which
    usually carry the inspection record) first, then licence pages ("行驶"),
    then everything else. A PDF ranks slightly after an image of the same
    tier because it has to be rendered first. Ties are broken by name.

    Args:
        paths: Paths of the files belonging to one plate.

    Returns:
        A new list with the same paths in the order they should be tried.
    """

    def score(p: str) -> Tuple[float, str]:
        name = os.path.basename(p).lower()
        if "年审" in name:
            tier = 0.0
        elif "行驶" in name:  # also matches every name containing "行驶证"
            tier = 1.0
        else:
            tier = 1.5
        if name.endswith(".pdf"):
            tier += 0.25
        return (tier, name)

    return sorted(paths, key=score)
|
||||
|
||||
|
||||
def load_image_for_ocr(path: str) -> Image.Image:
    """Open an image file as RGB, downscaled to fit within MAX_IMAGE_SIDE.

    Args:
        path: Path of the image file.

    Returns:
        An RGB image whose longer side is at most MAX_IMAGE_SIDE pixels.
        The aspect ratio is kept, up to integer truncation.
    """
    # convert() returns a fully decoded copy, so the source file can be
    # released as soon as the RGB image exists.
    with Image.open(path) as src:
        img = src.convert("RGB")

    w, h = img.size
    longest = max(w, h)
    if longest > MAX_IMAGE_SIDE:
        scale = MAX_IMAGE_SIDE / longest
        # Clamp to 1 px so extreme aspect ratios never request a zero-sized image.
        new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
        img = img.resize(new_size, Image.LANCZOS)
    return img
|
||||
|
||||
|
||||
def ocr_image(img: Image.Image) -> str:
    """Run the Tesseract CLI on an image and return the recognised text.

    The image is converted to greyscale, its contrast is raised by a factor
    of 1.35, and it is written to a temporary PNG that is handed to the
    TESSERACT executable with OCR_LANG, "--psm 6" and
    "preserve_interword_spaces=1". The temporary file is always removed.

    Args:
        img: Image to recognise.

    Returns:
        Tesseract's standard output, or "" when the process exits non-zero
        or does not finish within 180 seconds.
    """
    gray = ImageEnhance.Contrast(img.convert("L")).enhance(1.35)
    fd, tmp = tempfile.mkstemp(suffix=".png")
    # Only the path is needed; the image is written through PIL below.
    os.close(fd)
    try:
        gray.save(tmp, format="PNG")
        cmd = [
            TESSERACT,
            tmp,
            "stdout",
            "-l",
            OCR_LANG,
            "--psm",
            "6",
            "-c",
            "preserve_interword_spaces=1",
        ]
        try:
            r = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                # Tesseract emits UTF-8; decoding with the locale's encoding
                # can raise or garble Chinese text on non-UTF-8 locales.
                encoding="utf-8",
                errors="replace",
                timeout=180,
            )
        except subprocess.TimeoutExpired:
            # Treat a hung OCR run like a failed one so callers can move on
            # to the next page or file.
            return ""
        if r.returncode != 0:
            return ""
        return r.stdout or ""
    finally:
        try:
            os.unlink(tmp)
        except OSError:
            pass
|
||||
|
||||
|
||||
def ocr_file(path: str) -> str:
    """OCR an image file, best-effort.

    Args:
        path: Path of the image file.

    Returns:
        The recognised text, or "" when loading or OCR raises any exception.
    """
    text = ""
    try:
        text = ocr_image(load_image_for_ocr(path))
    except Exception:
        # Any failure is reported as "no text" so the caller can try other files.
        pass
    return text
|
||||
|
||||
|
||||
def month_end(y: int, m: int) -> date:
    """Return the last calendar day of month *m* in year *y*."""
    _first_weekday, days_in_month = calendar.monthrange(y, m)
    return date(year=y, month=m, day=days_in_month)
|
||||
|
||||
|
||||
def parse_inspection_date(text: str) -> Optional[date]:
    """Extract the inspection-validity date from OCR text.

    Recognised forms include "检验有效期至 2027年06月04日",
    "检验有效期至 2026-03-31", a month-only "检验有效期至 2027年06月" and a date
    directly followed by "检验". An optional ASCII or fullwidth colon may
    follow the label. Patterns are tried from most to least specific, and
    only the first match of each pattern is examined.

    Args:
        text: Raw OCR output; may be empty.

    Returns:
        The parsed date. When only year and month are usable, the last day of
        that month is returned. ``None`` when no pattern yields a year in
        2000-2100 with a month in 1-12.
    """
    if not text:
        return None

    # OCR frequently reads the digit zero as the letter O.
    normalized = text.replace("O", "0").replace("o", "0")

    # Optional colon after the label: ASCII ":" or fullwidth U+FF1A.
    sep = r"\s*[:\uff1a]?\s*"
    cn_loose = r"(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日"
    patterns = (
        "检验有效期至" + sep + cn_loose,
        "检验有效期至" + sep + r"(\d{4})年(\d{1,2})月(\d{1,2})日",
        "检验有效期至" + sep + r"(\d{4})\s*[-/.]\s*(\d{1,2})\s*[-/.]\s*(\d{1,2})",
        "检验有效期至" + sep + r"(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})",
        "检验有效期" + sep + cn_loose,
        "有效期至" + sep + cn_loose,
        # Month-only forms (no group 3): resolved to the end of the month.
        "检验有效期至" + sep + r"(\d{4})\s*年\s*(\d{1,2})\s*月(?!\s*\d{1,2}\s*日)",
        "检验有效期至" + sep + r"(\d{4})年(\d{1,2})月(?!\d{1,2}日)",
        "检验有效期至" + sep + r"(\d{4})\s*年\s*(\d{1,2})\s*月\s*",
        # Date followed by "检验", optionally behind an ASCII or fullwidth "(".
        cn_loose + r"\s*[(\uff08]?\s*检验",
    )

    for pattern in patterns:
        match = re.search(pattern, normalized)
        if match is None:
            continue
        year, month = int(match.group(1)), int(match.group(2))
        if not (2000 <= year <= 2100 and 1 <= month <= 12):
            continue
        if match.lastindex is not None and match.lastindex >= 3:
            day = int(match.group(3))
            if 1 <= day <= 31:
                try:
                    return date(year, month, day)
                except ValueError:
                    # Impossible day for this month (e.g. 30 February):
                    # let a later, looser pattern decide.
                    continue
        # No day captured, or a day outside 1-31: fall back to month end.
        return month_end(year, month)
    return None
|
||||
|
||||
|
||||
def ocr_inspection_from_path(path: str) -> Optional[date]:
    """OCR one file and return the inspection date found in it, if any.

    Image files are OCR'd as a whole. For a ".pdf" file, up to MAX_PDF_PAGES
    leading pages are rendered at PDF_ZOOM, downscaled to MAX_IMAGE_SIDE and
    OCR'd one by one until a date is found.

    Args:
        path: Path of an image or PDF file.

    Returns:
        The first inspection date recognised, or ``None``. A PDF that cannot
        be opened, or a page that cannot be rendered or OCR'd, is treated as
        yielding no date (best-effort, like the image path) so that the
        caller can still try the plate's remaining files.
    """
    ext = os.path.splitext(path)[1].lower()
    if ext != ".pdf":
        return parse_inspection_date(ocr_file(path))

    try:
        doc = fitz.open(path)
    except Exception:
        return None
    try:
        mat = fitz.Matrix(PDF_ZOOM, PDF_ZOOM)
        for i in range(min(len(doc), MAX_PDF_PAGES)):
            try:
                pix = doc.load_page(i).get_pixmap(matrix=mat, alpha=False)
                mode = "RGB" if pix.n < 4 else "RGBA"
                img = Image.frombytes(mode, (pix.width, pix.height), pix.samples)
                if mode == "RGBA":
                    img = img.convert("RGB")
                w, h = img.size
                if max(w, h) > MAX_IMAGE_SIDE:
                    s = MAX_IMAGE_SIDE / max(w, h)
                    img = img.resize((int(w * s), int(h * s)), Image.LANCZOS)
                text = ocr_image(img)
            except Exception:
                # A broken page must not prevent the following pages from being tried.
                continue
            found = parse_inspection_date(text)
            if found:
                return found
    finally:
        doc.close()
    return None
|
||||
|
||||
|
||||
def parse_excel_inspection_date(val) -> Optional[date]:
    """Convert a cell value from the inspection-date column to a date.

    Args:
        val: Raw cell value as read by pandas.

    Returns:
        The calendar date, or ``None`` when the cell is ``None``, a float
        NaN, or cannot be parsed by ``pd.to_datetime``.
    """
    is_nan_float = isinstance(val, float) and pd.isna(val)
    if val is None or is_nan_float:
        return None
    parsed = pd.to_datetime(val, errors="coerce")
    return None if pd.isna(parsed) else parsed.date()
|
||||
|
||||
|
||||
def next_inspection_for_plate(paths: List[str]) -> Optional[date]:
    """Return the first inspection date OCR finds among a plate's files.

    At most the first four files, in the order given by ``file_try_order``,
    are tried; OCR stops at the first file that yields a date.

    Args:
        paths: Paths of the files belonging to one plate.

    Returns:
        The recognised date, or ``None`` when none of the tried files yields one.
    """
    candidates = file_try_order(paths)[:4]
    # Lazy generator: a file is only OCR'd if all earlier ones found nothing.
    dates = (ocr_inspection_from_path(candidate) for candidate in candidates)
    return next((found for found in dates if found), None)
|
||||
|
||||
|
||||
def main() -> None:
    """Build the OCR check workbook for all vehicles in the input sheet.

    Steps: verify the Tesseract executable exists, index the photo directory
    by plate, read sheet "车辆信息" of EXCEL_IN, OCR the files of every row
    that has photos (MAX_WORKERS threads), fall back to the sheet's
    EXCEL_INSPECTION_COL value when OCR finds no date, write one row per
    vehicle to sheet "行驶证OCR" of EXCEL_OUT, and print summary counts.

    Raises:
        SystemExit: When the Tesseract executable is not found.
    """
    if not os.path.isfile(TESSERACT):
        raise SystemExit(f"未找到 tesseract: {TESSERACT}")

    col_plate = "车牌号"
    col_photo = "行驶证照片是否存在"
    col_date = "行驶证下次检验日期"
    col_expired = "行驶证是否过期"

    today = date.today()
    plate_files = build_plate_files()
    df = pd.read_excel(EXCEL_IN, sheet_name="车辆信息", header=0, engine="openpyxl")
    plates = [str(x).strip() if pd.notna(x) else "" for x in df[col_plate]]
    if EXCEL_INSPECTION_COL in df.columns:
        excel_inspection = df[EXCEL_INSPECTION_COL]
    else:
        # No fallback column: behave as if every cell were empty.
        excel_inspection = pd.Series([pd.NA] * len(df))

    # Rows that have at least one photo, keyed by row position because the
    # same plate may appear on several rows.
    tasks: List[Tuple[int, List[str]]] = [
        (idx, plate_files[plate])
        for idx, plate in enumerate(plates)
        if plate and plate_files.get(plate)
    ]

    results: Dict[int, Optional[date]] = {}
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        fut_to_idx = {
            ex.submit(next_inspection_for_plate, files): idx for idx, files in tasks
        }
        for fut in as_completed(fut_to_idx):
            idx = fut_to_idx[fut]
            try:
                results[idx] = fut.result()
            except Exception:
                # Best-effort: a failing plate falls back to the sheet value.
                results[idx] = None

    out_rows = []
    ocr_ok = 0
    excel_only = 0
    for idx, plate in enumerate(plates):
        row = {col_plate: plate, col_photo: "否", col_date: "", col_expired: ""}
        if plate and plate_files.get(plate):
            row[col_photo] = "是"
            final_d = results.get(idx)
            if final_d is not None:
                ocr_ok += 1
            else:
                final_d = parse_excel_inspection_date(excel_inspection.iloc[idx])
                if final_d is not None:
                    excel_only += 1
            if final_d is not None:
                row[col_date] = final_d.strftime("%Y-%m-%d")
                row[col_expired] = "是" if final_d < today else "否"
        out_rows.append(row)

    # Explicit columns keep the header row even when the sheet has no vehicles.
    out_df = pd.DataFrame(
        out_rows, columns=[col_plate, col_photo, col_date, col_expired]
    )
    os.makedirs(os.path.dirname(EXCEL_OUT), exist_ok=True)
    with pd.ExcelWriter(EXCEL_OUT, engine="openpyxl") as w:
        out_df.to_excel(w, index=False, sheet_name="行驶证OCR")

    with_photo = len(tasks)
    has_date = sum(1 for row in out_rows if row[col_date] != "")
    print(
        f"完成: {EXCEL_OUT}\n"
        f"今日日期: {today.isoformat()}\n"
        f"有照片: {with_photo}\n"
        f"OCR 识别到检验日期: {ocr_ok}\n"
        f"OCR 未识别、第三列由表内「{EXCEL_INSPECTION_COL}」补全: {excel_only}\n"
        f"有检验日期(第三列非空): {has_date}"
    )


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user