#!/usr/bin/env python3 # -*- coding: utf-8 -*- """扫描商业/交强/超赔目录下保单文件,按导入模板 15 列结构生成已填报 Excel。""" from __future__ import annotations import calendar import os import re from typing import Any, Dict, List, Optional, Tuple import pandas as pd TEMPLATE_IN = "/Users/sylvawong/Downloads/保险采购信息导入模板-1776319598586.xlsx" VEHICLE_XLSX = "/Users/sylvawong/Downloads/车辆信息-1776270214730.xlsx" BASE_POLICY_DIR = "/Users/sylvawong/Desktop/保单-2025.7-2026.4" DIR_COMMERCIAL = os.path.join(BASE_POLICY_DIR, "商业") DIR_COMPULSORY = os.path.join(BASE_POLICY_DIR, "交强") DIR_EXCESS = os.path.join(BASE_POLICY_DIR, "超赔") OUTPUT_XLSX = "/Users/sylvawong/Desktop/CURSOR/ONE-OS/保险采购信息导入模板-已填报.xlsx" SHEET_NAME = "保险采购信息导入模板" PRODUCT_COMMERCIAL = "商业险" PRODUCT_COMPULSORY = "交强险" PRODUCT_EXCESS = "超赔险" PLATE_CORE = r"[\u4e00-\u9fa5][A-Z0-9\u4e00-\u9fa5·挂]{5,14}" PAT_COMMERCIAL = re.compile(rf"^({PLATE_CORE})[_\-]?商业", re.I) PAT_COMPULSORY = re.compile(rf"交强险[_\-]({PLATE_CORE})[_\-]", re.I) PAT_COMPULSORY_ALT = re.compile(rf"\d{{4}}交强险[_]({PLATE_CORE})[_]", re.I) PAT_COMPULSORY_TIGHT = re.compile(rf"^\d{{4}}交强险({PLATE_CORE})[_]", re.I) PAT_COMPULSORY_PLATE_FIRST = re.compile(rf"^({PLATE_CORE})_交强险", re.I) PAT_EXCESS = re.compile(rf"^({PLATE_CORE})[_\-]?超赔", re.I) PAT_PERIOD = re.compile(r"(\d{4})\.(\d{1,2})-(\d{4})\.(\d{1,2})") PAT_NEW_CAR_COMM = re.compile(r"^新车_商业_(.+)\.(pdf|jpg|jpeg|png)$", re.I) PAT_NEW_CAR_JQ = re.compile(r"^新车_交强_(.+)\.(pdf|jpg|jpeg|png)$", re.I) def iter_policy_files(root: str) -> List[str]: out: List[str] = [] if not os.path.isdir(root): return out for dp, _, fns in os.walk(root): for fn in fns: low = fn.lower() if low.endswith((".pdf", ".jpg", ".jpeg", ".png")): out.append(os.path.join(dp, fn)) return sorted(out) def plate_from_filename(path: str, kind: str) -> Optional[str]: fn = os.path.basename(path) if kind == "commercial": m = PAT_COMMERCIAL.match(fn) return m.group(1) if m else None if kind == "compulsory": m = ( PAT_COMPULSORY.search(fn) or PAT_COMPULSORY_ALT.search(fn) or PAT_COMPULSORY_TIGHT.search(fn) or PAT_COMPULSORY_PLATE_FIRST.search(fn) ) return m.group(1) if m else None if kind == "excess": m = PAT_EXCESS.match(fn) return m.group(1) if m else None return None def period_from_filename(fn: str) -> Tuple[Optional[pd.Timestamp], Optional[pd.Timestamp]]: m = PAT_PERIOD.search(fn) if not m: return None, None y1, mo1, y2, mo2 = map(int, m.groups()) start = pd.Timestamp(year=y1, month=mo1, day=1) last = calendar.monthrange(y2, mo2)[1] end = pd.Timestamp(year=y2, month=mo2, day=last) return start, end def policy_no_from_filename(path: str, product: str) -> Optional[str]: fn = os.path.basename(path) if product == PRODUCT_COMMERCIAL: m = PAT_NEW_CAR_COMM.match(fn) if m: return m.group(1).strip() if product == PRODUCT_COMPULSORY: m = PAT_NEW_CAR_JQ.match(fn) if m: return m.group(1).strip() return None def load_vehicle_index() -> Dict[str, pd.Series]: vdf = pd.read_excel(VEHICLE_XLSX, sheet_name=0, header=0, engine="openpyxl") vdf = vdf.drop_duplicates(subset=["车牌号"], keep="first") idx: Dict[str, pd.Series] = {} for _, row in vdf.iterrows(): p = str(row["车牌号"]).strip() if pd.notna(row["车牌号"]) else "" if p: idx[p] = row return idx def split_city(运营城市: Any, 所属省份: Any) -> Tuple[str, str]: prov = str(所属省份).strip() if pd.notna(所属省份) else "" city = "" if pd.notna(运营城市): s = str(运营城市).strip() if "-" in s: parts = s.split("-", 1) if not prov: prov = parts[0].strip() city = parts[1].strip() if len(parts) > 1 else "" else: city = s return prov, city def empty_row_template(columns: List[str]) -> Dict[str, Any]: return {c: None for c in columns} def build_rows_for_dir( root: str, kind: str, product: str, vidx: Dict[str, pd.Series], columns: List[str], ) -> List[Dict[str, Any]]: rows: List[Dict[str, Any]] = [] for path in iter_policy_files(root): fn = os.path.basename(path) plate = plate_from_filename(path, kind) or "" row = empty_row_template(columns) pno = policy_no_from_filename(path, product) row["保单号"] = pno if pno else None row["车牌号"] = plate if plate else None row["保险产品"] = product st, en = period_from_filename(fn) if st is not None: row["保险开始时间"] = st if en is not None: row["保险结束时间"] = en if plate and plate in vidx: vr = vidx[plate] vin = vr.get("VIN") row["保险公司"] = vin if pd.notna(vin) else None prov, city = split_city(vr.get("运营城市"), vr.get("所属省份")) row["省"] = prov or None row["市"] = city or None rows.append(row) return rows def main() -> None: tpl = pd.read_excel(TEMPLATE_IN, sheet_name=0, header=0) columns = tpl.columns.tolist() required = [ "保单号", "车牌号", "保险公司", "保险产品", "保险费用金额", "保险额度", "省", "市", "保险开始时间", "保险结束时间", "缴费时间", "保险公司联系人", "保险公司联系电话", "保险采购人", "保险采购人电话", ] if columns != required: raise SystemExit(f"模板列与预期不一致: {columns}") vidx = load_vehicle_index() all_rows: List[Dict[str, Any]] = [] all_rows.extend( build_rows_for_dir(DIR_COMMERCIAL, "commercial", PRODUCT_COMMERCIAL, vidx, columns) ) all_rows.extend( build_rows_for_dir(DIR_COMPULSORY, "compulsory", PRODUCT_COMPULSORY, vidx, columns) ) all_rows.extend(build_rows_for_dir(DIR_EXCESS, "excess", PRODUCT_EXCESS, vidx, columns)) out_df = pd.DataFrame(all_rows, columns=columns) os.makedirs(os.path.dirname(OUTPUT_XLSX), exist_ok=True) with pd.ExcelWriter(OUTPUT_XLSX, engine="openpyxl") as w: out_df.to_excel(w, index=False, sheet_name=SHEET_NAME) print(f"已生成: {OUTPUT_XLSX}") print(f"总行数(每个保单文件一行): {len(out_df)}") for p, n in out_df["保险产品"].value_counts().items(): print(f" {p}: {int(n)}") miss_plate = out_df["车牌号"].isna() | (out_df["车牌号"].astype(str).str.strip() == "") print(f" 未解析到车牌的行: {int(miss_plate.sum())}") if __name__ == "__main__": main()