#!/usr/bin/env python3 # -*- coding: utf-8 -*- """根据车辆表 + 保险采购导入模板,匹配本地商业/交强/超赔保单文件夹中的 PDF,导出分类 Excel。""" from __future__ import annotations import os import re from typing import Dict, List, Optional, Tuple import pandas as pd # 可按需修改 VEHICLE_XLSX = "/Users/sylvawong/Downloads/车辆信息-1776270214730.xlsx" IMPORT_TEMPLATE = "/Users/sylvawong/Downloads/保险采购信息导入模板-1776306962989.xlsx" BASE_POLICY_DIR = "/Users/sylvawong/Desktop/保单-2025.7-2026.4" DIR_COMMERCIAL = os.path.join(BASE_POLICY_DIR, "商业") DIR_COMPULSORY = os.path.join(BASE_POLICY_DIR, "交强") DIR_EXCESS = os.path.join(BASE_POLICY_DIR, "超赔") OUTPUT_XLSX = "/Users/sylvawong/Desktop/CURSOR/ONE-OS/保险文件匹配结果.xlsx" PRODUCT_COMMERCIAL = "商业险" PRODUCT_COMPULSORY = "交强险" PRODUCT_EXCESS = "超赔险" # 车牌:省份简称 + 字母/数字/挂 等(与行驶证解析类似,略宽松) PLATE_CORE = r"[\u4e00-\u9fa5][A-Z0-9\u4e00-\u9fa5·挂]{5,14}" PAT_COMMERCIAL = re.compile(rf"^({PLATE_CORE})[_\-]?商业", re.I) PAT_COMPULSORY = re.compile(rf"交强险[_\-]({PLATE_CORE})[_\-]", re.I) PAT_COMPULSORY_ALT = re.compile(rf"\d{{4}}交强险[_]({PLATE_CORE})[_]", re.I) # 2025交强险粤AGP4597_公司.pdf(交强险与车牌之间无下划线) PAT_COMPULSORY_TIGHT = re.compile(rf"^\d{{4}}交强险({PLATE_CORE})[_]", re.I) # 京A13691F_交强险2025.7-2026.7.pdf PAT_COMPULSORY_PLATE_FIRST = re.compile(rf"^({PLATE_CORE})_交强险", re.I) PAT_EXCESS = re.compile(rf"^({PLATE_CORE})[_\-]?超赔", re.I) def iter_policy_files(root: str) -> List[str]: out: List[str] = [] if not os.path.isdir(root): return out for dp, _, fns in os.walk(root): for fn in fns: low = fn.lower() if low.endswith(".pdf") or low.endswith((".jpg", ".jpeg", ".png")): out.append(os.path.join(dp, fn)) return sorted(out) def plate_from_filename(path: str, kind: str) -> Optional[str]: fn = os.path.basename(path) if kind == "commercial": m = PAT_COMMERCIAL.match(fn) return m.group(1) if m else None if kind == "compulsory": m = ( PAT_COMPULSORY.search(fn) or PAT_COMPULSORY_ALT.search(fn) or PAT_COMPULSORY_TIGHT.search(fn) or PAT_COMPULSORY_PLATE_FIRST.search(fn) ) return m.group(1) if m else None if kind == "excess": m = PAT_EXCESS.match(fn) return m.group(1) if m else None return None def build_plate_files(root: str, kind: str) -> Dict[str, List[str]]: d: Dict[str, List[str]] = {} unmapped: List[str] = [] for p in iter_policy_files(root): pl = plate_from_filename(p, kind) if pl: d.setdefault(pl, []).append(p) else: unmapped.append(p) return d, unmapped def normalize_plate(s: str) -> str: return str(s).strip() if pd.notna(s) else "" def load_template_by_product() -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: df = pd.read_excel(IMPORT_TEMPLATE, sheet_name=0, header=0) df = df.dropna(how="all") df = df[df["保险产品"].notna()] jq = df[df["保险产品"] == PRODUCT_COMPULSORY].copy() sy = df[df["保险产品"] == PRODUCT_COMMERCIAL].copy() cp = df[df["保险产品"] == PRODUCT_EXCESS].copy() return jq, sy, cp def vehicle_subset(vdf: pd.DataFrame) -> pd.DataFrame: cols = [ "车牌号", "VIN", "所属省份", "运营城市", "登记所有权", "运营公司", "车辆类型", "品牌", "型号", "保险状态", ] keep = [c for c in cols if c in vdf.columns] return vdf[keep].drop_duplicates(subset=["车牌号"]) def merge_sheet( v_base: pd.DataFrame, tpl: pd.DataFrame, plate_files: Dict[str, List[str]], product_name: str, ) -> pd.DataFrame: """以车辆表车牌为左表;模板与保单文件按车牌关联。""" if tpl is not None and len(tpl) > 0: tpl_idx = tpl.drop_duplicates(subset=["车牌号"], keep="first").set_index("车牌号") else: tpl_idx = pd.DataFrame().set_index(pd.Index([], name="车牌号")) rows = [] for _, vr in v_base.iterrows(): plate = normalize_plate(vr["车牌号"]) if not plate: continue files = plate_files.get(plate, []) file_cell = "; ".join(files) if files else "" if len(tpl_idx.index) and plate in tpl_idx.index: tr = tpl_idx.loc[plate] if isinstance(tr, pd.DataFrame): tr = tr.iloc[0] row = tr.to_dict() else: row = {c: None for c in tpl.columns} if len(tpl.columns) else {} row["车牌号"] = plate row["保险产品"] = product_name for k, val in vr.to_dict().items(): row.setdefault(k, val) row["保单文件路径"] = file_cell row["保单文件数量"] = len(files) rows.append(row) out = pd.DataFrame(rows) if "保险产品" in out.columns: out["保险产品"] = product_name return out def unmapped_sheet( commercial_um: List[str], compulsory_um: List[str], excess_um: List[str], ) -> pd.DataFrame: rows = [] for p in commercial_um: rows.append({"险种文件夹": "商业", "文件路径": p, "文件名": os.path.basename(p)}) for p in compulsory_um: rows.append({"险种文件夹": "交强", "文件路径": p, "文件名": os.path.basename(p)}) for p in excess_um: rows.append({"险种文件夹": "超赔", "文件路径": p, "文件名": os.path.basename(p)}) return pd.DataFrame(rows) def main() -> None: vdf = pd.read_excel(VEHICLE_XLSX, sheet_name=0, header=0, engine="openpyxl") v_base = vehicle_subset(vdf) jq_tpl, sy_tpl, cp_tpl = load_template_by_product() sy_map, sy_um = build_plate_files(DIR_COMMERCIAL, "commercial") jq_map, jq_um = build_plate_files(DIR_COMPULSORY, "compulsory") cp_map, cp_um = build_plate_files(DIR_EXCESS, "excess") df_sy = merge_sheet(v_base, sy_tpl, sy_map, PRODUCT_COMMERCIAL) df_jq = merge_sheet(v_base, jq_tpl, jq_map, PRODUCT_COMPULSORY) tpl_cols = sy_tpl.columns.tolist() if len(sy_tpl.columns) else jq_tpl.columns.tolist() if len(cp_tpl) > 0: df_cp = merge_sheet(v_base, cp_tpl, cp_map, PRODUCT_EXCESS) else: df_cp = merge_sheet( v_base, pd.DataFrame(columns=tpl_cols), cp_map, PRODUCT_EXCESS ) df_um = unmapped_sheet(sy_um, jq_um, cp_um) os.makedirs(os.path.dirname(OUTPUT_XLSX), exist_ok=True) with pd.ExcelWriter(OUTPUT_XLSX, engine="openpyxl") as w: df_jq.to_excel(w, index=False, sheet_name="交强险") df_sy.to_excel(w, index=False, sheet_name="商业险") df_cp.to_excel(w, index=False, sheet_name="超赔险") df_um.to_excel(w, index=False, sheet_name="未能解析车牌的文件") def stat(df: pd.DataFrame) -> Tuple[int, int]: n = len(df) m = (df["保单文件路径"].astype(str).str.len() > 0).sum() if "保单文件路径" in df else 0 return n, int(m) print(f"已生成: {OUTPUT_XLSX}") for name, df in [ ("交强险", df_jq), ("商业险", df_sy), ("超赔险", df_cp), ]: n, m = stat(df) print(f" {name}: 车辆行数 {n}, 已匹配到本地保单文件 {m}") print(f" 未能解析车牌的文件: {len(df_um)} 个") if __name__ == "__main__": main()