Files
ONE-OS/scripts/match_insurance_policy_files.py
王冕 a27e3b8e43 feat: sync full workspace including web modules, docs, and configurations to Gitea
Optimized the root .gitignore to exclude virtual environments, node modules,
and temp folders to ensure clean and lightweight version tracking.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-09 18:12:25 +08:00

217 lines
7.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""根据车辆表 + 保险采购导入模板,匹配本地商业/交强/超赔保单文件夹中的 PDF导出分类 Excel。"""
from __future__ import annotations
import os
import re
from typing import Dict, List, Optional, Tuple
import pandas as pd
# 可按需修改
VEHICLE_XLSX = "/Users/sylvawong/Downloads/车辆信息-1776270214730.xlsx"
IMPORT_TEMPLATE = "/Users/sylvawong/Downloads/保险采购信息导入模板-1776306962989.xlsx"
BASE_POLICY_DIR = "/Users/sylvawong/Desktop/保单-2025.7-2026.4"
DIR_COMMERCIAL = os.path.join(BASE_POLICY_DIR, "商业")
DIR_COMPULSORY = os.path.join(BASE_POLICY_DIR, "交强")
DIR_EXCESS = os.path.join(BASE_POLICY_DIR, "超赔")
OUTPUT_XLSX = "/Users/sylvawong/Desktop/CURSOR/ONE-OS/保险文件匹配结果.xlsx"
PRODUCT_COMMERCIAL = "商业险"
PRODUCT_COMPULSORY = "交强险"
PRODUCT_EXCESS = "超赔险"
# 车牌:省份简称 + 字母/数字/挂 等(与行驶证解析类似,略宽松)
PLATE_CORE = r"[\u4e00-\u9fa5][A-Z0-9\u4e00-\u9fa5·挂]{5,14}"
PAT_COMMERCIAL = re.compile(rf"^({PLATE_CORE})[_\-]?商业", re.I)
PAT_COMPULSORY = re.compile(rf"交强险[_\-]({PLATE_CORE})[_\-]", re.I)
PAT_COMPULSORY_ALT = re.compile(rf"\d{{4}}交强险[_]({PLATE_CORE})[_]", re.I)
# 2025交强险粤AGP4597_公司.pdf交强险与车牌之间无下划线
PAT_COMPULSORY_TIGHT = re.compile(rf"^\d{{4}}交强险({PLATE_CORE})[_]", re.I)
# 京A13691F_交强险2025.7-2026.7.pdf
PAT_COMPULSORY_PLATE_FIRST = re.compile(rf"^({PLATE_CORE})_交强险", re.I)
PAT_EXCESS = re.compile(rf"^({PLATE_CORE})[_\-]?超赔", re.I)
def iter_policy_files(root: str) -> List[str]:
out: List[str] = []
if not os.path.isdir(root):
return out
for dp, _, fns in os.walk(root):
for fn in fns:
low = fn.lower()
if low.endswith(".pdf") or low.endswith((".jpg", ".jpeg", ".png")):
out.append(os.path.join(dp, fn))
return sorted(out)
def plate_from_filename(path: str, kind: str) -> Optional[str]:
fn = os.path.basename(path)
if kind == "commercial":
m = PAT_COMMERCIAL.match(fn)
return m.group(1) if m else None
if kind == "compulsory":
m = (
PAT_COMPULSORY.search(fn)
or PAT_COMPULSORY_ALT.search(fn)
or PAT_COMPULSORY_TIGHT.search(fn)
or PAT_COMPULSORY_PLATE_FIRST.search(fn)
)
return m.group(1) if m else None
if kind == "excess":
m = PAT_EXCESS.match(fn)
return m.group(1) if m else None
return None
def build_plate_files(root: str, kind: str) -> Dict[str, List[str]]:
d: Dict[str, List[str]] = {}
unmapped: List[str] = []
for p in iter_policy_files(root):
pl = plate_from_filename(p, kind)
if pl:
d.setdefault(pl, []).append(p)
else:
unmapped.append(p)
return d, unmapped
def normalize_plate(s: str) -> str:
return str(s).strip() if pd.notna(s) else ""
def load_template_by_product() -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
df = pd.read_excel(IMPORT_TEMPLATE, sheet_name=0, header=0)
df = df.dropna(how="all")
df = df[df["保险产品"].notna()]
jq = df[df["保险产品"] == PRODUCT_COMPULSORY].copy()
sy = df[df["保险产品"] == PRODUCT_COMMERCIAL].copy()
cp = df[df["保险产品"] == PRODUCT_EXCESS].copy()
return jq, sy, cp
def vehicle_subset(vdf: pd.DataFrame) -> pd.DataFrame:
cols = [
"车牌号",
"VIN",
"所属省份",
"运营城市",
"登记所有权",
"运营公司",
"车辆类型",
"品牌",
"型号",
"保险状态",
]
keep = [c for c in cols if c in vdf.columns]
return vdf[keep].drop_duplicates(subset=["车牌号"])
def merge_sheet(
v_base: pd.DataFrame,
tpl: pd.DataFrame,
plate_files: Dict[str, List[str]],
product_name: str,
) -> pd.DataFrame:
"""以车辆表车牌为左表;模板与保单文件按车牌关联。"""
if tpl is not None and len(tpl) > 0:
tpl_idx = tpl.drop_duplicates(subset=["车牌号"], keep="first").set_index("车牌号")
else:
tpl_idx = pd.DataFrame().set_index(pd.Index([], name="车牌号"))
rows = []
for _, vr in v_base.iterrows():
plate = normalize_plate(vr["车牌号"])
if not plate:
continue
files = plate_files.get(plate, [])
file_cell = "; ".join(files) if files else ""
if len(tpl_idx.index) and plate in tpl_idx.index:
tr = tpl_idx.loc[plate]
if isinstance(tr, pd.DataFrame):
tr = tr.iloc[0]
row = tr.to_dict()
else:
row = {c: None for c in tpl.columns} if len(tpl.columns) else {}
row["车牌号"] = plate
row["保险产品"] = product_name
for k, val in vr.to_dict().items():
row.setdefault(k, val)
row["保单文件路径"] = file_cell
row["保单文件数量"] = len(files)
rows.append(row)
out = pd.DataFrame(rows)
if "保险产品" in out.columns:
out["保险产品"] = product_name
return out
def unmapped_sheet(
commercial_um: List[str],
compulsory_um: List[str],
excess_um: List[str],
) -> pd.DataFrame:
rows = []
for p in commercial_um:
rows.append({"险种文件夹": "商业", "文件路径": p, "文件名": os.path.basename(p)})
for p in compulsory_um:
rows.append({"险种文件夹": "交强", "文件路径": p, "文件名": os.path.basename(p)})
for p in excess_um:
rows.append({"险种文件夹": "超赔", "文件路径": p, "文件名": os.path.basename(p)})
return pd.DataFrame(rows)
def main() -> None:
vdf = pd.read_excel(VEHICLE_XLSX, sheet_name=0, header=0, engine="openpyxl")
v_base = vehicle_subset(vdf)
jq_tpl, sy_tpl, cp_tpl = load_template_by_product()
sy_map, sy_um = build_plate_files(DIR_COMMERCIAL, "commercial")
jq_map, jq_um = build_plate_files(DIR_COMPULSORY, "compulsory")
cp_map, cp_um = build_plate_files(DIR_EXCESS, "excess")
df_sy = merge_sheet(v_base, sy_tpl, sy_map, PRODUCT_COMMERCIAL)
df_jq = merge_sheet(v_base, jq_tpl, jq_map, PRODUCT_COMPULSORY)
tpl_cols = sy_tpl.columns.tolist() if len(sy_tpl.columns) else jq_tpl.columns.tolist()
if len(cp_tpl) > 0:
df_cp = merge_sheet(v_base, cp_tpl, cp_map, PRODUCT_EXCESS)
else:
df_cp = merge_sheet(
v_base, pd.DataFrame(columns=tpl_cols), cp_map, PRODUCT_EXCESS
)
df_um = unmapped_sheet(sy_um, jq_um, cp_um)
os.makedirs(os.path.dirname(OUTPUT_XLSX), exist_ok=True)
with pd.ExcelWriter(OUTPUT_XLSX, engine="openpyxl") as w:
df_jq.to_excel(w, index=False, sheet_name="交强险")
df_sy.to_excel(w, index=False, sheet_name="商业险")
df_cp.to_excel(w, index=False, sheet_name="超赔险")
df_um.to_excel(w, index=False, sheet_name="未能解析车牌的文件")
def stat(df: pd.DataFrame) -> Tuple[int, int]:
n = len(df)
m = (df["保单文件路径"].astype(str).str.len() > 0).sum() if "保单文件路径" in df else 0
return n, int(m)
print(f"已生成: {OUTPUT_XLSX}")
for name, df in [
("交强险", df_jq),
("商业险", df_sy),
("超赔险", df_cp),
]:
n, m = stat(df)
print(f" {name}: 车辆行数 {n}, 已匹配到本地保单文件 {m}")
print(f" 未能解析车牌的文件: {len(df_um)}")
if __name__ == "__main__":
main()