Files
ONE-OS/scripts/fill_insurance_import_from_policies.py
王冕 a27e3b8e43 feat: sync full workspace including web modules, docs, and configurations to Gitea
Optimized the root .gitignore to exclude virtual environments, node modules,
and temp folders to ensure clean and lightweight version tracking.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-09 18:12:25 +08:00

209 lines
7.0 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""扫描商业/交强/超赔目录下保单文件,按导入模板 15 列结构生成已填报 Excel。"""
from __future__ import annotations
import calendar
import os
import re
from typing import Any, Dict, List, Optional, Tuple
import pandas as pd
TEMPLATE_IN = "/Users/sylvawong/Downloads/保险采购信息导入模板-1776319598586.xlsx"
VEHICLE_XLSX = "/Users/sylvawong/Downloads/车辆信息-1776270214730.xlsx"
BASE_POLICY_DIR = "/Users/sylvawong/Desktop/保单-2025.7-2026.4"
DIR_COMMERCIAL = os.path.join(BASE_POLICY_DIR, "商业")
DIR_COMPULSORY = os.path.join(BASE_POLICY_DIR, "交强")
DIR_EXCESS = os.path.join(BASE_POLICY_DIR, "超赔")
OUTPUT_XLSX = "/Users/sylvawong/Desktop/CURSOR/ONE-OS/保险采购信息导入模板-已填报.xlsx"
SHEET_NAME = "保险采购信息导入模板"
PRODUCT_COMMERCIAL = "商业险"
PRODUCT_COMPULSORY = "交强险"
PRODUCT_EXCESS = "超赔险"
PLATE_CORE = r"[\u4e00-\u9fa5][A-Z0-9\u4e00-\u9fa5·挂]{5,14}"
PAT_COMMERCIAL = re.compile(rf"^({PLATE_CORE})[_\-]?商业", re.I)
PAT_COMPULSORY = re.compile(rf"交强险[_\-]({PLATE_CORE})[_\-]", re.I)
PAT_COMPULSORY_ALT = re.compile(rf"\d{{4}}交强险[_]({PLATE_CORE})[_]", re.I)
PAT_COMPULSORY_TIGHT = re.compile(rf"^\d{{4}}交强险({PLATE_CORE})[_]", re.I)
PAT_COMPULSORY_PLATE_FIRST = re.compile(rf"^({PLATE_CORE})_交强险", re.I)
PAT_EXCESS = re.compile(rf"^({PLATE_CORE})[_\-]?超赔", re.I)
PAT_PERIOD = re.compile(r"(\d{4})\.(\d{1,2})-(\d{4})\.(\d{1,2})")
PAT_NEW_CAR_COMM = re.compile(r"^新车_商业_(.+)\.(pdf|jpg|jpeg|png)$", re.I)
PAT_NEW_CAR_JQ = re.compile(r"^新车_交强_(.+)\.(pdf|jpg|jpeg|png)$", re.I)
def iter_policy_files(root: str) -> List[str]:
out: List[str] = []
if not os.path.isdir(root):
return out
for dp, _, fns in os.walk(root):
for fn in fns:
low = fn.lower()
if low.endswith((".pdf", ".jpg", ".jpeg", ".png")):
out.append(os.path.join(dp, fn))
return sorted(out)
def plate_from_filename(path: str, kind: str) -> Optional[str]:
fn = os.path.basename(path)
if kind == "commercial":
m = PAT_COMMERCIAL.match(fn)
return m.group(1) if m else None
if kind == "compulsory":
m = (
PAT_COMPULSORY.search(fn)
or PAT_COMPULSORY_ALT.search(fn)
or PAT_COMPULSORY_TIGHT.search(fn)
or PAT_COMPULSORY_PLATE_FIRST.search(fn)
)
return m.group(1) if m else None
if kind == "excess":
m = PAT_EXCESS.match(fn)
return m.group(1) if m else None
return None
def period_from_filename(fn: str) -> Tuple[Optional[pd.Timestamp], Optional[pd.Timestamp]]:
m = PAT_PERIOD.search(fn)
if not m:
return None, None
y1, mo1, y2, mo2 = map(int, m.groups())
start = pd.Timestamp(year=y1, month=mo1, day=1)
last = calendar.monthrange(y2, mo2)[1]
end = pd.Timestamp(year=y2, month=mo2, day=last)
return start, end
def policy_no_from_filename(path: str, product: str) -> Optional[str]:
fn = os.path.basename(path)
if product == PRODUCT_COMMERCIAL:
m = PAT_NEW_CAR_COMM.match(fn)
if m:
return m.group(1).strip()
if product == PRODUCT_COMPULSORY:
m = PAT_NEW_CAR_JQ.match(fn)
if m:
return m.group(1).strip()
return None
def load_vehicle_index() -> Dict[str, pd.Series]:
vdf = pd.read_excel(VEHICLE_XLSX, sheet_name=0, header=0, engine="openpyxl")
vdf = vdf.drop_duplicates(subset=["车牌号"], keep="first")
idx: Dict[str, pd.Series] = {}
for _, row in vdf.iterrows():
p = str(row["车牌号"]).strip() if pd.notna(row["车牌号"]) else ""
if p:
idx[p] = row
return idx
def split_city(运营城市: Any, 所属省份: Any) -> Tuple[str, str]:
prov = str(所属省份).strip() if pd.notna(所属省份) else ""
city = ""
if pd.notna(运营城市):
s = str(运营城市).strip()
if "-" in s:
parts = s.split("-", 1)
if not prov:
prov = parts[0].strip()
city = parts[1].strip() if len(parts) > 1 else ""
else:
city = s
return prov, city
def empty_row_template(columns: List[str]) -> Dict[str, Any]:
return {c: None for c in columns}
def build_rows_for_dir(
root: str,
kind: str,
product: str,
vidx: Dict[str, pd.Series],
columns: List[str],
) -> List[Dict[str, Any]]:
rows: List[Dict[str, Any]] = []
for path in iter_policy_files(root):
fn = os.path.basename(path)
plate = plate_from_filename(path, kind) or ""
row = empty_row_template(columns)
pno = policy_no_from_filename(path, product)
row["保单号"] = pno if pno else None
row["车牌号"] = plate if plate else None
row["保险产品"] = product
st, en = period_from_filename(fn)
if st is not None:
row["保险开始时间"] = st
if en is not None:
row["保险结束时间"] = en
if plate and plate in vidx:
vr = vidx[plate]
vin = vr.get("VIN")
row["保险公司"] = vin if pd.notna(vin) else None
prov, city = split_city(vr.get("运营城市"), vr.get("所属省份"))
row[""] = prov or None
row[""] = city or None
rows.append(row)
return rows
def main() -> None:
tpl = pd.read_excel(TEMPLATE_IN, sheet_name=0, header=0)
columns = tpl.columns.tolist()
required = [
"保单号",
"车牌号",
"保险公司",
"保险产品",
"保险费用金额",
"保险额度",
"",
"",
"保险开始时间",
"保险结束时间",
"缴费时间",
"保险公司联系人",
"保险公司联系电话",
"保险采购人",
"保险采购人电话",
]
if columns != required:
raise SystemExit(f"模板列与预期不一致: {columns}")
vidx = load_vehicle_index()
all_rows: List[Dict[str, Any]] = []
all_rows.extend(
build_rows_for_dir(DIR_COMMERCIAL, "commercial", PRODUCT_COMMERCIAL, vidx, columns)
)
all_rows.extend(
build_rows_for_dir(DIR_COMPULSORY, "compulsory", PRODUCT_COMPULSORY, vidx, columns)
)
all_rows.extend(build_rows_for_dir(DIR_EXCESS, "excess", PRODUCT_EXCESS, vidx, columns))
out_df = pd.DataFrame(all_rows, columns=columns)
os.makedirs(os.path.dirname(OUTPUT_XLSX), exist_ok=True)
with pd.ExcelWriter(OUTPUT_XLSX, engine="openpyxl") as w:
out_df.to_excel(w, index=False, sheet_name=SHEET_NAME)
print(f"已生成: {OUTPUT_XLSX}")
print(f"总行数(每个保单文件一行): {len(out_df)}")
for p, n in out_df["保险产品"].value_counts().items():
print(f" {p}: {int(n)}")
miss_plate = out_df["车牌号"].isna() | (out_df["车牌号"].astype(str).str.strip() == "")
print(f" 未解析到车牌的行: {int(miss_plate.sum())}")
if __name__ == "__main__":
main()