Files
王冕 a27e3b8e43 feat: sync full workspace including web modules, docs, and configurations to Gitea
Optimized the root .gitignore to exclude virtual environments, node modules,
and temp folders to ensure clean and lightweight version tracking.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-09 18:12:25 +08:00

355 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Report Validation Script
Ensures research reports meet quality standards before delivery
"""
import argparse
import re
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple
class ReportValidator:
    """Validate the structure and quality of a Markdown research report.

    Every ``_check_*`` method inspects ``self.content``, appends messages to
    ``self.errors`` (blocking) or ``self.warnings`` (advisory), and returns
    ``False`` only when it recorded at least one error.
    """

    # Level-2 headings that must be present; a missing one is an error.
    _REQUIRED_SECTIONS = (
        "Executive Summary",
        "Introduction",
        "Main Analysis",
        "Synthesis",
        "Limitations",
        "Recommendations",
        "Bibliography",
        "Methodology",
    )

    # Recommended sections (warnings if missing, not errors).
    _RECOMMENDED_SECTIONS = (
        "Counterevidence Register",
        "Claims-Evidence Table",
    )

    # (regex, description) pairs marking an abbreviated bibliography.
    _BIBLIOGRAPHY_TRUNCATION_PATTERNS = (
        (r'\[\d+-\d+\]', 'Citation range (e.g., [8-75])'),
        (r'Additional.*citations', 'Phrase "Additional citations"'),
        (r'would be included', 'Phrase "would be included"'),
        (r'\[\.\.\.continue', 'Pattern "[...continue"'),
        (r'\[Continue with', 'Pattern "[Continue with"'),
        (r'etc\.(?!\w)', 'Standalone "etc."'),
        (r'and so on', 'Phrase "and so on"'),
    )

    # (regex, description) pairs marking a report body that was cut short.
    _CONTENT_TRUNCATION_PATTERNS = (
        (r'Content continues', 'Phrase "Content continues"'),
        (r'Due to length', 'Phrase "Due to length"'),
        (r'would continue', 'Phrase "would continue"'),
        (r'\[Sections \d+-\d+', 'Pattern "[Sections X-Y"'),
        (r'Additional sections', 'Phrase "Additional sections"'),
        (r'comprehensive.*word document that continues',
         'Pattern "comprehensive...document that continues"'),
    )

    # Bare tokens are matched as whole words so that text such as the Roman
    # numeral "XXXIX" is not reported as the placeholder "XXX".
    _PLACEHOLDER_WORDS = ('TBD', 'TODO', 'FIXME', 'XXX')

    # Bracketed markers are unambiguous and are matched as plain substrings.
    _PLACEHOLDER_MARKERS = (
        '[citation needed]', '[needs citation]',
        '[placeholder]', '[TODO]', '[TBD]',
    )

    def __init__(self, report_path: Path):
        """Load the report at *report_path* and start with empty findings.

        The process exits with status 1 if the file cannot be read
        (see ``_read_report``).
        """
        self.report_path = report_path
        self.content = self._read_report()
        self.errors: List[str] = []
        self.warnings: List[str] = []

    def _read_report(self) -> str:
        """Return the report text decoded as UTF-8.

        On an I/O or decoding failure an error line is printed and the
        process exits with status 1.
        """
        try:
            with open(self.report_path, 'r', encoding='utf-8') as f:
                return f.read()
        # OSError covers missing/unreadable files and directories;
        # ValueError covers UnicodeDecodeError and malformed path strings.
        except (OSError, ValueError) as e:
            print(f"❌ ERROR: Cannot read report: {e}")
            sys.exit(1)

    def validate(self) -> bool:
        """Run every check, print progress and a summary.

        Returns:
            True when no check recorded an error (warnings are allowed).
        """
        print(f"\n{'='*60}")
        print(f"VALIDATING REPORT: {self.report_path.name}")
        print(f"{'='*60}\n")

        checks = [
            ("Executive Summary", self._check_executive_summary),
            ("Required Sections", self._check_required_sections),
            ("Citations", self._check_citations),
            ("Bibliography", self._check_bibliography),
            ("Placeholder Text", self._check_placeholders),
            ("Content Truncation", self._check_content_truncation),
            ("Word Count", self._check_word_count),
            ("Source Count", self._check_source_count),
            ("Broken Links", self._check_broken_references),
        ]

        for check_name, check_func in checks:
            print(f"⏳ Checking: {check_name}...", end=" ")
            if check_func():
                print("✅ PASS")
            else:
                print("❌ FAIL")

        self._print_summary()
        return not self.errors

    def _section_span(self, title: str) -> Optional[Tuple[int, int]]:
        """Locate the body of the first section whose heading contains *title*.

        Returns:
            ``(start, end)`` offsets into ``self.content`` covering the text
            from just after the heading title up to the next heading of the
            same or a higher level (but never a level-1 heading), or ``None``
            when no ``## <title>`` heading exists.
        """
        heading = re.search(
            rf'(#*)## {re.escape(title)}', self.content, re.IGNORECASE
        )
        if heading is None:
            return None

        # Extra leading '#' characters mean the heading is deeper than level 2.
        level = len(heading.group(1)) + 2
        start = heading.end()

        # Only a line that starts with 2..level hashes ends the section, so
        # nested subheadings ("###" under "##") and stray mid-line "##" text
        # stay inside the section body.
        next_heading = re.compile(
            rf'^#{{2,{level}}}(?!#)', re.MULTILINE
        ).search(self.content, start)
        end = next_heading.start() if next_heading else len(self.content)
        return start, end

    def _section_body(self, title: str) -> Optional[str]:
        """Return the text of the section headed *title*, or ``None`` if absent."""
        span = self._section_span(title)
        if span is None:
            return None
        start, end = span
        return self.content[start:end]

    def _text_outside_bibliography(self) -> str:
        """Return the report text with the bibliography body removed.

        Bibliography entries are themselves written as ``[n]``, so they must
        be excluded when looking for in-text citation markers.
        """
        span = self._section_span("Bibliography")
        if span is None:
            return self.content
        start, end = span
        # The newline keeps the two halves from fusing into a new token.
        return self.content[:start] + "\n" + self.content[end:]

    def _cited_numbers(self) -> set:
        """Return the set of citation numbers ``[n]`` used outside the bibliography."""
        markers = re.findall(r'\[(\d+)\]', self._text_outside_bibliography())
        return {int(marker) for marker in markers}

    def _check_executive_summary(self) -> bool:
        """Check that an executive summary exists; warn outside 50-250 words."""
        summary = self._section_body("Executive Summary")
        if summary is None:
            self.errors.append("Missing 'Executive Summary' section")
            return False

        word_count = len(summary.split())
        if word_count > 250:
            self.warnings.append(f"Executive summary too long: {word_count} words (should be ≤250)")
        if word_count < 50:
            self.warnings.append(f"Executive summary too short: {word_count} words (should be ≥50)")
        return True

    def _has_heading(self, section: str) -> bool:
        """Return True when some ``##`` heading line mentions *section*."""
        pattern = rf'##.*{re.escape(section)}'
        return re.search(pattern, self.content, re.IGNORECASE) is not None

    def _check_required_sections(self) -> bool:
        """Error on missing required sections; warn on missing recommended ones.

        Recommended sections are only examined when all required sections
        are present.
        """
        missing = [s for s in self._REQUIRED_SECTIONS if not self._has_heading(s)]
        if missing:
            self.errors.append(f"Missing sections: {', '.join(missing)}")
            return False

        missing_recommended = [
            s for s in self._RECOMMENDED_SECTIONS if not self._has_heading(s)
        ]
        if missing_recommended:
            self.warnings.append(f"Missing recommended sections (for academic rigor): {', '.join(missing_recommended)}")
        return True

    def _check_citations(self) -> bool:
        """Check that the report body cites sources as ``[1]``, ``[2]``, ...

        Errors when the body has no citation marker at all; warns when fewer
        than 10 distinct sources are cited or when the numbering has gaps.
        """
        cited = self._cited_numbers()
        if not cited:
            self.errors.append("No citations found in report")
            return False

        if len(cited) < 10:
            self.warnings.append(f"Only {len(cited)} unique sources cited (recommended: ≥10)")

        # Citation numbers are expected to run 1..max without holes.
        missing = sorted(set(range(1, max(cited) + 1)) - cited)
        if missing:
            self.warnings.append(f"Non-consecutive citation numbers, missing: {missing}")
        return True

    def _check_bibliography(self) -> bool:
        """Check the bibliography is complete and consistent with the citations.

        Errors on: a missing section, truncation placeholders, no ``[n]``
        entries, gaps in entry numbering, or in-text citations without an
        entry. Warns about entries that are never cited in the text.
        """
        bib_section = self._section_body("Bibliography")
        if bib_section is None:
            self.errors.append("Missing 'Bibliography' section")
            return False

        # CRITICAL: a bibliography that was abbreviated instead of written out.
        for pattern_re, description in self._BIBLIOGRAPHY_TRUNCATION_PATTERNS:
            if re.search(pattern_re, bib_section, re.IGNORECASE):
                self.errors.append(f"⚠️ CRITICAL: Bibliography contains truncation placeholder: {description}")
                self.errors.append(" This makes the report UNUSABLE - complete bibliography required")
                return False

        # Entries are lines that begin with "[n]".
        bib_nums = {
            int(n) for n in re.findall(r'^\[(\d+)\]', bib_section, re.MULTILINE)
        }
        if not bib_nums:
            self.errors.append("Bibliography has no entries")
            return False

        # Entry numbers must run 1..max with no gaps.
        missing = [n for n in range(1, max(bib_nums) + 1) if n not in bib_nums]
        if missing:
            self.errors.append(f"Bibliography has gaps in numbering: missing {missing}")
            return False

        # Compare as integers so ordering is numeric and "[01]" equals "[1]".
        cited = self._cited_numbers()

        missing_in_bib = cited - bib_nums
        if missing_in_bib:
            self.errors.append(f"Citations missing from bibliography: {sorted(missing_in_bib)}")
            return False

        unused = bib_nums - cited
        if unused:
            self.warnings.append(f"Unused bibliography entries: {sorted(unused)}")
        return True

    def _check_placeholders(self) -> bool:
        """Check for placeholder text that shouldn't be in a final report."""
        found_placeholders = [
            word for word in self._PLACEHOLDER_WORDS
            if re.search(rf'\b{re.escape(word)}\b', self.content)
        ]
        found_placeholders.extend(
            marker for marker in self._PLACEHOLDER_MARKERS
            if marker in self.content
        )

        if found_placeholders:
            self.errors.append(f"Found placeholder text: {', '.join(found_placeholders)}")
            return False
        return True

    def _check_content_truncation(self) -> bool:
        """Check for phrases showing the report body was cut short.

        Only the first matching pattern is reported.
        """
        for pattern_re, description in self._CONTENT_TRUNCATION_PATTERNS:
            if re.search(pattern_re, self.content, re.IGNORECASE):
                self.errors.append(f"⚠️ CRITICAL: Content truncation detected: {description}")
                self.errors.append(" Report is INCOMPLETE and UNUSABLE - regenerate with progressive assembly")
                return False
        return True

    def _check_word_count(self) -> bool:
        """Warn when the whole report is shorter than 500 words.

        There is deliberately no upper limit.
        """
        word_count = len(self.content.split())
        if word_count < 500:
            self.warnings.append(f"Report is very short: {word_count} words (consider expanding)")
        return True

    def _check_source_count(self) -> bool:
        """Warn when the bibliography lists fewer than 10 distinct sources."""
        bib_section = self._section_body("Bibliography")
        if bib_section is None:
            return True  # Already reported by the bibliography check.

        bib_entries = re.findall(r'^\[(\d+)\]', bib_section, re.MULTILINE)
        source_count = len({int(n) for n in bib_entries})
        if source_count < 10:
            self.warnings.append(f"Only {source_count} sources (recommended: ≥10)")
        return True

    def _check_broken_references(self) -> bool:
        """Check that relative Markdown links ``[text](./path)`` point to existing files.

        Paths are resolved against the directory containing the report.
        """
        internal_links = re.findall(r'\[.*?\]\((\.\/.*?)\)', self.content)
        base_dir = self.report_path.parent

        # Drop any "#anchor" suffix before testing the filesystem.
        broken = [
            link for link in internal_links
            if not (base_dir / link.split('#')[0]).exists()
        ]

        if broken:
            self.errors.append(f"Broken internal links: {', '.join(broken)}")
            return False
        return True

    def _print_summary(self):
        """Print collected errors and warnings followed by the overall verdict."""
        print(f"\n{'='*60}")
        print("VALIDATION SUMMARY")
        print(f"{'='*60}\n")

        if self.errors:
            print(f"❌ ERRORS ({len(self.errors)}):")
            for error in self.errors:
                print(f"{error}")
            print()

        if self.warnings:
            print(f"⚠️ WARNINGS ({len(self.warnings)}):")
            for warning in self.warnings:
                print(f"{warning}")
            print()

        if not self.errors and not self.warnings:
            print("✅ ALL CHECKS PASSED - Report meets quality standards!\n")
        elif not self.errors:
            print("✅ VALIDATION PASSED (with warnings)\n")
        else:
            print("❌ VALIDATION FAILED - Please fix errors before delivery\n")
def main():
    """Command-line entry point: validate the report named by ``--report``.

    Exits with status 0 when validation passes, and with status 1 when the
    report file does not exist or validation records any error.
    """
    parser = argparse.ArgumentParser(
        description="Validate research report quality",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python validate_report.py --report report.md
python validate_report.py -r ~/.claude/research_output/research_report_20251104_153045.md
"""
    )
    parser.add_argument(
        '--report', '-r',
        type=str,
        required=True,
        help='Path to research report markdown file'
    )
    args = parser.parse_args()

    # Expand a leading "~" ourselves: the shell does not do it when the
    # script is launched without one (e.g. from another program).
    report_path = Path(args.report).expanduser()
    if not report_path.exists():
        print(f"❌ ERROR: Report file not found: {report_path}")
        sys.exit(1)

    validator = ReportValidator(report_path)
    passed = validator.validate()
    sys.exit(0 if passed else 1)
if __name__ == '__main__':
    # Run the CLI only when executed as a script, not when imported.
    main()