feat: sync full workspace including web modules, docs, and configurations to Gitea

Optimized the root .gitignore to exclude virtual environments, node modules,
and temp folders to ensure clean and lightweight version tracking.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
王冕
2026-06-09 18:12:25 +08:00
parent 351688006e
commit a27e3b8e43
1510 changed files with 162044 additions and 1517 deletions

View File

@@ -0,0 +1,177 @@
#!/usr/bin/env python3
"""
Citation Management System
Tracks sources, generates citations, and maintains bibliography
"""
from dataclasses import dataclass, field
from typing import List, Dict, Optional
from datetime import datetime
from urllib.parse import urlparse
import hashlib
@dataclass
class Citation:
    """A single bibliographic source plus the renderers that format it.

    Attributes are plain data; ``retrieved_date`` defaults to the day the
    instance is created (``YYYY-MM-DD``) and ``citation_count`` starts at 0.
    """

    id: str
    title: str
    url: str
    authors: Optional[List[str]] = None
    publication_date: Optional[str] = None
    retrieved_date: str = field(
        default_factory=lambda: datetime.now().strftime('%Y-%m-%d')
    )
    source_type: str = "web"  # web, academic, documentation, book, paper
    doi: Optional[str] = None
    citation_count: int = 0

    def to_apa(self, index: int) -> str:
        """Return an APA-like reference line prefixed with ``[index]``.

        One author is printed as-is, two are joined with ``&``, three or more
        collapse to ``<first> et al.``. A missing publication date renders as
        ``(n.d.)``. When there are no authors the line starts directly with
        the date instead of leaving a blank author slot.
        """
        if not self.authors:
            author_str = ""
        elif len(self.authors) == 1:
            author_str = self.authors[0]
        elif len(self.authors) == 2:
            author_str = f"{self.authors[0]} & {self.authors[1]}"
        else:
            author_str = f"{self.authors[0]} et al"

        # Names such as "Smith, J." already end with a period; only add the
        # terminating one when it is missing so we never print "J..".
        if author_str and not author_str.endswith('.'):
            author_str += '.'

        date_str = f"({self.publication_date})" if self.publication_date else "(n.d.)"
        lead = f"{author_str} {date_str}" if author_str else date_str
        return f"[{index}] {lead}. {self.title}. Retrieved {self.retrieved_date}, from {self.url}"

    def to_inline(self, index: int) -> str:
        """Return the inline marker ``[index]``."""
        return f"[{index}]"

    def to_markdown(self, index: int) -> str:
        """Return a numbered Markdown link followed by the retrieval date."""
        return f"[{index}] [{self.title}]({self.url}) (Retrieved: {self.retrieved_date})"
class CitationManager:
    """Registry of sources that numbers them in first-seen order.

    Each source is keyed by the first 8 hex digits of the MD5 of its URL, so
    adding the same URL again returns the same ID. ``citation_order`` keeps
    insertion order and defines the ``[n]`` numbers used by every renderer.
    """

    # Bibliography style name -> name of the Citation method that renders it.
    _RENDERERS = {"markdown": "to_markdown", "apa": "to_apa"}

    def __init__(self):
        self.citations: Dict[str, Citation] = {}
        self.citation_order: List[str] = []

    def add_source(
        self,
        url: str,
        title: str,
        authors: Optional[List[str]] = None,
        publication_date: Optional[str] = None,
        source_type: str = "web",
        doi: Optional[str] = None
    ) -> str:
        """Register a source (once per URL) and return its citation ID.

        Every call, including the first for a URL, increments that source's
        ``citation_count``. Metadata passed on later calls for an already
        known URL is ignored.
        """
        # The ID is a short content hash of the URL, not a security token.
        citation_id = hashlib.md5(url.encode()).hexdigest()[:8]
        if citation_id not in self.citations:
            self.citations[citation_id] = Citation(
                id=citation_id,
                title=title,
                url=url,
                authors=authors,
                publication_date=publication_date,
                source_type=source_type,
                doi=doi
            )
            self.citation_order.append(citation_id)
        self.citations[citation_id].citation_count += 1
        return citation_id

    def get_citation_number(self, citation_id: str) -> Optional[int]:
        """Return the 1-based number of ``citation_id`` or None if unknown."""
        try:
            return self.citation_order.index(citation_id) + 1
        except ValueError:
            return None

    def get_inline_citation(self, citation_id: str) -> str:
        """Return ``[n]`` for a known ID and ``[?]`` for an unknown one."""
        num = self.get_citation_number(citation_id)
        return f"[{num}]" if num else "[?]"

    def generate_bibliography(self, style: str = "markdown") -> str:
        """Render all sources, in citation order, under a Markdown heading.

        Args:
            style: ``"markdown"`` or ``"apa"``.

        Returns:
            The bibliography text, or the literal string
            ``"Unsupported citation style"`` for any other style.
        """
        renderer = self._RENDERERS.get(style)
        if renderer is None:
            return "Unsupported citation style"
        lines = ["## Bibliography\n"]
        for number, citation_id in enumerate(self.citation_order, 1):
            lines.append(getattr(self.citations[citation_id], renderer)(number))
        return "\n".join(lines)

    def get_statistics(self) -> Dict[str, object]:
        """Return summary counts, the top five sources and the uncited titles."""
        return {
            'total_sources': len(self.citations),
            'total_citations': sum(c.citation_count for c in self.citations.values()),
            'source_types': self._count_by_type(),
            'most_cited': self._get_most_cited(5),
            'uncited': self._get_uncited()
        }

    def _count_by_type(self) -> Dict[str, int]:
        """Count sources per ``source_type``."""
        counts: Dict[str, int] = {}
        for citation in self.citations.values():
            counts[citation.source_type] = counts.get(citation.source_type, 0) + 1
        return counts

    def _get_most_cited(self, n: int = 5) -> List[tuple]:
        """Return up to ``n`` ``(number, title, count)`` tuples, most cited first."""
        ranked = sorted(
            self.citations.items(),
            key=lambda item: item[1].citation_count,
            reverse=True
        )
        return [(self.get_citation_number(cid), c.title, c.citation_count)
                for cid, c in ranked[:n]]

    def _get_uncited(self) -> List[str]:
        """Return titles whose ``citation_count`` is 0.

        NOTE(review): add_source() bumps the count on the very first add, so
        sources registered through it never appear here — confirm whether
        counting should happen on use rather than on registration.
        """
        return [c.title for c in self.citations.values() if c.citation_count == 0]

    def export_to_file(self, filepath: str, style: str = "markdown"):
        """Write the bibliography for ``style`` to ``filepath`` as UTF-8."""
        # Explicit encoding: titles may be non-ASCII and the platform default
        # codec is not guaranteed to handle them.
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(self.generate_bibliography(style))
# Example usage: register two sources, then print an inline marker, the
# Markdown bibliography and the collected statistics.
if __name__ == '__main__':
    manager = CitationManager()

    first_id = manager.add_source(
        url="https://example.com/article1",
        title="Understanding Deep Research",
        authors=["Smith, J.", "Johnson, K."],
        publication_date="2025"
    )
    # Second source is registered only so it shows up in the bibliography.
    manager.add_source(
        url="https://example.com/article2",
        title="AI Research Methods",
        source_type="academic"
    )

    inline_marker = manager.get_inline_citation(first_id)
    print(f"Inline citation: {inline_marker}")
    bibliography = manager.generate_bibliography()
    print(f"\nBibliography:\n{bibliography}")
    statistics = manager.get_statistics()
    print(f"\nStatistics:\n{statistics}")

View File

@@ -0,0 +1,330 @@
#!/usr/bin/env python3
"""
Markdown to HTML converter for research reports
Properly converts markdown sections to HTML while preserving structure and formatting
"""
import re
from typing import Tuple
from pathlib import Path
def convert_markdown_to_html(markdown_text: str) -> Tuple[str, str]:
    """
    Convert markdown to HTML in two parts: content and bibliography

    The report is cut at the first line that starts with ``## Bibliography``.
    Everything before it is rendered as content; everything after it (to the
    end of the text) is rendered as the bibliography.

    Args:
        markdown_text: Full markdown report text
    Returns:
        Tuple of (content_html, bibliography_html); bibliography_html is ""
        when the report has no bibliography heading.
    """
    # Anchor to the start of a line so "### Bibliography" or a mid-sentence
    # mention is not mistaken for the heading, and split only once so a
    # later repeat of the heading cannot silently discard text.
    parts = re.split(
        r'^[ \t]*## Bibliography',
        markdown_text,
        maxsplit=1,
        flags=re.MULTILINE
    )
    content_md = parts[0]
    bibliography_md = parts[1] if len(parts) > 1 else ""

    content_html = _convert_content_section(content_md)
    bibliography_html = _convert_bibliography_section(bibliography_md)
    return content_html, bibliography_html
def _convert_content_section(markdown: str) -> str:
    """Convert the main report body (everything before the bibliography) to HTML.

    Steps, in order: drop everything before the first ``## `` heading, turn
    ``##``/``###``/``####`` headings into their HTML equivalents, apply inline
    bold/italic/code, then run the list, table, paragraph and section-closing
    passes. An "Executive Summary" section is additionally wrapped in a
    ``<div class="executive-summary">``.

    Returns "" when the text contains no ``## `` heading at all.
    """
    # Title and front matter precede the first level-2 heading; discard them.
    lines = markdown.split('\n')
    first_section = next(
        (i for i, line in enumerate(lines) if line.startswith('## ')),
        len(lines),
    )
    html = '\n'.join(lines[first_section:])

    # ## Section Title -> opens a section div (closed later by _close_sections)
    html = re.sub(
        r'^## (.+)$',
        r'<div class="section"><h2 class="section-title">\1</h2>',
        html,
        flags=re.MULTILINE
    )
    # ### Subsection
    html = re.sub(
        r'^### (.+)$',
        r'<h3 class="subsection-title">\1</h3>',
        html,
        flags=re.MULTILINE
    )
    # #### Subsubsection
    html = re.sub(
        r'^#### (.+)$',
        r'<h4 class="subsubsection-title">\1</h4>',
        html,
        flags=re.MULTILINE
    )

    # **bold**
    html = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', html)
    # *italic* - the opening asterisk must touch the text and so must the
    # closing one, otherwise a "* " list bullet (or "2 * 3 * 4") would be
    # swallowed as emphasis before the list pass ever sees it.
    html = re.sub(r'\*(?!\s)(.+?)(?<!\s)\*', r'<em>\1</em>', html)
    # `inline code`
    html = re.sub(r'`(.+?)`', r'<code>\1</code>', html)

    html = _convert_lists(html)
    html = _convert_tables(html)
    html = _convert_paragraphs(html)
    html = _close_sections(html)

    # Wrap the executive summary in its own div, if present.
    summary_heading = '<h2 class="section-title">Executive Summary</h2>'
    summary_open = '<div class="executive-summary">'
    html = html.replace(summary_heading, summary_open + summary_heading)

    # _close_sections puts each section's closing </div> on its own line, so
    # the first "\n</div>" after the wrapper is the end of the summary
    # section; add the wrapper's own closing tag right before it to keep the
    # markup balanced.
    wrapper_at = html.find(summary_open)
    if wrapper_at != -1:
        section_end = html.find('\n</div>', wrapper_at)
        if section_end != -1:
            html = html[:section_end] + '\n</div>' + html[section_end:]
    return html
def _convert_bibliography_section(markdown: str) -> str:
"""Convert bibliography section to HTML"""
if not markdown.strip():
return ""
html = markdown
# Convert each [N] citation to a proper bibliography entry
# Look for patterns like [1] Title - URL
html = re.sub(
r'\[(\d+)\]\s*(.+?)\s*-\s*(https?://[^\s\)]+)',
r'<div class="bib-entry"><span class="bib-number">[\1]</span> <a href="\3" target="_blank">\2</a></div>',
html
)
# Convert any remaining **bold** sections
html = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', html)
# Wrap in bibliography content div
html = f'<div class="bibliography-content">{html}</div>'
return html
def _convert_lists(html: str) -> str:
"""Convert markdown lists to HTML lists"""
lines = html.split('\n')
result = []
in_list = False
list_level = 0
for i, line in enumerate(lines):
stripped = line.strip()
# Check for unordered list item
if stripped.startswith('- ') or stripped.startswith('* '):
if not in_list:
result.append('<ul>')
in_list = True
list_level = len(line) - len(line.lstrip())
# Get the content after the marker
content = stripped[2:]
result.append(f'<li>{content}</li>')
# Check for ordered list item
elif re.match(r'^\d+\.\s', stripped):
if not in_list:
result.append('<ol>')
in_list = True
list_level = len(line) - len(line.lstrip())
# Get the content after the number and period
content = re.sub(r'^\d+\.\s', '', stripped)
result.append(f'<li>{content}</li>')
else:
# Not a list item
if in_list:
# Check if we're still in the list (indented continuation)
current_level = len(line) - len(line.lstrip())
if current_level > list_level and stripped:
# Continuation of previous list item
if result[-1].endswith('</li>'):
result[-1] = result[-1][:-5] + ' ' + stripped + '</li>'
continue
else:
# End of list
result.append('</ul>' if '<ul>' in '\n'.join(result[-10:]) else '</ol>')
in_list = False
list_level = 0
result.append(line)
# Close any remaining open list
if in_list:
result.append('</ul>' if '<ul>' in '\n'.join(result[-10:]) else '</ol>')
return '\n'.join(result)
def _convert_tables(html: str) -> str:
"""Convert markdown tables to HTML tables"""
lines = html.split('\n')
result = []
in_table = False
for i, line in enumerate(lines):
if '|' in line and line.strip().startswith('|'):
if not in_table:
result.append('<table>')
in_table = True
# This is the header row
cells = [cell.strip() for cell in line.split('|')[1:-1]]
result.append('<thead><tr>')
for cell in cells:
result.append(f'<th>{cell}</th>')
result.append('</tr></thead>')
result.append('<tbody>')
elif '---' in line:
# Skip separator row
continue
else:
# Data row
cells = [cell.strip() for cell in line.split('|')[1:-1]]
result.append('<tr>')
for cell in cells:
result.append(f'<td>{cell}</td>')
result.append('</tr>')
else:
if in_table:
result.append('</tbody></table>')
in_table = False
result.append(line)
if in_table:
result.append('</tbody></table>')
return '\n'.join(result)
def _convert_paragraphs(html: str) -> str:
"""Wrap non-HTML lines in paragraph tags"""
lines = html.split('\n')
result = []
in_paragraph = False
for line in lines:
stripped = line.strip()
# Skip empty lines
if not stripped:
if in_paragraph:
result.append('</p>')
in_paragraph = False
result.append(line)
continue
# Skip lines that are already HTML tags
if (stripped.startswith('<') and stripped.endswith('>')) or \
stripped.startswith('</') or \
'<h' in stripped or '<div' in stripped or '<ul' in stripped or \
'<ol' in stripped or '<li' in stripped or '<table' in stripped or \
'</div>' in stripped or '</ul>' in stripped or '</ol>' in stripped:
if in_paragraph:
result.append('</p>')
in_paragraph = False
result.append(line)
continue
# Regular text line - wrap in paragraph
if not in_paragraph:
result.append('<p>' + line)
in_paragraph = True
else:
result.append(line)
if in_paragraph:
result.append('</p>')
return '\n'.join(result)
def _close_sections(html: str) -> str:
"""Close all open section divs"""
# Count open and closed divs
open_divs = html.count('<div class="section">')
closed_divs = html.count('</div>')
# Add closing divs for sections
# Each section should be closed before the next section starts
lines = html.split('\n')
result = []
section_open = False
for i, line in enumerate(lines):
if '<div class="section">' in line:
if section_open:
result.append('</div>') # Close previous section
section_open = True
result.append(line)
# Close final section if still open
if section_open:
result.append('</div>')
return '\n'.join(result)
def main():
    """Convert the markdown file named on the command line and print a preview.

    Prints the first 1000 characters of the content HTML and the first 500
    of the bibliography HTML. Exits with status 1 when no file argument is
    given or the file does not exist.
    """
    import sys

    if len(sys.argv) < 2:
        print("Usage: python md_to_html.py <markdown_file>")
        sys.exit(1)

    md_file = Path(sys.argv[1])
    if not md_file.exists():
        print(f"Error: File {md_file} not found")
        sys.exit(1)

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    markdown_text = md_file.read_text(encoding='utf-8')
    content_html, bib_html = convert_markdown_to_html(markdown_text)

    print("=== CONTENT HTML ===")
    print(content_html[:1000])
    print("\n=== BIBLIOGRAPHY HTML ===")
    print(bib_html[:500])


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,578 @@
#!/usr/bin/env python3
"""
Deep Research Engine for Claude Code
Orchestrates comprehensive research across multiple sources with verification and synthesis
"""
import argparse
import json
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, asdict
from enum import Enum
class ResearchPhase(Enum):
    """Stages of the research pipeline, declared in execution order.

    Iterating the enum yields the phases in this order; the string values
    are what gets written to saved state files.
    """

    SCOPE = "scope"
    PLAN = "plan"
    RETRIEVE = "retrieve"
    TRIANGULATE = "triangulate"
    SYNTHESIZE = "synthesize"
    CRITIQUE = "critique"
    REFINE = "refine"
    PACKAGE = "package"
class ResearchMode(Enum):
    """How deep a research run goes; the value is the CLI spelling."""

    # 3 phases: scope, retrieve, package
    QUICK = "quick"
    # 6 phases: skip refine and critique
    STANDARD = "standard"
    # Full 8 phases
    DEEP = "deep"
    # 8 phases + extended iterations
    ULTRADEEP = "ultradeep"
@dataclass
class Source:
    """One retrieved research source and its assessment metadata."""

    url: str
    title: str
    snippet: str
    retrieved_at: str
    credibility_score: float = 0.0
    # One of: web, academic, documentation, code
    source_type: str = "web"
    # One of: unverified, verified, conflicted
    verification_status: str = "unverified"

    def to_citation(self, index: int) -> str:
        """Return the bibliography line ``[index] title - url (Retrieved: ...)``."""
        return f"[{index}] {self.title} - {self.url} (Retrieved: {self.retrieved_at})"
@dataclass
class ResearchState:
    """Everything a research session accumulates, persistable as JSON."""

    query: str
    mode: ResearchMode
    phase: ResearchPhase
    scope: Dict[str, Any]
    plan: Dict[str, Any]
    sources: List[Source]
    findings: List[Dict[str, Any]]
    synthesis: Dict[str, Any]
    critique: Dict[str, Any]
    report: str
    metadata: Dict[str, Any]

    def save(self, filepath: Path):
        """Write the state to ``filepath`` as indented JSON, retrying on I/O errors.

        Up to three attempts are made. After a failed attempt the method
        sleeps 0.5 s, then 1.0 s (linearly growing delay) before trying
        again; a failure on the last attempt raises ``IOError``.
        """
        max_retries = 3
        final_attempt = max_retries - 1
        for attempt in range(max_retries):
            try:
                with open(filepath, 'w') as f:
                    json.dump(self._serialize(), f, indent=2)
            except (IOError, OSError) as e:
                if attempt == final_attempt:
                    raise IOError(f"Failed to save state after {max_retries} attempts: {e}")
                time.sleep((attempt + 1) * 0.5)
            else:
                return

    def _serialize(self) -> dict:
        """Return a JSON-serializable dict: enums by value, sources as dicts."""
        payload = {
            'query': self.query,
            'mode': self.mode.value,
            'phase': self.phase.value,
            'scope': self.scope,
            'plan': self.plan,
            'sources': [asdict(source) for source in self.sources],
        }
        # Remaining fields are stored exactly as held on the instance.
        for key in ('findings', 'synthesis', 'critique', 'report', 'metadata'):
            payload[key] = getattr(self, key)
        return payload

    @classmethod
    def load(cls, filepath: Path) -> 'ResearchState':
        """Rebuild a state from a JSON file written by :meth:`save`."""
        with open(filepath, 'r') as f:
            data = json.load(f)

        restored_sources = [Source(**entry) for entry in data['sources']]
        return cls(
            query=data['query'],
            mode=ResearchMode(data['mode']),
            phase=ResearchPhase(data['phase']),
            scope=data['scope'],
            plan=data['plan'],
            sources=restored_sources,
            findings=data['findings'],
            synthesis=data['synthesis'],
            critique=data['critique'],
            report=data['report'],
            metadata=data['metadata']
        )
class ResearchEngine:
    """Main research orchestration engine.

    The engine does not perform research itself: for each phase it prints
    the instructions for that phase and records the phase in the saved
    state. Output files go to ``~/.claude/research_output``, which is
    created when the engine is constructed.
    """

    def __init__(self, mode: ResearchMode = ResearchMode.STANDARD):
        self.mode = mode
        self.state: Optional[ResearchState] = None
        self.output_dir = Path.home() / ".claude" / "research_output"
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def initialize_research(self, query: str) -> ResearchState:
        """Create, store and return an empty state for ``query``.

        The new state starts in the SCOPE phase and records the start time
        and a version string in its metadata.
        """
        self.state = ResearchState(
            query=query,
            mode=self.mode,
            phase=ResearchPhase.SCOPE,
            scope={},
            plan={},
            sources=[],
            findings=[],
            synthesis={},
            critique={},
            report="",
            metadata={
                'started_at': datetime.now().isoformat(),
                'version': '1.0'
            }
        )
        return self.state

    def get_phase_instructions(self, phase: ResearchPhase) -> str:
        """Return the instruction text for ``phase``.

        Falls back to a short "no instructions" message for a phase that has
        no entry in the table.
        """
        instructions = {
            ResearchPhase.SCOPE: """
# Phase 1: SCOPE
Your task: Define research boundaries and success criteria
## Execute:
1. Decompose the question into 3-5 core components
2. Identify 2-4 key stakeholder perspectives
3. Define what's IN scope and what's OUT of scope
4. List 3-5 success criteria for this research
5. Document 3-5 assumptions that need validation
## Output Format:
```json
{
"core_components": ["component1", "component2", ...],
"stakeholder_perspectives": ["perspective1", "perspective2", ...],
"in_scope": ["item1", "item2", ...],
"out_of_scope": ["item1", "item2", ...],
"success_criteria": ["criteria1", "criteria2", ...],
"assumptions": ["assumption1", "assumption2", ...]
}
```
Use extended reasoning to explore multiple framings before finalizing scope.
""",
            ResearchPhase.PLAN: """
# Phase 2: PLAN
Your task: Create intelligent research roadmap
## Execute:
1. Identify 5-10 primary sources to investigate
2. List 5-10 secondary/backup sources
3. Map knowledge dependencies (what must be understood first)
4. Create 10-15 search query variations
5. Plan triangulation approach (how to verify claims)
6. Define 3-5 quality gates
## Output Format:
```json
{
"primary_sources": ["source_type1", "source_type2", ...],
"secondary_sources": ["source_type1", "source_type2", ...],
"knowledge_dependencies": {"concept1": ["prerequisite1", "prerequisite2"], ...},
"search_queries": ["query1", "query2", ...],
"triangulation_strategy": "description of verification approach",
"quality_gates": ["gate1", "gate2", ...]
}
```
Use Graph-of-Thoughts: branch into 3-4 potential research paths, evaluate, then converge on optimal strategy.
""",
            ResearchPhase.RETRIEVE: """
# Phase 3: RETRIEVE
Your task: Systematically collect information from multiple sources
## Execute:
1. Use WebSearch with iterative query refinement (minimum 10 searches)
2. Use WebFetch to deep-dive into 5-10 most promising sources
3. Extract key passages with metadata
4. Track information gaps
5. Follow 2-3 promising tangents
6. Ensure source diversity (different domains, perspectives)
## Tools to Use:
- WebSearch: For current information and broad coverage
- WebFetch: For detailed extraction from specific URLs
- Grep/Read: For local documentation if relevant
- Task: Spawn 2-3 parallel retrieval agents for efficiency
## Output:
Store all sources with metadata. Each source should include:
- URL/location
- Title
- Key excerpts
- Relevance score
- Source type
- Retrieved timestamp
Aim for 15-30 distinct sources minimum.
""",
            ResearchPhase.TRIANGULATE: """
# Phase 4: TRIANGULATE
Your task: Validate information across multiple independent sources
## Execute:
1. List all major claims from retrieved information
2. For each claim, find 3+ independent confirmatory sources
3. Flag any contradictions or uncertainties
4. Assess source credibility (domain expertise, recency, bias)
5. Document consensus areas vs. debate areas
6. Mark verification status for each claim
## Quality Standards:
- Core claims MUST have 3+ independent sources
- Flag any single-source claims as "unverified"
- Note information recency
- Identify potential biases
## Output Format:
```json
{
"verified_claims": [
{
"claim": "statement",
"sources": ["source1", "source2", "source3"],
"confidence": "high|medium|low"
}
],
"unverified_claims": [...],
"contradictions": [
{
"topic": "what's contradicted",
"viewpoint1": {"claim": "...", "sources": [...]},
"viewpoint2": {"claim": "...", "sources": [...]}
}
]
}
```
""",
            ResearchPhase.SYNTHESIZE: """
# Phase 5: SYNTHESIZE
Your task: Connect insights and generate novel understanding
## Execute:
1. Identify 5-10 key patterns across sources
2. Map relationships between concepts
3. Generate 3-5 insights that go beyond source material
4. Create conceptual frameworks or mental models
5. Build argument structures
6. Develop evidence hierarchies
## Use Extended Reasoning:
- Explore non-obvious connections
- Consider second-order implications
- Think about what sources might be missing
- Generate novel hypotheses
## Output Format:
```json
{
"patterns": ["pattern1", "pattern2", ...],
"concept_relationships": {"concept1": ["related_to1", "related_to2"], ...},
"novel_insights": ["insight1", "insight2", ...],
"frameworks": ["framework_description1", ...],
"key_arguments": [
{
"argument": "main claim",
"supporting_evidence": ["evidence1", "evidence2"],
"strength": "strong|moderate|weak"
}
]
}
```
""",
            ResearchPhase.CRITIQUE: """
# Phase 6: CRITIQUE
Your task: Rigorously evaluate research quality
## Execute Red Team Analysis:
1. Check logical consistency
2. Verify citation completeness
3. Identify gaps or weaknesses
4. Assess balance and objectivity
5. Test alternative interpretations
6. Challenge assumptions
## Red Team Questions:
- What's missing from this research?
- What could be wrong?
- What alternative explanations exist?
- What biases might be present?
- What counterfactuals should be considered?
- What would a skeptic say?
## Output Format:
```json
{
"strengths": ["strength1", "strength2", ...],
"weaknesses": ["weakness1", "weakness2", ...],
"gaps": ["gap1", "gap2", ...],
"biases": ["bias1", "bias2", ...],
"improvements_needed": [
{
"issue": "description",
"recommendation": "how to fix",
"priority": "high|medium|low"
}
]
}
```
""",
            ResearchPhase.REFINE: """
# Phase 7: REFINE
Your task: Address gaps and strengthen weak areas
## Execute:
1. Conduct additional research for identified gaps
2. Strengthen weak arguments with more evidence
3. Add missing perspectives
4. Resolve contradictions where possible
5. Enhance clarity and structure
6. Verify all revised content
## Focus On:
- High priority improvements from critique
- Missing stakeholder perspectives
- Weak evidence chains
- Unclear explanations
## Output:
Updated findings, sources, and synthesis with improvements documented.
""",
            ResearchPhase.PACKAGE: """
# Phase 8: PACKAGE
Your task: Deliver professional, actionable research report
## Generate Complete Report:
```markdown
# Research Report: [Topic]
## Executive Summary
[3-5 key findings bullets]
[Primary recommendation]
[Confidence level: High/Medium/Low]
## Introduction
### Research Question
[Original question]
### Scope & Methodology
[What was investigated and how]
### Key Assumptions
[Important assumptions made]
## Main Analysis
### Finding 1: [Title]
[Detailed explanation with evidence]
[Citations: [1], [2], [3]]
### Finding 2: [Title]
[Detailed explanation with evidence]
[Citations: [4], [5], [6]]
[Continue for all findings...]
## Synthesis & Insights
[Patterns and connections]
[Novel insights]
[Implications]
## Limitations & Caveats
[Known gaps]
[Assumptions]
[Areas of uncertainty]
## Recommendations
[Action items]
[Next steps]
[Further research needs]
## Bibliography
[1] Source 1 full citation
[2] Source 2 full citation
...
## Appendix: Methodology
[Research process]
[Sources consulted]
[Verification approach]
```
Save report to file with timestamp.
"""
        }
        return instructions.get(phase, "No instructions available for this phase")

    def execute_phase(self, phase: ResearchPhase) -> Dict[str, Any]:
        """Print the banner and instructions for ``phase`` and return a status dict.

        The returned dict carries the phase value, the fixed status
        ``'instructions_displayed'`` and an ISO timestamp.
        """
        print(f"\n{'='*80}")
        print(f"PHASE {phase.value.upper()}: Starting...")
        print(f"{'='*80}\n")
        print(self.get_phase_instructions(phase))
        # In real usage, Claude will execute these instructions.
        # This returns a structured result that Claude should populate.
        return {
            'phase': phase.value,
            'status': 'instructions_displayed',
            'timestamp': datetime.now().isoformat()
        }

    def run_pipeline(self, query: str) -> str:
        """Run every phase for the engine's mode and return the report path.

        If ``self.state`` was set beforehand (for example from a saved state
        file) and belongs to the same ``query``, that state is kept and
        updated; otherwise a fresh session is initialised. The state is
        saved after each phase to one state file per run.

        Returns:
            The path (as a string) where the report is expected to be
            written; this method does not create the report itself.
        """
        print(f"\n{'#'*80}")
        print("# DEEP RESEARCH ENGINE")
        print(f"# Query: {query}")
        print(f"# Mode: {self.mode.value}")
        print(f"{'#'*80}\n")

        # Re-initialising unconditionally would throw away a state that the
        # caller loaded for resuming; only start fresh when there is no
        # state yet or it belongs to a different query.
        if self.state is None or self.state.query != query:
            self.initialize_research(query)

        # One timestamp per run, so all phases update the same state file
        # and the report name matches it.
        run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        state_file = self.output_dir / f"research_state_{run_stamp}.json"

        for phase in self._get_phases_for_mode():
            self.state.phase = phase
            self.execute_phase(phase)
            self.state.save(state_file)
            print(f"\n✓ Phase {phase.value} complete. State saved to: {state_file}\n")

        report_file = self.output_dir / f"research_report_{run_stamp}.md"
        print(f"\n{'='*80}")
        print("RESEARCH PIPELINE COMPLETE")
        print(f"Report will be saved to: {report_file}")
        print(f"{'='*80}\n")
        return str(report_file)

    def _get_phases_for_mode(self) -> List[ResearchPhase]:
        """Return the ordered phases for ``self.mode``.

        QUICK runs three phases, STANDARD six (no critique/refine); DEEP,
        ULTRADEEP and any unrecognised mode run all phases.
        """
        if self.mode == ResearchMode.QUICK:
            return [
                ResearchPhase.SCOPE,
                ResearchPhase.RETRIEVE,
                ResearchPhase.PACKAGE
            ]
        if self.mode == ResearchMode.STANDARD:
            return [
                ResearchPhase.SCOPE,
                ResearchPhase.PLAN,
                ResearchPhase.RETRIEVE,
                ResearchPhase.TRIANGULATE,
                ResearchPhase.SYNTHESIZE,
                ResearchPhase.PACKAGE
            ]
        # DEEP and ULTRADEEP currently share the full list; ultradeep might
        # iterate some phases in the future.
        return list(ResearchPhase)
def main():
    """CLI entry point: parse arguments, build the engine and run the pipeline.

    With ``--resume`` the named state file is loaded onto the engine before
    the pipeline runs; a missing file is reported on stderr and exits with
    status 1.
    """
    cli = argparse.ArgumentParser(
        description="Deep Research Engine for Claude Code",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python research_engine.py --query "state of quantum computing 2025" --mode deep
python research_engine.py --query "PostgreSQL vs Supabase comparison" --mode standard
python research_engine.py -q "longevity biotech funding trends" -m ultradeep
"""
    )
    cli.add_argument(
        '--query', '-q',
        type=str,
        required=True,
        help='Research question or topic'
    )
    cli.add_argument(
        '--mode', '-m',
        type=str,
        choices=['quick', 'standard', 'deep', 'ultradeep'],
        default='standard',
        help='Research depth mode (default: standard)'
    )
    cli.add_argument(
        '--resume',
        type=str,
        help='Resume from saved state file'
    )
    options = cli.parse_args()

    engine = ResearchEngine(mode=ResearchMode(options.mode))

    if options.resume:
        saved_state_path = Path(options.resume)
        if not saved_state_path.exists():
            print(f"Error: State file not found: {saved_state_path}", file=sys.stderr)
            sys.exit(1)
        engine.state = ResearchState.load(saved_state_path)
        print(f"Resumed research from: {saved_state_path}")

    report_path = engine.run_pipeline(options.query)
    print(f"\nResearch complete! Report path: {report_path}")
    print(f"\nNow Claude should execute each phase using the displayed instructions.")


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,292 @@
#!/usr/bin/env python3
"""
Source Credibility Evaluator
Assesses source quality, credibility, and potential biases
"""
from dataclasses import dataclass
from typing import List, Dict, Optional
from urllib.parse import urlparse
from datetime import datetime, timedelta
import re
@dataclass
class CredibilityScore:
    """Result of evaluating one source; all numeric scores are on a 0-100 scale."""

    # Weighted combination of the component scores below.
    overall_score: float
    domain_authority: float
    recency: float
    expertise: float
    # Higher means more neutral.
    bias_score: float
    # Short human-readable notes keyed by component name.
    factors: Dict[str, str]
    # One of "high_trust", "moderate_trust", "low_trust", "verify".
    recommendation: str
class SourceEvaluator:
    """Heuristic scorer for source credibility.

    Combines four 0-100 component scores (domain authority, recency,
    expertise, bias) into a weighted overall score and a trust
    recommendation. All checks are string heuristics on the URL, title,
    author and optional content.
    """

    # Domain reputation tiers
    HIGH_AUTHORITY_DOMAINS = {
        # Academic & Research
        'arxiv.org', 'nature.com', 'science.org', 'cell.com', 'nejm.org',
        'thelancet.com', 'springer.com', 'sciencedirect.com', 'plos.org',
        'ieee.org', 'acm.org', 'pubmed.ncbi.nlm.nih.gov',
        # Government & International Organizations
        'nih.gov', 'cdc.gov', 'who.int', 'fda.gov', 'nasa.gov',
        'gov.uk', 'europa.eu', 'un.org',
        # Established Tech Documentation
        'docs.python.org', 'developer.mozilla.org', 'docs.microsoft.com',
        'cloud.google.com', 'aws.amazon.com', 'kubernetes.io',
        # Reputable News (Fact-check verified)
        'reuters.com', 'apnews.com', 'bbc.com', 'economist.com',
        # NOTE: 'nature.com/news' contains a path, so it can never equal a
        # bare host name; nature.com itself is already listed above.
        'nature.com/news', 'scientificamerican.com'
    }
    MODERATE_AUTHORITY_DOMAINS = {
        # Tech News & Analysis
        'techcrunch.com', 'theverge.com', 'arstechnica.com', 'wired.com',
        'zdnet.com', 'cnet.com',
        # Industry Publications
        'forbes.com', 'bloomberg.com', 'wsj.com', 'ft.com',
        # Educational
        'wikipedia.org', 'britannica.com', 'khanacademy.org',
        # Tech Blogs (established)
        'medium.com', 'dev.to', 'stackoverflow.com', 'github.com'
    }
    LOW_AUTHORITY_INDICATORS = [
        'blogspot.com', 'wordpress.com', 'wix.com', 'substack.com'
    ]

    def __init__(self):
        """The evaluator keeps no per-instance state."""
        pass

    def evaluate_source(
        self,
        url: str,
        title: str,
        content: Optional[str] = None,
        publication_date: Optional[str] = None,
        author: Optional[str] = None
    ) -> "CredibilityScore":
        """Score one source and return a ``CredibilityScore``.

        The overall score weights domain authority 35%, expertise 25%,
        recency 20% and bias 20%; every score is rounded to two decimals.
        """
        domain = self._extract_domain(url)

        domain_score = self._evaluate_domain_authority(domain)
        recency_score = self._evaluate_recency(publication_date)
        expertise_score = self._evaluate_expertise(domain, title, author)
        bias_score = self._evaluate_bias(domain, title, content)

        overall = (
            domain_score * 0.35 +
            recency_score * 0.20 +
            expertise_score * 0.25 +
            bias_score * 0.20
        )
        factors = self._identify_factors(
            domain, domain_score, recency_score, expertise_score, bias_score
        )
        recommendation = self._generate_recommendation(overall)

        return CredibilityScore(
            overall_score=round(overall, 2),
            domain_authority=round(domain_score, 2),
            recency=round(recency_score, 2),
            expertise=round(expertise_score, 2),
            bias_score=round(bias_score, 2),
            factors=factors,
            recommendation=recommendation
        )

    def _extract_domain(self, url: str) -> str:
        """Return the lower-case host of ``url`` without port or leading ``www.``.

        Returns "" when the URL has no network location.
        """
        # hostname (unlike netloc) excludes user info and port and is
        # already lower-cased.
        domain = urlparse(url).hostname or ''
        # Strip only a leading "www." label, never a match inside the host.
        if domain.startswith('www.'):
            domain = domain[len('www.'):]
        return domain

    @staticmethod
    def _matches_known_domain(domain: str, known) -> bool:
        """True if ``domain`` or any of its parent domains is in ``known``."""
        labels = domain.split('.')
        return any('.'.join(labels[i:]) in known for i in range(len(labels)))

    def _evaluate_domain_authority(self, domain: str) -> float:
        """Return 90 / 70 / 40 / 55 for high, moderate, low and unknown domains.

        Sub-domains inherit the tier of a listed parent domain, e.g.
        ``en.wikipedia.org`` is rated like ``wikipedia.org``.
        """
        if self._matches_known_domain(domain, self.HIGH_AUTHORITY_DOMAINS):
            return 90.0
        if self._matches_known_domain(domain, self.MODERATE_AUTHORITY_DOMAINS):
            return 70.0
        if any(indicator in domain for indicator in self.LOW_AUTHORITY_INDICATORS):
            return 40.0
        # Unknown domain - moderate skepticism
        return 55.0

    def _evaluate_recency(self, publication_date: Optional[str]) -> float:
        """Score how recent an ISO-8601 ``publication_date`` is (0-100).

        Missing or unparseable dates score 50. Otherwise: under 90 days 100,
        under 1 year 85, under 2 years 70, under 5 years 50, older 30.
        """
        if not publication_date:
            return 50.0  # Unknown date
        try:
            pub_date = datetime.fromisoformat(publication_date.replace('Z', '+00:00'))
        except (ValueError, TypeError, AttributeError):
            return 50.0

        # Use a "now" with the same awareness as the parsed date: subtracting
        # an aware datetime from a naive one raises TypeError.
        age = datetime.now(pub_date.tzinfo) - pub_date
        for limit_days, score in ((90, 100.0), (365, 85.0), (730, 70.0), (1825, 50.0)):
            if age < timedelta(days=limit_days):
                return score
        return 30.0

    def _evaluate_expertise(
        self,
        domain: str,
        title: str,
        author: Optional[str]
    ) -> float:
        """Score expertise indicators (0-100), starting from a base of 50.

        Adds 30 for academic keywords in the domain, 25 for government or
        WHO domains, 20 for documentation, 15 for author credentials; the
        total is capped at 100.
        """
        score = 50.0
        # Academic/research domains get high expertise
        if any(d in domain for d in ['arxiv', 'nature', 'science', 'ieee', 'acm']):
            score += 30
        # Government/official sources: look for a "gov" label so that
        # "gov.uk" counts and "x.government-y.com" does not.
        if 'gov' in domain.split('.') or 'who.int' in domain:
            score += 25
        # Technical documentation
        if 'docs.' in domain or 'documentation' in title.lower():
            score += 20
        # Author credentials (if available)
        if author:
            author_lower = author.lower()
            if any(marker in author_lower for marker in ['dr.', 'phd', 'professor']):
                score += 15
        return min(score, 100.0)

    def _evaluate_bias(
        self,
        domain: str,
        title: str,
        content: Optional[str]
    ) -> float:
        """Score neutrality (0-100, higher = more neutral), starting from 70.

        Subtracts 20 for sensational wording in the title, adds 20 for
        academic keywords in the domain and 10 when the content uses
        balancing phrases; the result is clamped to 0-100.
        """
        score = 70.0  # Start neutral
        sensational_indicators = [
            '!', 'shocking', 'unbelievable', 'you won\'t believe',
            'secret', 'they don\'t want you to know'
        ]
        title_lower = title.lower()
        if any(indicator in title_lower for indicator in sensational_indicators):
            score -= 20
        # Academic sources are typically less biased
        if any(d in domain for d in ['arxiv', 'nature', 'science', 'ieee']):
            score += 20
        # Check for balance in content (if available)
        if content:
            balanced_indicators = ['however', 'although', 'on the other hand', 'critics argue']
            content_lower = content.lower()
            if any(indicator in content_lower for indicator in balanced_indicators):
                score += 10
        return min(max(score, 0), 100.0)

    def _identify_factors(
        self,
        domain: str,
        domain_score: float,
        recency_score: float,
        expertise_score: float,
        bias_score: float
    ) -> Dict[str, str]:
        """Return notes for every component score that is notably high or low.

        Components in the middle band get no entry; ``domain`` itself is not
        used.
        """
        factors = {}
        if domain_score >= 85:
            factors['domain'] = "High authority domain"
        elif domain_score <= 45:
            factors['domain'] = "Low authority domain - verify claims"
        if recency_score >= 85:
            factors['recency'] = "Recent information"
        elif recency_score <= 40:
            factors['recency'] = "Outdated information - verify currency"
        if expertise_score >= 80:
            factors['expertise'] = "Expert source"
        elif expertise_score <= 45:
            factors['expertise'] = "Limited expertise indicators"
        if bias_score >= 80:
            factors['bias'] = "Balanced perspective"
        elif bias_score <= 50:
            factors['bias'] = "Potential bias detected"
        return factors

    def _generate_recommendation(self, overall_score: float) -> str:
        """Map the overall score to a trust label (>=80, >=60, >=40, else verify)."""
        if overall_score >= 80:
            return "high_trust"
        if overall_score >= 60:
            return "moderate_trust"
        if overall_score >= 40:
            return "low_trust"
        return "verify"
# Example usage: score three sample sources and print the result for each
if __name__ == '__main__':
    evaluator = SourceEvaluator()

    # One journal article, one sensational blog post, one documentation page
    demo_sources = (
        {
            'url': 'https://www.nature.com/articles/s41586-2025-12345',
            'title': 'Breakthrough in Quantum Computing',
            'publication_date': '2025-10-15',
        },
        {
            'url': 'https://someblog.wordpress.com/shocking-discovery',
            'title': 'SHOCKING! You Won\'t Believe This Discovery!',
            'publication_date': '2020-01-01',
        },
        {
            'url': 'https://docs.python.org/3/library/asyncio.html',
            'title': 'asyncio — Asynchronous I/O',
            'publication_date': '2025-11-01',
        },
    )

    for sample in demo_sources:
        result = evaluator.evaluate_source(**sample)
        print(f"\nSource: {sample['title']}")
        print(f"URL: {sample['url']}")
        print(f"Overall Score: {result.overall_score}/100")
        print(f"Recommendation: {result.recommendation}")
        print(f"Factors: {result.factors}")

View File

@@ -0,0 +1,354 @@
#!/usr/bin/env python3
"""
Report Validation Script
Ensures research reports meet quality standards before delivery
"""
import argparse
import re
import sys
from pathlib import Path
from typing import List, Tuple, Dict
class ReportValidator:
    """Validate a markdown research report against a set of quality checks.

    Every ``_check_*`` method inspects ``self.content``, records blocking
    problems in ``self.errors`` and advisory ones in ``self.warnings``, and
    returns ``False`` only when it recorded an error.
    """

    def __init__(self, report_path: Path):
        """Load the report at *report_path* (exits the process if unreadable)."""
        self.report_path = report_path
        self.content = self._read_report()
        self.errors: List[str] = []
        self.warnings: List[str] = []

    def _read_report(self) -> str:
        """Return the report text decoded as UTF-8, or exit with status 1."""
        try:
            with open(self.report_path, 'r', encoding='utf-8') as f:
                return f.read()
        except (OSError, UnicodeError) as e:
            print(f"❌ ERROR: Cannot read report: {e}")
            sys.exit(1)

    def _find_section(self, heading: str):
        """Locate the ``## <heading>`` section of the report.

        Returns the ``re.Match`` (group 1 is the section body) or ``None``
        when the heading is absent. The body ends at the next line that
        starts with exactly two ``#`` characters, or at the end of the
        text, so ``###`` subsections stay inside their parent section.
        """
        pattern = rf'## {re.escape(heading)}(.*?)(?=^##(?!#)|\Z)'
        flags = re.DOTALL | re.IGNORECASE | re.MULTILINE
        return re.search(pattern, self.content, flags)

    def validate(self) -> bool:
        """Run all validation checks, print a summary, return True if no errors."""
        print(f"\n{'='*60}")
        print(f"VALIDATING REPORT: {self.report_path.name}")
        print(f"{'='*60}\n")

        checks = [
            ("Executive Summary", self._check_executive_summary),
            ("Required Sections", self._check_required_sections),
            ("Citations", self._check_citations),
            ("Bibliography", self._check_bibliography),
            ("Placeholder Text", self._check_placeholders),
            ("Content Truncation", self._check_content_truncation),
            ("Word Count", self._check_word_count),
            ("Source Count", self._check_source_count),
            ("Broken Links", self._check_broken_references),
        ]

        for check_name, check_func in checks:
            print(f"⏳ Checking: {check_name}...", end=" ")
            passed = check_func()
            if passed:
                print("✅ PASS")
            else:
                print("❌ FAIL")

        self._print_summary()
        return len(self.errors) == 0

    def _check_executive_summary(self) -> bool:
        """Check the executive summary exists; warn outside 50-250 words."""
        match = self._find_section("Executive Summary")
        if not match:
            self.errors.append("Missing 'Executive Summary' section")
            return False

        word_count = len(match.group(1).strip().split())
        if word_count > 250:
            self.warnings.append(f"Executive summary too long: {word_count} words (should be ≤250)")
        if word_count < 50:
            self.warnings.append(f"Executive summary too short: {word_count} words (should be ≥50)")
        return True

    def _check_required_sections(self) -> bool:
        """Check all required sections are present; warn on recommended ones."""
        required = [
            "Executive Summary",
            "Introduction",
            "Main Analysis",
            "Synthesis",
            "Limitations",
            "Recommendations",
            "Bibliography",
            "Methodology"
        ]
        # Recommended sections (warnings if missing, not errors)
        recommended = [
            "Counterevidence Register",
            "Claims-Evidence Table"
        ]

        def has_heading(section: str) -> bool:
            # The name may appear anywhere on a heading line ("## 3. Synthesis").
            return re.search(rf'##.*{re.escape(section)}', self.content, re.IGNORECASE) is not None

        missing = [section for section in required if not has_heading(section)]
        if missing:
            self.errors.append(f"Missing sections: {', '.join(missing)}")
            return False

        # Check recommended sections (warnings only)
        missing_recommended = [section for section in recommended if not has_heading(section)]
        if missing_recommended:
            self.warnings.append(f"Missing recommended sections (for academic rigor): {', '.join(missing_recommended)}")
        return True

    def _check_citations(self) -> bool:
        """Check that numeric citations exist, and warn on few or gappy numbers."""
        # Find all citation references [1], [2], etc.; compare as integers
        cited = {int(c) for c in re.findall(r'\[(\d+)\]', self.content)}
        if not cited:
            self.errors.append("No citations found in report")
            return False

        if len(cited) < 10:
            self.warnings.append(f"Only {len(cited)} unique sources cited (recommended: ≥10)")

        # Check for consecutive citation numbers
        missing = set(range(1, max(cited) + 1)) - cited
        if missing:
            self.warnings.append(f"Non-consecutive citation numbers, missing: {sorted(missing)}")
        return True

    def _check_bibliography(self) -> bool:
        """Check bibliography exists, matches citations, and has no truncation placeholders."""
        match = self._find_section("Bibliography")
        if not match:
            self.errors.append("Missing 'Bibliography' section")
            return False
        bib_section = match.group(1)

        # CRITICAL: Check for truncation placeholders (2025 CiteGuard enhancement)
        truncation_patterns = [
            (r'\[\d+-\d+\]', 'Citation range (e.g., [8-75])'),
            (r'Additional.*citations', 'Phrase "Additional citations"'),
            (r'would be included', 'Phrase "would be included"'),
            (r'\[\.\.\.continue', 'Pattern "[...continue"'),
            (r'\[Continue with', 'Pattern "[Continue with"'),
            (r'etc\.(?!\w)', 'Standalone "etc."'),
            (r'and so on', 'Phrase "and so on"'),
        ]
        for pattern_re, description in truncation_patterns:
            if re.search(pattern_re, bib_section, re.IGNORECASE):
                self.errors.append(f"⚠️ CRITICAL: Bibliography contains truncation placeholder: {description}")
                self.errors.append(f" This makes the report UNUSABLE - complete bibliography required")
                return False

        # Bibliography entries are lines starting with [1], [2], etc.
        bib_nums = {int(n) for n in re.findall(r'^\[(\d+)\]', bib_section, re.MULTILINE)}
        if not bib_nums:
            self.errors.append("Bibliography has no entries")
            return False

        # Check citation number continuity (no gaps)
        gaps = sorted(set(range(1, max(bib_nums) + 1)) - bib_nums)
        if gaps:
            self.errors.append(f"Bibliography has gaps in numbering: missing {gaps}")
            return False

        # In-text citations are looked up outside the bibliography section;
        # scanning the whole file would count every entry as cited and make
        # the unused-entry check below impossible to trigger.
        body = self.content[:match.start()] + self.content[match.end():]
        cited = {int(n) for n in re.findall(r'\[(\d+)\]', body)}

        # Check all citations have bibliography entries
        missing_in_bib = cited - bib_nums
        if missing_in_bib:
            self.errors.append(f"Citations missing from bibliography: {sorted(missing_in_bib)}")
            return False

        # Check for unused bibliography entries
        unused = bib_nums - cited
        if unused:
            self.warnings.append(f"Unused bibliography entries: {sorted(unused)}")
        return True

    def _check_placeholders(self) -> bool:
        """Check for placeholder text that shouldn't be in final report"""
        placeholders = [
            'TBD', 'TODO', 'FIXME', 'XXX',
            '[citation needed]', '[needs citation]',
            '[placeholder]', '[TODO]', '[TBD]'
        ]
        found_placeholders = [p for p in placeholders if p in self.content]
        if found_placeholders:
            self.errors.append(f"Found placeholder text: {', '.join(found_placeholders)}")
            return False
        return True

    def _check_content_truncation(self) -> bool:
        """Check for content truncation patterns (2025 Progressive Assembly enhancement)"""
        truncation_patterns = [
            (r'Content continues', 'Phrase "Content continues"'),
            (r'Due to length', 'Phrase "Due to length"'),
            (r'would continue', 'Phrase "would continue"'),
            (r'\[Sections \d+-\d+', 'Pattern "[Sections X-Y"'),
            (r'Additional sections', 'Phrase "Additional sections"'),
            (r'comprehensive.*word document that continues', 'Pattern "comprehensive...document that continues"'),
        ]
        for pattern_re, description in truncation_patterns:
            if re.search(pattern_re, self.content, re.IGNORECASE):
                self.errors.append(f"⚠️ CRITICAL: Content truncation detected: {description}")
                self.errors.append(f" Report is INCOMPLETE and UNUSABLE - regenerate with progressive assembly")
                return False
        return True

    def _check_word_count(self) -> bool:
        """Warn when the whole report is shorter than 500 words."""
        word_count = len(self.content.split())
        if word_count < 500:
            self.warnings.append(f"Report is very short: {word_count} words (consider expanding)")
        # No upper limit warning - progressive assembly supports unlimited lengths
        return True

    def _check_source_count(self) -> bool:
        """Warn when the bibliography lists fewer than 10 distinct entries."""
        match = self._find_section("Bibliography")
        if not match:
            return True  # Already caught in bibliography check

        bib_entries = re.findall(r'^\[(\d+)\]', match.group(1), re.MULTILINE)
        source_count = len(set(bib_entries))
        if source_count < 10:
            self.warnings.append(f"Only {source_count} sources (recommended: ≥10)")
        return True

    def _check_broken_references(self) -> bool:
        """Check that relative markdown links ``[text](./path)`` point at existing files."""
        internal_links = re.findall(r'\[.*?\]\((\.\/.*?)\)', self.content)
        broken = []
        for link in internal_links:
            # Remove anchor if present
            link_path = link.split('#')[0]
            full_path = self.report_path.parent / link_path
            if not full_path.exists():
                broken.append(link)

        if broken:
            self.errors.append(f"Broken internal links: {', '.join(broken)}")
            return False
        return True

    def _print_summary(self):
        """Print the collected errors and warnings followed by the verdict."""
        print(f"\n{'='*60}")
        print(f"VALIDATION SUMMARY")
        print(f"{'='*60}\n")

        if self.errors:
            print(f"❌ ERRORS ({len(self.errors)}):")
            for error in self.errors:
                print(f"{error}")
            print()

        if self.warnings:
            print(f"⚠️ WARNINGS ({len(self.warnings)}):")
            for warning in self.warnings:
                print(f"{warning}")
            print()

        if not self.errors and not self.warnings:
            print("✅ ALL CHECKS PASSED - Report meets quality standards!\n")
        elif not self.errors:
            print("✅ VALIDATION PASSED (with warnings)\n")
        else:
            print("❌ VALIDATION FAILED - Please fix errors before delivery\n")
def main():
    """Command-line entry point: validate the report given by --report.

    Exits with status 0 when validation passes and 1 when it fails or the
    report file does not exist.
    """
    usage_examples = """
Examples:
python validate_report.py --report report.md
python validate_report.py -r ~/.claude/research_output/research_report_20251104_153045.md
"""
    cli = argparse.ArgumentParser(
        description="Validate research report quality",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    cli.add_argument(
        '--report', '-r',
        type=str,
        required=True,
        help='Path to research report markdown file',
    )
    options = cli.parse_args()

    target = Path(options.report)
    if not target.exists():
        print(f"❌ ERROR: Report file not found: {target}")
        sys.exit(1)

    outcome = ReportValidator(target).validate()
    sys.exit(0 if outcome else 1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,430 @@
#!/usr/bin/env python3
"""
Citation Verification Script (Enhanced with CiteGuard techniques)
Catches fabricated citations by checking:
1. DOI resolution (via doi.org)
2. Basic metadata matching (title similarity, year match)
3. URL accessibility verification
4. Hallucination pattern detection (generic titles, suspicious patterns)
5. Flags suspicious entries for manual review
Enhanced in 2025 with:
- Content alignment checking (when URL available)
- Multi-source verification (DOI + URL + metadata cross-check)
- Advanced hallucination detection patterns
- Better false positive reduction
Usage:
python verify_citations.py --report [path]
python verify_citations.py --report [path] --strict # Fail on any unverified
Does NOT require API keys - uses free DOI resolver and heuristics.
"""
import sys
import argparse
import re
from pathlib import Path
from typing import List, Dict, Tuple
from urllib import request, error
from urllib.parse import quote
import json
import time
class CitationVerifier:
    """Verify the bibliography of a markdown research report.

    Entries are parsed from the ``## Bibliography`` section, screened with
    title/year heuristics, and checked online through doi.org and a HEAD
    request to the entry URL.
    """

    # Sentence punctuation that often trails a DOI/URL in running text but is
    # not part of the identifier itself.
    _TRAILING_PUNCTUATION = '.,;'

    # "ai" must be a whole word (it is a substring of "maintaining", "rail",
    # ...); "llm"/"gpt" must start a word; "transformer" may appear anywhere.
    _MODERN_AI_TERMS = re.compile(r'\b(?:ai\b|llm|gpt)|transformer', re.IGNORECASE)

    def __init__(self, report_path: Path, strict_mode: bool = False):
        """Load the report (exits the process if unreadable).

        Args:
            report_path: Path of the markdown report.
            strict_mode: When true, ``verify_all`` fails on any suspicious
                or unverified citation.
        """
        self.report_path = report_path
        self.strict_mode = strict_mode
        self.content = self._read_report()
        self.suspicious = []
        self.verified = []
        self.errors = []
        # Hallucination detection patterns (2025 CiteGuard enhancement)
        self.suspicious_patterns = [
            # Generic academic-sounding but fake patterns
            (r'^(A |An |The )?(Study|Analysis|Review|Survey|Investigation) (of|on|into)',
             "Generic academic title pattern"),
            (r'^(Recent|Current|Modern|Contemporary) (Advances|Developments|Trends) in',
             "Generic 'advances' title pattern"),
            # Too perfect, templated titles
            (r'^[A-Z][a-z]+ [A-Z][a-z]+: A (Comprehensive|Complete|Systematic) (Review|Analysis|Guide)$',
             "Too perfect, templated structure"),
        ]

    def _read_report(self) -> str:
        """Return the report text decoded as UTF-8, or exit with status 1."""
        try:
            with open(self.report_path, 'r', encoding='utf-8') as f:
                return f.read()
        except Exception as e:
            print(f"ERROR: Cannot read report: {e}")
            sys.exit(1)

    def _parse_entry(self, num: str, raw: str) -> Dict:
        """Build an entry dict from the full (possibly multi-line) entry text.

        Expected shape: ``Author (Year). "Title". Venue. URL``. Any field
        that cannot be found is ``None``.
        """
        year_match = re.search(r'\((\d{4})\)', raw)
        title_match = re.search(r'"([^"]+)"', raw)
        doi_match = re.search(r'doi\.org/(10\.\S+)', raw)
        url_match = re.search(r'https?://[^\s\)]+', raw)
        strip = self._TRAILING_PUNCTUATION
        return {
            'num': num,
            'raw': raw,
            'year': year_match.group(1) if year_match else None,
            'title': title_match.group(1) if title_match else None,
            'doi': doi_match.group(1).rstrip(strip) if doi_match else None,
            'url': url_match.group(0).rstrip(strip) if url_match else None,
        }

    def extract_bibliography(self) -> List[Dict]:
        """Extract bibliography entries from the report.

        Returns:
            One dict per ``[N] ...`` entry with keys ``num``, ``raw``,
            ``year``, ``title``, ``doi`` and ``url``. Continuation lines are
            joined to their entry before parsing. Empty list (and an item in
            ``self.errors``) when there is no Bibliography section.
        """
        # The section ends at the next level-2 heading line, so "###"
        # subsections inside the bibliography do not cut it short.
        match = re.search(
            r'## Bibliography(.*?)(?=^##(?!#)|\Z)',
            self.content,
            re.DOTALL | re.IGNORECASE | re.MULTILINE,
        )
        if not match:
            self.errors.append("No Bibliography section found")
            return []

        entries = []
        num = None
        parts: List[str] = []
        for line in match.group(1).strip().split('\n'):
            line = line.strip()
            if not line or line.startswith('#'):
                # Blank lines and sub-headings belong to no entry
                continue
            start = re.match(r'^\[(\d+)\]\s+(.+)$', line)
            if start:
                if num is not None:
                    entries.append(self._parse_entry(num, ' '.join(parts)))
                num, parts = start.group(1), [start.group(2)]
            elif num is not None:
                # Multi-line entry: keep collecting until the next [N]
                parts.append(line)
        if num is not None:
            entries.append(self._parse_entry(num, ' '.join(parts)))
        return entries

    @staticmethod
    def _metadata_from_csl(data: Dict) -> Dict:
        """Reduce a CSL-JSON record to title, year, authors and venue.

        Titles/venues given as a list are reduced to their first element and
        a missing or empty ``issued.date-parts`` yields ``year=None``.
        """
        def first_text(value):
            if isinstance(value, (list, tuple)):
                value = value[0] if value else ''
            return value or ''

        date_parts = (data.get('issued') or {}).get('date-parts') or [[None]]
        first_date = date_parts[0] or [None]
        return {
            'title': first_text(data.get('title', '')),
            'year': first_date[0],
            'authors': [
                f"{a.get('family', '')} {a.get('given', '')}"
                for a in data.get('author', [])
            ],
            'venue': first_text(data.get('container-title', '')),
        }

    def verify_doi(self, doi: str) -> Tuple[bool, Dict]:
        """
        Verify DOI exists and get metadata.
        Returns (success, metadata_dict); on failure the dict holds an
        ``error`` message (or is empty when no DOI was given).
        """
        if not doi:
            return False, {}
        try:
            # Use content negotiation to get JSON metadata
            url = f"https://doi.org/{quote(doi)}"
            req = request.Request(url)
            req.add_header('Accept', 'application/vnd.citationstyles.csl+json')
            with request.urlopen(req, timeout=10) as response:
                data = json.loads(response.read().decode('utf-8'))
            return True, self._metadata_from_csl(data)
        except error.HTTPError as e:
            if e.code == 404:
                return False, {'error': 'DOI not found (404)'}
            return False, {'error': f'HTTP {e.code}'}
        except Exception as e:
            # Best-effort network check: any other failure (timeout, DNS,
            # malformed JSON) is reported as an unverified DOI, not raised.
            return False, {'error': str(e)}

    def verify_url(self, url: str) -> Tuple[bool, str]:
        """
        Verify URL is accessible (2025 CiteGuard enhancement).
        Returns (accessible, status_message)
        """
        if not url:
            return False, "No URL"
        try:
            # HEAD request to check accessibility without downloading
            req = request.Request(url, method='HEAD')
            req.add_header('User-Agent', 'Mozilla/5.0 (Research Citation Verifier)')
            with request.urlopen(req, timeout=10) as response:
                if response.status == 200:
                    return True, "URL accessible"
                return False, f"HTTP {response.status}"
        except error.HTTPError as e:
            return False, f"HTTP {e.code}"
        except error.URLError as e:
            return False, f"URL error: {e.reason}"
        except Exception as e:
            return False, f"Connection error: {str(e)[:50]}"

    def detect_hallucination_patterns(self, entry: Dict) -> List[str]:
        """
        Detect common LLM hallucination patterns in citations (2025 CiteGuard).
        Returns list of detected issues. Title heuristics run only when a
        title was parsed; year heuristics run whenever a year was parsed.
        """
        issues = []
        title = entry.get('title') or ''

        if title:
            title_lower = title.lower()
            # Check against suspicious patterns
            for pattern, description in self.suspicious_patterns:
                if re.match(pattern, title, re.IGNORECASE):
                    issues.append(f"Suspicious title pattern: {description}")
            # Check for overly generic titles
            generic_words = ['overview', 'introduction', 'guide', 'handbook', 'manual']
            if any(word in title_lower for word in generic_words) and len(title.split()) < 5:
                issues.append("Very generic short title")
            # Check for placeholder-like titles
            if any(x in title_lower for x in ['tbd', 'todo', 'placeholder', 'example']):
                issues.append("Placeholder text in title")

        # Check for inconsistent metadata
        if entry.get('year'):
            year = int(entry['year'])
            # Very recent without DOI or URL is suspicious
            if year >= 2024 and not entry.get('doi') and not entry.get('url'):
                issues.append("Recent year (2024+) with no verification method")
            # Future year is definitely wrong; compare with the current year
            # rather than a fixed constant that goes stale.
            if year > time.localtime().tm_year:
                issues.append(f"Future year: {year}")
            # Very old with modern phrasing is suspicious
            if title and year < 2000 and self._MODERN_AI_TERMS.search(title):
                issues.append(f"Anachronistic: pre-2000 ({year}) citation mentioning modern AI terms")
        return issues

    def check_title_similarity(self, title1: str, title2: str) -> float:
        """
        Simple title similarity check (word overlap).
        Returns score 0.0-1.0
        """
        if not title1 or not title2:
            return 0.0

        # Normalize: lowercase, remove punctuation, split
        def normalize(s):
            s = s.lower()
            s = re.sub(r'[^\w\s]', ' ', s)
            return set(s.split())

        words1 = normalize(title1)
        words2 = normalize(title2)
        if not words1 or not words2:
            return 0.0
        overlap = len(words1 & words2)
        total = len(words1 | words2)
        return overlap / total if total > 0 else 0.0

    def verify_entry(self, entry: Dict) -> Dict:
        """Verify a single bibliography entry (Enhanced 2025 with CiteGuard).

        Returns a dict with ``num``, ``status`` (``verified``,
        ``url_verified``, ``suspicious``, ``unverified`` or ``unknown``),
        ``issues``, ``metadata`` and ``verification_methods``.
        """
        result = {
            'num': entry['num'],
            'status': 'unknown',
            'issues': [],
            'metadata': {},
            'verification_methods': []
        }

        # STEP 1: Run hallucination detection (CiteGuard 2025)
        hallucination_issues = self.detect_hallucination_patterns(entry)
        if hallucination_issues:
            result['issues'].extend(hallucination_issues)
            result['status'] = 'suspicious'

        # STEP 2: Has DOI?
        if entry['doi']:
            print(f" [{entry['num']}] Checking DOI {entry['doi']}...", end=' ')
            success, metadata = self.verify_doi(entry['doi'])
            if success:
                result['metadata'] = metadata
                result['status'] = 'verified'
                result['verification_methods'].append('DOI')
                print("")
                # Check title similarity if we have both
                if entry['title'] and metadata.get('title'):
                    similarity = self.check_title_similarity(
                        entry['title'],
                        metadata['title']
                    )
                    if similarity < 0.5:
                        result['issues'].append(
                            f"Title mismatch (similarity: {similarity:.1%})"
                        )
                        result['status'] = 'suspicious'
                # Check year match
                if entry['year'] and metadata.get('year'):
                    if int(entry['year']) != int(metadata['year']):
                        result['issues'].append(
                            f"Year mismatch: report says {entry['year']}, DOI says {metadata['year']}"
                        )
                        result['status'] = 'suspicious'
            else:
                print(f"{metadata.get('error', 'Failed')}")
                # A failed lookup must not hide an earlier heuristic flag
                if result['status'] != 'suspicious':
                    result['status'] = 'unverified'
                result['issues'].append(f"DOI resolution failed: {metadata.get('error', 'unknown')}")

        # STEP 3: Check URL accessibility (if no DOI or DOI failed)
        if entry['url'] and result['status'] != 'verified':
            url_ok, url_status = self.verify_url(entry['url'])
            if url_ok:
                result['verification_methods'].append('URL')
                # Upgrade status if URL verifies
                if result['status'] in ['unknown', 'no_doi', 'unverified']:
                    result['status'] = 'url_verified'
                print(f" [{entry['num']}] URL accessible ✓")
            else:
                result['issues'].append(f"URL check failed: {url_status}")

        # STEP 4: Final fallback - no verification method
        if not entry['doi'] and not entry['url']:
            result['issues'].append("No DOI or URL - cannot verify")
            result['status'] = 'suspicious'
        return result

    def verify_all(self):
        """Verify all bibliography entries and print a summary.

        Returns False when no entries are found, or in strict mode when any
        citation is suspicious or unverified; True otherwise.
        """
        print(f"\n{'='*60}")
        print(f"CITATION VERIFICATION: {self.report_path.name}")
        print(f"{'='*60}\n")

        entries = self.extract_bibliography()
        if not entries:
            print("No bibliography entries found\n")
            return False
        print(f"Found {len(entries)} citations\n")

        results = []
        for entry in entries:
            result = self.verify_entry(entry)
            results.append(result)
            # Rate limiting
            time.sleep(0.5)

        # Summarize
        print(f"\n{'='*60}")
        print(f"VERIFICATION SUMMARY")
        print(f"{'='*60}\n")
        verified = [r for r in results if r['status'] == 'verified']
        url_verified = [r for r in results if r['status'] == 'url_verified']
        suspicious = [r for r in results if r['status'] == 'suspicious']
        unverified = [r for r in results if r['status'] in ['unverified', 'no_doi', 'unknown']]
        print(f'DOI Verified: {len(verified)}/{len(results)}')
        print(f'URL Verified: {len(url_verified)}/{len(results)}')
        print(f'Suspicious: {len(suspicious)}/{len(results)}')
        print(f'Unverified: {len(unverified)}/{len(results)}')
        print()

        if suspicious:
            print('SUSPICIOUS CITATIONS (Manual Review Needed):')
            for r in suspicious:
                print(f"\n [{r['num']}]")
                for issue in r['issues']:
                    print(f" - {issue}")
            print()

        if unverified:
            print('UNVERIFIED CITATIONS (Could not check):')
            for r in unverified:
                print(f" [{r['num']}] {r['issues'][0] if r['issues'] else 'Unknown'}")
            print()

        # Decision (Enhanced 2025 - includes URL-verified as acceptable)
        total_verified = len(verified) + len(url_verified)
        if suspicious:
            print('WARNING: Suspicious citations detected')
            if self.strict_mode:
                print(' STRICT MODE: Failing due to suspicious citations')
                return False
            print(' (Continuing in non-strict mode)')

        if self.strict_mode and unverified:
            print('STRICT MODE: Unverified citations found')
            return False

        if total_verified / len(results) < 0.5:
            print('WARNING: Less than 50% citations verified')
            return True  # Pass with warning
        print('CITATION VERIFICATION PASSED')
        return True
def main():
    """Command-line entry point: verify the citations of the report given by --report.

    Exits with status 0 when verification passes and 1 when it fails or the
    report file does not exist.
    """
    usage_notes = """
Examples:
python verify_citations.py --report report.md
Note: Requires internet connection to check DOIs.
Uses free DOI resolver - no API key needed.
"""
    cli = argparse.ArgumentParser(
        description="Verify citations in research report",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_notes,
    )
    cli.add_argument(
        '--report', '-r',
        type=str,
        required=True,
        help='Path to research report markdown file',
    )
    cli.add_argument(
        '--strict',
        action='store_true',
        help='Strict mode: fail on any unverified or suspicious citations',
    )
    options = cli.parse_args()

    target = Path(options.report)
    if not target.exists():
        print(f"ERROR: Report file not found: {target}")
        sys.exit(1)

    outcome = CitationVerifier(target, strict_mode=options.strict).verify_all()
    sys.exit(0 if outcome else 1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,220 @@
#!/usr/bin/env python3
"""
HTML Report Verification Script
Validates that HTML reports are properly generated with all sections from MD
"""
import argparse
import re
from pathlib import Path
from typing import List, Tuple
class HTMLVerifier:
    """Verify that an HTML research report matches its markdown source.

    Problems are collected in ``self.errors`` (blocking) and
    ``self.warnings`` (advisory) by the ``_check_*`` methods.
    """

    # Emoji code-point ranges. A single span from U+24C2 to U+1F251 would
    # also cover CJK ideographs, Hangul and most other BMP scripts, so the
    # symbol blocks are listed individually.
    _EMOJI_PATTERN = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags
        "\U0001F900-\U0001FAFF"  # supplemental symbols & pictographs
        "\U0001F200-\U0001F251"  # enclosed ideographic supplement
        "\u2600-\u27BF"          # miscellaneous symbols, dingbats
        "\u2B00-\u2BFF"          # miscellaneous symbols and arrows
        "\u24C2"                 # circled M
        "]+",
        flags=re.UNICODE
    )

    def __init__(self, html_path: Path, md_path: Path):
        """Remember the two file paths; nothing is read until ``verify``."""
        self.html_path = html_path
        self.md_path = md_path
        self.errors = []
        self.warnings = []

    def verify(self) -> bool:
        """
        Run all verification checks
        Returns:
            True if all checks pass, False otherwise
        """
        print(f"\n{'='*60}")
        print(f"HTML REPORT VERIFICATION")
        print(f"{'='*60}\n")
        print(f"HTML File: {self.html_path}")
        print(f"MD File: {self.md_path}\n")

        # Read files as UTF-8 regardless of the platform's locale encoding
        try:
            html_content = self.html_path.read_text(encoding='utf-8')
            md_content = self.md_path.read_text(encoding='utf-8')
        except (OSError, UnicodeError) as e:
            self.errors.append(f"Failed to read files: {e}")
            # Show the failure instead of returning silently
            self._print_results()
            return False

        # Run checks
        self._check_sections(html_content, md_content)
        self._check_no_placeholders(html_content)
        self._check_no_emojis(html_content)
        self._check_structure(html_content)
        self._check_citations(html_content, md_content)
        self._check_bibliography(html_content, md_content)

        # Report results
        self._print_results()
        return len(self.errors) == 0

    def _check_sections(self, html: str, md: str):
        """Verify all markdown sections are present in HTML"""
        # Extract section headings from markdown
        md_sections = re.findall(r'^## (.+)$', md, re.MULTILINE)
        # Extract sections from HTML
        html_sections = re.findall(r'<h2 class="section-title">(.+?)</h2>', html)

        # Check if we have placeholder sections like <div class="section">#</div>
        placeholder_sections = re.findall(r'<div class="section">#</div>', html)
        if placeholder_sections:
            self.errors.append(
                f"Found {len(placeholder_sections)} placeholder sections (empty '#' divs) - content not converted properly"
            )

        # Compare section counts
        if len(md_sections) > len(html_sections) + 1:  # +1 for bibliography which is separate
            self.errors.append(
                f"Section count mismatch: MD has {len(md_sections)} sections, HTML has only {len(html_sections)} + bibliography"
            )
            # Sorted so the message is stable between runs
            missing = sorted(set(md_sections) - set(html_sections))
            if missing:
                self.errors.append(f"Missing sections in HTML: {missing}")

        # Verify Executive Summary is present
        if "Executive Summary" in md and "Executive Summary" not in html:
            self.errors.append("Executive Summary missing from HTML")

    def _check_no_placeholders(self, html: str):
        """Check for common placeholders that shouldn't be in final report"""
        placeholders = [
            '{{TITLE}}', '{{DATE}}', '{{CONTENT}}', '{{BIBLIOGRAPHY}}',
            '{{METRICS_DASHBOARD}}', '{{SOURCE_COUNT}}', 'TODO', 'TBD',
            'PLACEHOLDER', 'FIXME'
        ]
        found = [placeholder for placeholder in placeholders if placeholder in html]
        if found:
            self.errors.append(f"Found unreplaced placeholders: {', '.join(found)}")

    def _check_no_emojis(self, html: str):
        """Record an error when the HTML contains emoji characters."""
        emojis = self._EMOJI_PATTERN.findall(html)
        if emojis:
            # Sorted so the message is stable between runs
            unique_emojis = sorted(set(emojis))
            self.errors.append(f"Found {len(emojis)} emojis in HTML (should be none): {unique_emojis}")

    def _check_structure(self, html: str):
        """Verify HTML has proper structure"""
        required_elements = [
            ('<html', 'HTML tag'),
            ('<head', 'head tag'),
            ('<body', 'body tag'),
            ('<title>', 'title tag'),
            ('class="header"', 'header section'),
            ('class="content"', 'content section'),
            ('class="bibliography"', 'bibliography section'),
        ]
        for element, name in required_elements:
            if element not in html:
                self.errors.append(f"Missing {name} in HTML")

        # Check for unclosed tags (basic check)
        open_divs = html.count('<div')
        close_divs = html.count('</div>')
        if abs(open_divs - close_divs) > 2:  # Allow small discrepancy
            self.warnings.append(
                f"Possible unclosed divs: {open_divs} opening tags, {close_divs} closing tags"
            )

    def _check_citations(self, html: str, md: str):
        """Verify citations are present"""
        # Extract citations from markdown
        md_citations = set(re.findall(r'\[(\d+)\]', md))
        # Extract citations from HTML (excluding bibliography)
        html_content = html.split('class="bibliography"')[0] if 'class="bibliography"' in html else html
        html_citations = set(re.findall(r'\[(\d+)\]', html_content))

        if len(md_citations) > 0 and len(html_citations) == 0:
            self.errors.append("No citations found in HTML content (but present in MD)")
        if len(md_citations) > len(html_citations) * 1.5:  # Allow some variation
            self.warnings.append(
                f"Fewer citations in HTML ({len(html_citations)}) than MD ({len(md_citations)})"
            )

    def _check_bibliography(self, html: str, md: str):
        """Verify bibliography is present and formatted"""
        if '## Bibliography' in md:
            if 'class="bibliography"' not in html:
                self.errors.append("Bibliography section missing from HTML")
            elif 'class="bib-entry"' not in html:
                self.warnings.append("Bibliography present but entries not properly formatted")

    def _print_results(self):
        """Print verification results"""
        print(f"\n{'-'*60}")
        print("VERIFICATION RESULTS")
        print(f"{'-'*60}\n")

        if self.errors:
            print(f"❌ ERRORS ({len(self.errors)}):")
            for i, error in enumerate(self.errors, 1):
                print(f" {i}. {error}")
            print()

        if self.warnings:
            print(f"⚠️ WARNINGS ({len(self.warnings)}):")
            for i, warning in enumerate(self.warnings, 1):
                print(f" {i}. {warning}")
            print()

        if not self.errors and not self.warnings:
            print("✅ All checks passed! HTML report is valid.")
            print()
        print(f"{'-'*60}\n")
def main():
    """Command-line entry point.

    Returns:
        0 when the HTML report passes verification, 1 when it fails or
        either input file does not exist.
    """
    parser = argparse.ArgumentParser(description='Verify HTML research report')
    parser.add_argument('--html', type=Path, required=True, help='Path to HTML report')
    parser.add_argument('--md', type=Path, required=True, help='Path to markdown report')
    args = parser.parse_args()

    if not args.html.exists():
        print(f"Error: HTML file not found: {args.html}")
        return 1
    if not args.md.exists():
        print(f"Error: Markdown file not found: {args.md}")
        return 1

    verifier = HTMLVerifier(args.html, args.md)
    success = verifier.verify()
    return 0 if success else 1


if __name__ == "__main__":
    # The exit() builtin is injected by the site module for interactive use
    # and is missing under "python -S"; SystemExit needs no import.
    raise SystemExit(main())