Files
ONE-OS/axhub-make/skills/third-party/deep-research/scripts/research_engine.py
王冕 a27e3b8e43 feat: sync full workspace including web modules, docs, and configurations to Gitea
Optimized the root .gitignore to exclude virtual environments, node modules,
and temp folders to ensure clean and lightweight version tracking.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-09 18:12:25 +08:00

579 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Deep Research Engine for Claude Code
Orchestrates comprehensive research across multiple sources with verification and synthesis
"""
import argparse
import json
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, asdict
from enum import Enum
class ResearchPhase(Enum):
    """Stages of the research pipeline.

    Members are declared in execution order, so iterating the enum
    (``list(ResearchPhase)``) yields the full pipeline from first to last
    phase. The string values are what ends up in serialized state.
    """

    SCOPE = "scope"              # define boundaries and success criteria
    PLAN = "plan"                # build the research roadmap
    RETRIEVE = "retrieve"        # collect information from sources
    TRIANGULATE = "triangulate"  # cross-check claims across sources
    SYNTHESIZE = "synthesize"    # connect insights
    CRITIQUE = "critique"        # evaluate research quality
    REFINE = "refine"            # address gaps found by the critique
    PACKAGE = "package"          # produce the final report
class ResearchMode(Enum):
    """Research depth modes.

    The mode decides which pipeline phases are run; the string values match
    the choices accepted by the ``--mode`` command-line option.
    """

    QUICK = "quick"          # 3 phases: scope, retrieve, package
    STANDARD = "standard"    # 6 phases: everything except critique and refine
    DEEP = "deep"            # Full 8 phases
    ULTRADEEP = "ultradeep"  # 8 phases + extended iterations
@dataclass
class Source:
    """A single research source together with its assessment metadata."""

    url: str
    title: str
    snippet: str
    retrieved_at: str
    credibility_score: float = 0.0
    source_type: str = "web"  # web, academic, documentation, code
    verification_status: str = "unverified"  # unverified, verified, conflicted

    def to_citation(self, index: int) -> str:
        """Return a bibliography line for this source.

        Args:
            index: Citation number shown in square brackets.

        Returns:
            A string of the form
            ``[index] title - url (Retrieved: retrieved_at)``.
        """
        return f"[{index}] {self.title} - {self.url} (Retrieved: {self.retrieved_at})"
@dataclass
class ResearchState:
    """Maintains research state across phases.

    ``mode`` and ``phase`` are enums and ``sources`` holds ``Source``
    dataclasses; on disk they are stored as their string values and as plain
    dicts respectively. All other fields are written to JSON unchanged.
    """

    query: str
    mode: ResearchMode
    phase: ResearchPhase
    scope: Dict[str, Any]
    plan: Dict[str, Any]
    sources: List[Source]
    findings: List[Dict[str, Any]]
    synthesis: Dict[str, Any]
    critique: Dict[str, Any]
    report: str
    metadata: Dict[str, Any]

    def save(self, filepath: Path) -> None:
        """Write the state to ``filepath`` as JSON, retrying on I/O errors.

        Args:
            filepath: Destination file; it is overwritten if it exists.

        Raises:
            TypeError: If the state contains a value JSON cannot encode. This
                is raised before the file is opened, so an existing file is
                left untouched.
            IOError: If writing still fails after three attempts. The
                underlying error is attached as ``__cause__``.
        """
        # Serialize up front: streaming json.dump() into a file that has
        # already been truncated would leave corrupt JSON behind whenever a
        # value turns out not to be serializable.
        payload = json.dumps(self._serialize(), indent=2)

        max_retries = 3
        for attempt in range(max_retries):
            try:
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(payload)
                return  # Success
            except OSError as e:  # IOError is an alias of OSError
                if attempt == max_retries - 1:
                    # Final attempt failed
                    raise IOError(
                        f"Failed to save state after {max_retries} attempts: {e}"
                    ) from e
                # Linear backoff between attempts: 0.5s, then 1.0s
                time.sleep((attempt + 1) * 0.5)

    def _serialize(self) -> dict:
        """Convert the state into a JSON-serializable dict."""
        return {
            'query': self.query,
            'mode': self.mode.value,
            'phase': self.phase.value,
            'scope': self.scope,
            'plan': self.plan,
            'sources': [asdict(s) for s in self.sources],
            'findings': self.findings,
            'synthesis': self.synthesis,
            'critique': self.critique,
            'report': self.report,
            'metadata': self.metadata,
        }

    @classmethod
    def load(cls, filepath: Path) -> 'ResearchState':
        """Load research state from a JSON file written by ``save``.

        Args:
            filepath: File to read.

        Returns:
            The reconstructed state, with ``mode``/``phase`` converted back
            to enum members and ``sources`` back to ``Source`` objects.

        Raises:
            OSError: If the file cannot be opened.
            ValueError: If the file is not valid JSON or contains an unknown
                mode/phase value.
            KeyError: If a required top-level key is missing.
            TypeError: If a source entry does not match ``Source``'s fields.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return cls(
            query=data['query'],
            mode=ResearchMode(data['mode']),
            phase=ResearchPhase(data['phase']),
            scope=data['scope'],
            plan=data['plan'],
            sources=[Source(**s) for s in data['sources']],
            findings=data['findings'],
            synthesis=data['synthesis'],
            critique=data['critique'],
            report=data['report'],
            metadata=data['metadata'],
        )
class ResearchEngine:
    """Main research orchestration engine.

    The engine itself does not perform any research: for every phase it
    prints the instructions an agent is expected to follow and persists the
    session state to ``output_dir`` so that a run can be picked up again.
    """

    def __init__(self, mode: ResearchMode = ResearchMode.STANDARD,
                 output_dir: Optional[Path] = None):
        """Create an engine and make sure its output directory exists.

        Args:
            mode: Research depth; selects which phases are run.
            output_dir: Directory for state and report files. Defaults to
                ``~/.claude/research_output``.
        """
        self.mode = mode
        self.state: Optional[ResearchState] = None
        if output_dir is None:
            output_dir = Path.home() / ".claude" / "research_output"
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def initialize_research(self, query: str) -> ResearchState:
        """Start a new research session, replacing any current state.

        Args:
            query: Research question or topic.

        Returns:
            The freshly created state (also stored on ``self.state``).
        """
        self.state = ResearchState(
            query=query,
            mode=self.mode,
            phase=ResearchPhase.SCOPE,
            scope={},
            plan={},
            sources=[],
            findings=[],
            synthesis={},
            critique={},
            report="",
            metadata={
                'started_at': datetime.now().isoformat(),
                'version': '1.0'
            }
        )
        return self.state

    def get_phase_instructions(self, phase: ResearchPhase) -> str:
        """Return the instruction text for ``phase``.

        Args:
            phase: Phase whose instructions are wanted.

        Returns:
            The instruction text, or a fallback message when no instructions
            are registered for the phase.
        """
        instructions = {
            ResearchPhase.SCOPE: """
# Phase 1: SCOPE
Your task: Define research boundaries and success criteria
## Execute:
1. Decompose the question into 3-5 core components
2. Identify 2-4 key stakeholder perspectives
3. Define what's IN scope and what's OUT of scope
4. List 3-5 success criteria for this research
5. Document 3-5 assumptions that need validation
## Output Format:
```json
{
"core_components": ["component1", "component2", ...],
"stakeholder_perspectives": ["perspective1", "perspective2", ...],
"in_scope": ["item1", "item2", ...],
"out_of_scope": ["item1", "item2", ...],
"success_criteria": ["criteria1", "criteria2", ...],
"assumptions": ["assumption1", "assumption2", ...]
}
```
Use extended reasoning to explore multiple framings before finalizing scope.
""",
            ResearchPhase.PLAN: """
# Phase 2: PLAN
Your task: Create intelligent research roadmap
## Execute:
1. Identify 5-10 primary sources to investigate
2. List 5-10 secondary/backup sources
3. Map knowledge dependencies (what must be understood first)
4. Create 10-15 search query variations
5. Plan triangulation approach (how to verify claims)
6. Define 3-5 quality gates
## Output Format:
```json
{
"primary_sources": ["source_type1", "source_type2", ...],
"secondary_sources": ["source_type1", "source_type2", ...],
"knowledge_dependencies": {"concept1": ["prerequisite1", "prerequisite2"], ...},
"search_queries": ["query1", "query2", ...],
"triangulation_strategy": "description of verification approach",
"quality_gates": ["gate1", "gate2", ...]
}
```
Use Graph-of-Thoughts: branch into 3-4 potential research paths, evaluate, then converge on optimal strategy.
""",
            ResearchPhase.RETRIEVE: """
# Phase 3: RETRIEVE
Your task: Systematically collect information from multiple sources
## Execute:
1. Use WebSearch with iterative query refinement (minimum 10 searches)
2. Use WebFetch to deep-dive into 5-10 most promising sources
3. Extract key passages with metadata
4. Track information gaps
5. Follow 2-3 promising tangents
6. Ensure source diversity (different domains, perspectives)
## Tools to Use:
- WebSearch: For current information and broad coverage
- WebFetch: For detailed extraction from specific URLs
- Grep/Read: For local documentation if relevant
- Task: Spawn 2-3 parallel retrieval agents for efficiency
## Output:
Store all sources with metadata. Each source should include:
- URL/location
- Title
- Key excerpts
- Relevance score
- Source type
- Retrieved timestamp
Aim for 15-30 distinct sources minimum.
""",
            ResearchPhase.TRIANGULATE: """
# Phase 4: TRIANGULATE
Your task: Validate information across multiple independent sources
## Execute:
1. List all major claims from retrieved information
2. For each claim, find 3+ independent confirmatory sources
3. Flag any contradictions or uncertainties
4. Assess source credibility (domain expertise, recency, bias)
5. Document consensus areas vs. debate areas
6. Mark verification status for each claim
## Quality Standards:
- Core claims MUST have 3+ independent sources
- Flag any single-source claims as "unverified"
- Note information recency
- Identify potential biases
## Output Format:
```json
{
"verified_claims": [
{
"claim": "statement",
"sources": ["source1", "source2", "source3"],
"confidence": "high|medium|low"
}
],
"unverified_claims": [...],
"contradictions": [
{
"topic": "what's contradicted",
"viewpoint1": {"claim": "...", "sources": [...]},
"viewpoint2": {"claim": "...", "sources": [...]}
}
]
}
```
""",
            ResearchPhase.SYNTHESIZE: """
# Phase 5: SYNTHESIZE
Your task: Connect insights and generate novel understanding
## Execute:
1. Identify 5-10 key patterns across sources
2. Map relationships between concepts
3. Generate 3-5 insights that go beyond source material
4. Create conceptual frameworks or mental models
5. Build argument structures
6. Develop evidence hierarchies
## Use Extended Reasoning:
- Explore non-obvious connections
- Consider second-order implications
- Think about what sources might be missing
- Generate novel hypotheses
## Output Format:
```json
{
"patterns": ["pattern1", "pattern2", ...],
"concept_relationships": {"concept1": ["related_to1", "related_to2"], ...},
"novel_insights": ["insight1", "insight2", ...],
"frameworks": ["framework_description1", ...],
"key_arguments": [
{
"argument": "main claim",
"supporting_evidence": ["evidence1", "evidence2"],
"strength": "strong|moderate|weak"
}
]
}
```
""",
            ResearchPhase.CRITIQUE: """
# Phase 6: CRITIQUE
Your task: Rigorously evaluate research quality
## Execute Red Team Analysis:
1. Check logical consistency
2. Verify citation completeness
3. Identify gaps or weaknesses
4. Assess balance and objectivity
5. Test alternative interpretations
6. Challenge assumptions
## Red Team Questions:
- What's missing from this research?
- What could be wrong?
- What alternative explanations exist?
- What biases might be present?
- What counterfactuals should be considered?
- What would a skeptic say?
## Output Format:
```json
{
"strengths": ["strength1", "strength2", ...],
"weaknesses": ["weakness1", "weakness2", ...],
"gaps": ["gap1", "gap2", ...],
"biases": ["bias1", "bias2", ...],
"improvements_needed": [
{
"issue": "description",
"recommendation": "how to fix",
"priority": "high|medium|low"
}
]
}
```
""",
            ResearchPhase.REFINE: """
# Phase 7: REFINE
Your task: Address gaps and strengthen weak areas
## Execute:
1. Conduct additional research for identified gaps
2. Strengthen weak arguments with more evidence
3. Add missing perspectives
4. Resolve contradictions where possible
5. Enhance clarity and structure
6. Verify all revised content
## Focus On:
- High priority improvements from critique
- Missing stakeholder perspectives
- Weak evidence chains
- Unclear explanations
## Output:
Updated findings, sources, and synthesis with improvements documented.
""",
            ResearchPhase.PACKAGE: """
# Phase 8: PACKAGE
Your task: Deliver professional, actionable research report
## Generate Complete Report:
```markdown
# Research Report: [Topic]
## Executive Summary
[3-5 key findings bullets]
[Primary recommendation]
[Confidence level: High/Medium/Low]
## Introduction
### Research Question
[Original question]
### Scope & Methodology
[What was investigated and how]
### Key Assumptions
[Important assumptions made]
## Main Analysis
### Finding 1: [Title]
[Detailed explanation with evidence]
[Citations: [1], [2], [3]]
### Finding 2: [Title]
[Detailed explanation with evidence]
[Citations: [4], [5], [6]]
[Continue for all findings...]
## Synthesis & Insights
[Patterns and connections]
[Novel insights]
[Implications]
## Limitations & Caveats
[Known gaps]
[Assumptions]
[Areas of uncertainty]
## Recommendations
[Action items]
[Next steps]
[Further research needs]
## Bibliography
[1] Source 1 full citation
[2] Source 2 full citation
...
## Appendix: Methodology
[Research process]
[Sources consulted]
[Verification approach]
```
Save report to file with timestamp.
"""
        }
        return instructions.get(phase, "No instructions available for this phase")

    def execute_phase(self, phase: ResearchPhase) -> Dict[str, Any]:
        """Print the banner and instructions for ``phase``.

        Args:
            phase: Phase to execute.

        Returns:
            A dict with the phase value, the fixed status
            ``'instructions_displayed'`` and an ISO timestamp.
        """
        print(f"\n{'='*80}")
        print(f"PHASE {phase.value.upper()}: Starting...")
        print(f"{'='*80}\n")
        print(self.get_phase_instructions(phase))
        # In real usage, Claude will execute these instructions
        # This returns a structured result that Claude should populate
        return {
            'phase': phase.value,
            'status': 'instructions_displayed',
            'timestamp': datetime.now().isoformat()
        }

    def _has_resumable_state(self, query: str) -> bool:
        """Tell whether ``self.state`` is an unfinished session for ``query``."""
        state = self.state
        if state is None or state.query != query:
            return False
        # State is saved after a phase has run, so a state parked on the last
        # phase has already been through the whole pipeline.
        return state.phase != ResearchPhase.PACKAGE

    def run_pipeline(self, query: str) -> str:
        """Run the research pipeline for ``query``.

        If ``self.state`` already holds an unfinished session for the same
        query (for example one restored with ``ResearchState.load``), that
        session is continued in its own mode, starting again at the phase it
        recorded. Otherwise a new session is initialized.

        Args:
            query: Research question or topic.

        Returns:
            Path (as a string) where the final report is expected to be
            written. The file itself is not created here.
        """
        resuming = self._has_resumable_state(query)
        if resuming:
            # Continue with the depth the saved session was started with.
            self.mode = self.state.mode

        print(f"\n{'#'*80}")
        print("# DEEP RESEARCH ENGINE")
        print(f"# Query: {query}")
        print(f"# Mode: {self.mode.value}")
        print(f"{'#'*80}\n")

        if not resuming:
            self.initialize_research(query)

        # Determine phases based on mode
        phases = self._get_phases_for_mode()
        if resuming and self.state.phase in phases:
            start = phases.index(self.state.phase)
            if start:
                print(f"Resuming from phase: {self.state.phase.value}\n")
            phases = phases[start:]

        # One state file per run; re-stamping it for every phase would
        # scatter a single session over several files.
        run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        state_file = self.output_dir / f"research_state_{run_stamp}.json"

        for phase in phases:
            self.state.phase = phase
            self.execute_phase(phase)
            # Save state after each phase
            self.state.save(state_file)
            print(f"\n✓ Phase {phase.value} complete. State saved to: {state_file}\n")

        # Generate report path
        report_file = self.output_dir / f"research_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
        print(f"\n{'='*80}")
        print("RESEARCH PIPELINE COMPLETE")
        print(f"Report will be saved to: {report_file}")
        print(f"{'='*80}\n")
        return str(report_file)

    def _get_phases_for_mode(self) -> List[ResearchPhase]:
        """Return the ordered list of phases for the current mode."""
        if self.mode == ResearchMode.QUICK:
            return [
                ResearchPhase.SCOPE,
                ResearchPhase.RETRIEVE,
                ResearchPhase.PACKAGE,
            ]
        if self.mode == ResearchMode.STANDARD:
            return [
                ResearchPhase.SCOPE,
                ResearchPhase.PLAN,
                ResearchPhase.RETRIEVE,
                ResearchPhase.TRIANGULATE,
                ResearchPhase.SYNTHESIZE,
                ResearchPhase.PACKAGE,
            ]
        # DEEP and ULTRADEEP both run every phase once; the extra iterations
        # mentioned for ultradeep are not implemented here.
        return list(ResearchPhase)
def main():
    """CLI entry point.

    Parses the command line, optionally restores a saved state, runs the
    pipeline and prints the resulting report path. Exits with status 1 when
    the state file given to ``--resume`` is missing or unreadable.
    """
    parser = argparse.ArgumentParser(
        description="Deep Research Engine for Claude Code",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python research_engine.py --query "state of quantum computing 2025" --mode deep
python research_engine.py --query "PostgreSQL vs Supabase comparison" --mode standard
python research_engine.py -q "longevity biotech funding trends" -m ultradeep
"""
    )
    parser.add_argument(
        '--query', '-q',
        type=str,
        help='Research question or topic (required unless --resume is given)'
    )
    parser.add_argument(
        '--mode', '-m',
        type=str,
        choices=['quick', 'standard', 'deep', 'ultradeep'],
        default=None,
        help='Research depth mode (default: standard)'
    )
    parser.add_argument(
        '--resume',
        type=str,
        help='Resume from saved state file'
    )
    args = parser.parse_args()

    # A saved state already carries its query, so --query is only mandatory
    # for a fresh run.
    if args.query is None and not args.resume:
        parser.error("the following arguments are required: --query/-q")

    # Initialize engine
    engine = ResearchEngine(mode=ResearchMode(args.mode or 'standard'))
    query = args.query

    if args.resume:
        # Load previous state
        state_file = Path(args.resume)
        if not state_file.exists():
            print(f"Error: State file not found: {state_file}", file=sys.stderr)
            sys.exit(1)
        try:
            engine.state = ResearchState.load(state_file)
        except (OSError, ValueError, KeyError, TypeError) as exc:
            # Unreadable file, invalid JSON, unknown mode/phase or a missing
            # field: report it instead of dumping a traceback.
            print(f"Error: Could not load state file {state_file}: {exc}", file=sys.stderr)
            sys.exit(1)
        print(f"Resumed research from: {state_file}")
        # Fall back to what the saved session recorded for anything that
        # was not given explicitly on the command line.
        if query is None:
            query = engine.state.query
        if args.mode is None:
            engine.mode = engine.state.mode

    # Run pipeline
    report_path = engine.run_pipeline(query)
    print(f"\nResearch complete! Report path: {report_path}")
    print("\nNow Claude should execute each phase using the displayed instructions.")


if __name__ == '__main__':
    main()