Optimized the root .gitignore to exclude virtual environments, node modules, and temp folders to ensure clean and lightweight version tracking. Co-authored-by: Cursor <cursoragent@cursor.com>
579 lines
16 KiB
Python
579 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Deep Research Engine for Claude Code
|
|
Orchestrates comprehensive research across multiple sources with verification and synthesis
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
import time
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Any
|
|
from dataclasses import dataclass, asdict
|
|
from enum import Enum
|
|
|
|
|
|
class ResearchPhase(Enum):
|
|
"""Research pipeline phases"""
|
|
SCOPE = "scope"
|
|
PLAN = "plan"
|
|
RETRIEVE = "retrieve"
|
|
TRIANGULATE = "triangulate"
|
|
SYNTHESIZE = "synthesize"
|
|
CRITIQUE = "critique"
|
|
REFINE = "refine"
|
|
PACKAGE = "package"
|
|
|
|
|
|
class ResearchMode(Enum):
|
|
"""Research depth modes"""
|
|
QUICK = "quick" # 3 phases: scope, retrieve, package
|
|
STANDARD = "standard" # 6 phases: skip refine and critique
|
|
DEEP = "deep" # Full 8 phases
|
|
ULTRADEEP = "ultradeep" # 8 phases + extended iterations
|
|
|
|
|
|
@dataclass
|
|
class Source:
|
|
"""Represents a research source"""
|
|
url: str
|
|
title: str
|
|
snippet: str
|
|
retrieved_at: str
|
|
credibility_score: float = 0.0
|
|
source_type: str = "web" # web, academic, documentation, code
|
|
verification_status: str = "unverified" # unverified, verified, conflicted
|
|
|
|
def to_citation(self, index: int) -> str:
|
|
"""Generate citation string"""
|
|
return f"[{index}] {self.title} - {self.url} (Retrieved: {self.retrieved_at})"
|
|
|
|
|
|
@dataclass
|
|
class ResearchState:
|
|
"""Maintains research state across phases"""
|
|
query: str
|
|
mode: ResearchMode
|
|
phase: ResearchPhase
|
|
scope: Dict[str, Any]
|
|
plan: Dict[str, Any]
|
|
sources: List[Source]
|
|
findings: List[Dict[str, Any]]
|
|
synthesis: Dict[str, Any]
|
|
critique: Dict[str, Any]
|
|
report: str
|
|
metadata: Dict[str, Any]
|
|
|
|
def save(self, filepath: Path):
|
|
"""Save research state to file with retry logic"""
|
|
max_retries = 3
|
|
for attempt in range(max_retries):
|
|
try:
|
|
with open(filepath, 'w') as f:
|
|
json.dump(self._serialize(), f, indent=2)
|
|
return # Success
|
|
except (IOError, OSError) as e:
|
|
if attempt == max_retries - 1:
|
|
# Final attempt failed
|
|
raise IOError(f"Failed to save state after {max_retries} attempts: {e}")
|
|
# Wait with exponential backoff before retry
|
|
wait_time = (attempt + 1) * 0.5 # 0.5s, 1s, 1.5s
|
|
time.sleep(wait_time)
|
|
|
|
def _serialize(self) -> dict:
|
|
"""Convert to serializable dict"""
|
|
return {
|
|
'query': self.query,
|
|
'mode': self.mode.value,
|
|
'phase': self.phase.value,
|
|
'scope': self.scope,
|
|
'plan': self.plan,
|
|
'sources': [asdict(s) for s in self.sources],
|
|
'findings': self.findings,
|
|
'synthesis': self.synthesis,
|
|
'critique': self.critique,
|
|
'report': self.report,
|
|
'metadata': self.metadata
|
|
}
|
|
|
|
@classmethod
|
|
def load(cls, filepath: Path) -> 'ResearchState':
|
|
"""Load research state from file"""
|
|
with open(filepath, 'r') as f:
|
|
data = json.load(f)
|
|
|
|
return cls(
|
|
query=data['query'],
|
|
mode=ResearchMode(data['mode']),
|
|
phase=ResearchPhase(data['phase']),
|
|
scope=data['scope'],
|
|
plan=data['plan'],
|
|
sources=[Source(**s) for s in data['sources']],
|
|
findings=data['findings'],
|
|
synthesis=data['synthesis'],
|
|
critique=data['critique'],
|
|
report=data['report'],
|
|
metadata=data['metadata']
|
|
)
|
|
|
|
|
|
class ResearchEngine:
|
|
"""Main research orchestration engine"""
|
|
|
|
def __init__(self, mode: ResearchMode = ResearchMode.STANDARD):
|
|
self.mode = mode
|
|
self.state: Optional[ResearchState] = None
|
|
self.output_dir = Path.home() / ".claude" / "research_output"
|
|
self.output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
def initialize_research(self, query: str) -> ResearchState:
|
|
"""Initialize new research session"""
|
|
self.state = ResearchState(
|
|
query=query,
|
|
mode=self.mode,
|
|
phase=ResearchPhase.SCOPE,
|
|
scope={},
|
|
plan={},
|
|
sources=[],
|
|
findings=[],
|
|
synthesis={},
|
|
critique={},
|
|
report="",
|
|
metadata={
|
|
'started_at': datetime.now().isoformat(),
|
|
'version': '1.0'
|
|
}
|
|
)
|
|
return self.state
|
|
|
|
def get_phase_instructions(self, phase: ResearchPhase) -> str:
|
|
"""Get instructions for current phase"""
|
|
instructions = {
|
|
ResearchPhase.SCOPE: """
|
|
# Phase 1: SCOPE
|
|
|
|
Your task: Define research boundaries and success criteria
|
|
|
|
## Execute:
|
|
1. Decompose the question into 3-5 core components
|
|
2. Identify 2-4 key stakeholder perspectives
|
|
3. Define what's IN scope and what's OUT of scope
|
|
4. List 3-5 success criteria for this research
|
|
5. Document 3-5 assumptions that need validation
|
|
|
|
## Output Format:
|
|
```json
|
|
{
|
|
"core_components": ["component1", "component2", ...],
|
|
"stakeholder_perspectives": ["perspective1", "perspective2", ...],
|
|
"in_scope": ["item1", "item2", ...],
|
|
"out_of_scope": ["item1", "item2", ...],
|
|
"success_criteria": ["criteria1", "criteria2", ...],
|
|
"assumptions": ["assumption1", "assumption2", ...]
|
|
}
|
|
```
|
|
|
|
Use extended reasoning to explore multiple framings before finalizing scope.
|
|
""",
|
|
ResearchPhase.PLAN: """
|
|
# Phase 2: PLAN
|
|
|
|
Your task: Create intelligent research roadmap
|
|
|
|
## Execute:
|
|
1. Identify 5-10 primary sources to investigate
|
|
2. List 5-10 secondary/backup sources
|
|
3. Map knowledge dependencies (what must be understood first)
|
|
4. Create 10-15 search query variations
|
|
5. Plan triangulation approach (how to verify claims)
|
|
6. Define 3-5 quality gates
|
|
|
|
## Output Format:
|
|
```json
|
|
{
|
|
"primary_sources": ["source_type1", "source_type2", ...],
|
|
"secondary_sources": ["source_type1", "source_type2", ...],
|
|
"knowledge_dependencies": {"concept1": ["prerequisite1", "prerequisite2"], ...},
|
|
"search_queries": ["query1", "query2", ...],
|
|
"triangulation_strategy": "description of verification approach",
|
|
"quality_gates": ["gate1", "gate2", ...]
|
|
}
|
|
```
|
|
|
|
Use Graph-of-Thoughts: branch into 3-4 potential research paths, evaluate, then converge on optimal strategy.
|
|
""",
|
|
ResearchPhase.RETRIEVE: """
|
|
# Phase 3: RETRIEVE
|
|
|
|
Your task: Systematically collect information from multiple sources
|
|
|
|
## Execute:
|
|
1. Use WebSearch with iterative query refinement (minimum 10 searches)
|
|
2. Use WebFetch to deep-dive into 5-10 most promising sources
|
|
3. Extract key passages with metadata
|
|
4. Track information gaps
|
|
5. Follow 2-3 promising tangents
|
|
6. Ensure source diversity (different domains, perspectives)
|
|
|
|
## Tools to Use:
|
|
- WebSearch: For current information and broad coverage
|
|
- WebFetch: For detailed extraction from specific URLs
|
|
- Grep/Read: For local documentation if relevant
|
|
- Task: Spawn 2-3 parallel retrieval agents for efficiency
|
|
|
|
## Output:
|
|
Store all sources with metadata. Each source should include:
|
|
- URL/location
|
|
- Title
|
|
- Key excerpts
|
|
- Relevance score
|
|
- Source type
|
|
- Retrieved timestamp
|
|
|
|
Aim for 15-30 distinct sources minimum.
|
|
""",
|
|
ResearchPhase.TRIANGULATE: """
|
|
# Phase 4: TRIANGULATE
|
|
|
|
Your task: Validate information across multiple independent sources
|
|
|
|
## Execute:
|
|
1. List all major claims from retrieved information
|
|
2. For each claim, find 3+ independent confirmatory sources
|
|
3. Flag any contradictions or uncertainties
|
|
4. Assess source credibility (domain expertise, recency, bias)
|
|
5. Document consensus areas vs. debate areas
|
|
6. Mark verification status for each claim
|
|
|
|
## Quality Standards:
|
|
- Core claims MUST have 3+ independent sources
|
|
- Flag any single-source claims as "unverified"
|
|
- Note information recency
|
|
- Identify potential biases
|
|
|
|
## Output Format:
|
|
```json
|
|
{
|
|
"verified_claims": [
|
|
{
|
|
"claim": "statement",
|
|
"sources": ["source1", "source2", "source3"],
|
|
"confidence": "high|medium|low"
|
|
}
|
|
],
|
|
"unverified_claims": [...],
|
|
"contradictions": [
|
|
{
|
|
"topic": "what's contradicted",
|
|
"viewpoint1": {"claim": "...", "sources": [...]},
|
|
"viewpoint2": {"claim": "...", "sources": [...]}
|
|
}
|
|
]
|
|
}
|
|
```
|
|
""",
|
|
ResearchPhase.SYNTHESIZE: """
|
|
# Phase 5: SYNTHESIZE
|
|
|
|
Your task: Connect insights and generate novel understanding
|
|
|
|
## Execute:
|
|
1. Identify 5-10 key patterns across sources
|
|
2. Map relationships between concepts
|
|
3. Generate 3-5 insights that go beyond source material
|
|
4. Create conceptual frameworks or mental models
|
|
5. Build argument structures
|
|
6. Develop evidence hierarchies
|
|
|
|
## Use Extended Reasoning:
|
|
- Explore non-obvious connections
|
|
- Consider second-order implications
|
|
- Think about what sources might be missing
|
|
- Generate novel hypotheses
|
|
|
|
## Output Format:
|
|
```json
|
|
{
|
|
"patterns": ["pattern1", "pattern2", ...],
|
|
"concept_relationships": {"concept1": ["related_to1", "related_to2"], ...},
|
|
"novel_insights": ["insight1", "insight2", ...],
|
|
"frameworks": ["framework_description1", ...],
|
|
"key_arguments": [
|
|
{
|
|
"argument": "main claim",
|
|
"supporting_evidence": ["evidence1", "evidence2"],
|
|
"strength": "strong|moderate|weak"
|
|
}
|
|
]
|
|
}
|
|
```
|
|
""",
|
|
ResearchPhase.CRITIQUE: """
|
|
# Phase 6: CRITIQUE
|
|
|
|
Your task: Rigorously evaluate research quality
|
|
|
|
## Execute Red Team Analysis:
|
|
1. Check logical consistency
|
|
2. Verify citation completeness
|
|
3. Identify gaps or weaknesses
|
|
4. Assess balance and objectivity
|
|
5. Test alternative interpretations
|
|
6. Challenge assumptions
|
|
|
|
## Red Team Questions:
|
|
- What's missing from this research?
|
|
- What could be wrong?
|
|
- What alternative explanations exist?
|
|
- What biases might be present?
|
|
- What counterfactuals should be considered?
|
|
- What would a skeptic say?
|
|
|
|
## Output Format:
|
|
```json
|
|
{
|
|
"strengths": ["strength1", "strength2", ...],
|
|
"weaknesses": ["weakness1", "weakness2", ...],
|
|
"gaps": ["gap1", "gap2", ...],
|
|
"biases": ["bias1", "bias2", ...],
|
|
"improvements_needed": [
|
|
{
|
|
"issue": "description",
|
|
"recommendation": "how to fix",
|
|
"priority": "high|medium|low"
|
|
}
|
|
]
|
|
}
|
|
```
|
|
""",
|
|
ResearchPhase.REFINE: """
|
|
# Phase 7: REFINE
|
|
|
|
Your task: Address gaps and strengthen weak areas
|
|
|
|
## Execute:
|
|
1. Conduct additional research for identified gaps
|
|
2. Strengthen weak arguments with more evidence
|
|
3. Add missing perspectives
|
|
4. Resolve contradictions where possible
|
|
5. Enhance clarity and structure
|
|
6. Verify all revised content
|
|
|
|
## Focus On:
|
|
- High priority improvements from critique
|
|
- Missing stakeholder perspectives
|
|
- Weak evidence chains
|
|
- Unclear explanations
|
|
|
|
## Output:
|
|
Updated findings, sources, and synthesis with improvements documented.
|
|
""",
|
|
ResearchPhase.PACKAGE: """
|
|
# Phase 8: PACKAGE
|
|
|
|
Your task: Deliver professional, actionable research report
|
|
|
|
## Generate Complete Report:
|
|
|
|
```markdown
|
|
# Research Report: [Topic]
|
|
|
|
## Executive Summary
|
|
[3-5 key findings bullets]
|
|
[Primary recommendation]
|
|
[Confidence level: High/Medium/Low]
|
|
|
|
## Introduction
|
|
### Research Question
|
|
[Original question]
|
|
|
|
### Scope & Methodology
|
|
[What was investigated and how]
|
|
|
|
### Key Assumptions
|
|
[Important assumptions made]
|
|
|
|
## Main Analysis
|
|
|
|
### Finding 1: [Title]
|
|
[Detailed explanation with evidence]
|
|
[Citations: [1], [2], [3]]
|
|
|
|
### Finding 2: [Title]
|
|
[Detailed explanation with evidence]
|
|
[Citations: [4], [5], [6]]
|
|
|
|
[Continue for all findings...]
|
|
|
|
## Synthesis & Insights
|
|
[Patterns and connections]
|
|
[Novel insights]
|
|
[Implications]
|
|
|
|
## Limitations & Caveats
|
|
[Known gaps]
|
|
[Assumptions]
|
|
[Areas of uncertainty]
|
|
|
|
## Recommendations
|
|
[Action items]
|
|
[Next steps]
|
|
[Further research needs]
|
|
|
|
## Bibliography
|
|
[1] Source 1 full citation
|
|
[2] Source 2 full citation
|
|
...
|
|
|
|
## Appendix: Methodology
|
|
[Research process]
|
|
[Sources consulted]
|
|
[Verification approach]
|
|
```
|
|
|
|
Save report to file with timestamp.
|
|
"""
|
|
}
|
|
|
|
return instructions.get(phase, "No instructions available for this phase")
|
|
|
|
def execute_phase(self, phase: ResearchPhase) -> Dict[str, Any]:
|
|
"""Execute a research phase"""
|
|
print(f"\n{'='*80}")
|
|
print(f"PHASE {phase.value.upper()}: Starting...")
|
|
print(f"{'='*80}\n")
|
|
|
|
instructions = self.get_phase_instructions(phase)
|
|
print(instructions)
|
|
|
|
# In real usage, Claude will execute these instructions
|
|
# This returns a structured result that Claude should populate
|
|
result = {
|
|
'phase': phase.value,
|
|
'status': 'instructions_displayed',
|
|
'timestamp': datetime.now().isoformat()
|
|
}
|
|
|
|
return result
|
|
|
|
def run_pipeline(self, query: str) -> str:
|
|
"""Run complete research pipeline"""
|
|
print(f"\n{'#'*80}")
|
|
print(f"# DEEP RESEARCH ENGINE")
|
|
print(f"# Query: {query}")
|
|
print(f"# Mode: {self.mode.value}")
|
|
print(f"{'#'*80}\n")
|
|
|
|
# Initialize research
|
|
self.initialize_research(query)
|
|
|
|
# Determine phases based on mode
|
|
phases = self._get_phases_for_mode()
|
|
|
|
# Execute each phase
|
|
for phase in phases:
|
|
self.state.phase = phase
|
|
result = self.execute_phase(phase)
|
|
|
|
# Save state after each phase
|
|
state_file = self.output_dir / f"research_state_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
|
|
self.state.save(state_file)
|
|
print(f"\n✓ Phase {phase.value} complete. State saved to: {state_file}\n")
|
|
|
|
# Generate report path
|
|
report_file = self.output_dir / f"research_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
|
|
|
|
print(f"\n{'='*80}")
|
|
print(f"RESEARCH PIPELINE COMPLETE")
|
|
print(f"Report will be saved to: {report_file}")
|
|
print(f"{'='*80}\n")
|
|
|
|
return str(report_file)
|
|
|
|
def _get_phases_for_mode(self) -> List[ResearchPhase]:
|
|
"""Get phases based on research mode"""
|
|
if self.mode == ResearchMode.QUICK:
|
|
return [
|
|
ResearchPhase.SCOPE,
|
|
ResearchPhase.RETRIEVE,
|
|
ResearchPhase.PACKAGE
|
|
]
|
|
elif self.mode == ResearchMode.STANDARD:
|
|
return [
|
|
ResearchPhase.SCOPE,
|
|
ResearchPhase.PLAN,
|
|
ResearchPhase.RETRIEVE,
|
|
ResearchPhase.TRIANGULATE,
|
|
ResearchPhase.SYNTHESIZE,
|
|
ResearchPhase.PACKAGE
|
|
]
|
|
elif self.mode == ResearchMode.DEEP:
|
|
return list(ResearchPhase)
|
|
elif self.mode == ResearchMode.ULTRADEEP:
|
|
# In ultradeep, we might iterate some phases
|
|
return list(ResearchPhase)
|
|
|
|
return list(ResearchPhase)
|
|
|
|
|
|
def main():
|
|
"""CLI entry point"""
|
|
parser = argparse.ArgumentParser(
|
|
description="Deep Research Engine for Claude Code",
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
epilog="""
|
|
Examples:
|
|
python research_engine.py --query "state of quantum computing 2025" --mode deep
|
|
python research_engine.py --query "PostgreSQL vs Supabase comparison" --mode standard
|
|
python research_engine.py -q "longevity biotech funding trends" -m ultradeep
|
|
"""
|
|
)
|
|
|
|
parser.add_argument(
|
|
'--query', '-q',
|
|
type=str,
|
|
required=True,
|
|
help='Research question or topic'
|
|
)
|
|
|
|
parser.add_argument(
|
|
'--mode', '-m',
|
|
type=str,
|
|
choices=['quick', 'standard', 'deep', 'ultradeep'],
|
|
default='standard',
|
|
help='Research depth mode (default: standard)'
|
|
)
|
|
|
|
parser.add_argument(
|
|
'--resume',
|
|
type=str,
|
|
help='Resume from saved state file'
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
# Initialize engine
|
|
mode = ResearchMode(args.mode)
|
|
engine = ResearchEngine(mode=mode)
|
|
|
|
if args.resume:
|
|
# Load previous state
|
|
state_file = Path(args.resume)
|
|
if not state_file.exists():
|
|
print(f"Error: State file not found: {state_file}", file=sys.stderr)
|
|
sys.exit(1)
|
|
engine.state = ResearchState.load(state_file)
|
|
print(f"Resumed research from: {state_file}")
|
|
|
|
# Run pipeline
|
|
report_path = engine.run_pipeline(args.query)
|
|
|
|
print(f"\nResearch complete! Report path: {report_path}")
|
|
print(f"\nNow Claude should execute each phase using the displayed instructions.")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|