Files
王冕 a27e3b8e43 feat: sync full workspace including web modules, docs, and configurations to Gitea
Optimized the root .gitignore to exclude virtual environments, node modules,
and temp folders to ensure clean and lightweight version tracking.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-09 18:12:25 +08:00

293 lines
9.3 KiB
Python

#!/usr/bin/env python3
"""
Source Credibility Evaluator
Assesses source quality, credibility, and potential biases
"""
from dataclasses import dataclass
from typing import List, Dict, Optional
from urllib.parse import urlparse
from datetime import datetime, timedelta
import re
@dataclass
class CredibilityScore:
    """Result of one source credibility assessment.

    All numeric fields are on a 0-100 scale.

    Attributes:
        overall_score: Combined credibility score.
        domain_authority: Reputation score of the hosting domain.
        recency: Score reflecting how recent the information is.
        expertise: Score reflecting expertise signals of the source.
        bias_score: Neutrality score; a higher value means more neutral.
        factors: Short human-readable notes keyed by factor name.
        recommendation: One of "high_trust", "moderate_trust",
            "low_trust" or "verify".
    """

    overall_score: float
    domain_authority: float
    recency: float
    expertise: float
    bias_score: float
    factors: Dict[str, str]
    recommendation: str
class SourceEvaluator:
    """Evaluate the credibility and quality of a web source.

    Four component scores (domain authority, recency, expertise and
    neutrality) are computed from the URL and optional metadata, then
    combined into a weighted overall score and a trust recommendation.
    All scores are on a 0-100 scale.
    """

    # Domain reputation tiers. An entry matches the host itself and any of
    # its subdomains (e.g. 'europa.eu' also covers 'ec.europa.eu').
    HIGH_AUTHORITY_DOMAINS = {
        # Academic & Research
        'arxiv.org', 'nature.com', 'science.org', 'cell.com', 'nejm.org',
        'thelancet.com', 'springer.com', 'sciencedirect.com', 'plos.org',
        'ieee.org', 'acm.org', 'pubmed.ncbi.nlm.nih.gov',
        # Government & International Organizations
        'nih.gov', 'cdc.gov', 'who.int', 'fda.gov', 'nasa.gov',
        'gov.uk', 'europa.eu', 'un.org',
        # Established Tech Documentation
        'docs.python.org', 'developer.mozilla.org', 'docs.microsoft.com',
        'cloud.google.com', 'aws.amazon.com', 'kubernetes.io',
        # Reputable News (Fact-check verified); nature.com, listed above,
        # already covers its news section.
        'reuters.com', 'apnews.com', 'bbc.com', 'economist.com',
        'scientificamerican.com',
    }
    MODERATE_AUTHORITY_DOMAINS = {
        # Tech News & Analysis
        'techcrunch.com', 'theverge.com', 'arstechnica.com', 'wired.com',
        'zdnet.com', 'cnet.com',
        # Industry Publications
        'forbes.com', 'bloomberg.com', 'wsj.com', 'ft.com',
        # Educational
        'wikipedia.org', 'britannica.com', 'khanacademy.org',
        # Tech Blogs (established)
        'medium.com', 'dev.to', 'stackoverflow.com', 'github.com',
    }
    LOW_AUTHORITY_INDICATORS = [
        'blogspot.com', 'wordpress.com', 'wix.com', 'substack.com'
    ]

    # (upper age bound in days, score) pairs, checked in order; anything
    # older than the last bound scores 30.0.
    _RECENCY_BANDS = (
        (90, 100.0),    # < 3 months
        (365, 85.0),    # < 1 year
        (730, 70.0),    # < 2 years
        (1825, 50.0),   # < 5 years
    )

    def evaluate_source(
        self,
        url: str,
        title: str,
        content: Optional[str] = None,
        publication_date: Optional[str] = None,
        author: Optional[str] = None
    ) -> "CredibilityScore":
        """Evaluate a source and return its credibility assessment.

        Args:
            url: Address of the source; its host drives the domain checks.
            title: Title of the source.
            content: Optional body text, scanned for balanced language.
            publication_date: Optional ISO-8601 date or datetime string.
            author: Optional author name, scanned for credential markers.

        Returns:
            A CredibilityScore whose numeric fields are rounded to two
            decimals.
        """
        domain = self._extract_domain(url)

        # Component scores
        domain_score = self._evaluate_domain_authority(domain)
        recency_score = self._evaluate_recency(publication_date)
        expertise_score = self._evaluate_expertise(domain, title, author)
        bias_score = self._evaluate_bias(domain, title, content)

        # Overall score is a weighted average; the weights sum to 1.0.
        overall = (
            domain_score * 0.35 +
            recency_score * 0.20 +
            expertise_score * 0.25 +
            bias_score * 0.20
        )

        factors = self._identify_factors(
            domain, domain_score, recency_score, expertise_score, bias_score
        )
        recommendation = self._generate_recommendation(overall)

        return CredibilityScore(
            overall_score=round(overall, 2),
            domain_authority=round(domain_score, 2),
            recency=round(recency_score, 2),
            expertise=round(expertise_score, 2),
            bias_score=round(bias_score, 2),
            factors=factors,
            recommendation=recommendation
        )

    @staticmethod
    def _domain_suffixes(domain: str) -> List[str]:
        """Return the domain and each of its parent domains, longest first.

        'a.b.c' yields ['a.b.c', 'b.c', 'c'].
        """
        labels = domain.split('.')
        return ['.'.join(labels[i:]) for i in range(len(labels))]

    def _extract_domain(self, url: str) -> str:
        """Extract the lower-cased host name from a URL.

        Port and userinfo are dropped and a single leading 'www.' is
        removed. A URL without a scheme ('example.com/page') is treated
        as if it started with '//' so its host is still recognised.
        Returns '' when no host can be found.
        """
        if not url:
            return ''
        if '://' not in url and not url.startswith('//'):
            url = '//' + url
        domain = (urlparse(url).hostname or '').lower().rstrip('.')
        # Strip only a leading 'www.'; str.replace would also mangle
        # hosts that merely contain that text (e.g. 'awww.example.com').
        if domain.startswith('www.'):
            domain = domain[len('www.'):]
        return domain

    def _evaluate_domain_authority(self, domain: str) -> float:
        """Evaluate domain authority (0-100).

        Returns 90.0 for high-authority domains, 70.0 for moderate ones,
        40.0 for known low-authority hosts and 55.0 for anything else.
        The domain or any of its parent domains may match a tier entry.
        """
        candidates = self._domain_suffixes(domain)
        if any(c in self.HIGH_AUTHORITY_DOMAINS for c in candidates):
            return 90.0
        if any(c in self.MODERATE_AUTHORITY_DOMAINS for c in candidates):
            return 70.0
        if any(c in self.LOW_AUTHORITY_INDICATORS for c in candidates):
            return 40.0
        # Unknown domain - moderate skepticism
        return 55.0

    def _evaluate_recency(self, publication_date: Optional[str]) -> float:
        """Evaluate information recency (0-100).

        Returns 50.0 when the date is missing or cannot be parsed as
        ISO-8601; otherwise a score that decreases with age.
        """
        if not publication_date:
            return 50.0  # Unknown date
        try:
            # A trailing 'Z' is rewritten as an explicit UTC offset.
            pub_date = datetime.fromisoformat(
                publication_date.replace('Z', '+00:00')
            )
        except (AttributeError, TypeError, ValueError):
            return 50.0  # Unparseable date is treated like an unknown one

        # Take "now" in the same timezone as the parsed value (naive when
        # tzinfo is None) so aware and naive datetimes are never mixed.
        age = datetime.now(pub_date.tzinfo) - pub_date
        for max_days, score in self._RECENCY_BANDS:
            if age < timedelta(days=max_days):
                return score
        return 30.0

    def _evaluate_expertise(
        self,
        domain: str,
        title: str,
        author: Optional[str]
    ) -> float:
        """Evaluate source expertise (0-100).

        Starts at 50 and adds bonuses for academic keywords in the
        domain, government / WHO hosts, documentation sources and
        credential markers in the author name. The result is capped
        at 100.
        """
        score = 50.0
        title_lower = (title or '').lower()

        # Academic/research domains get high expertise.
        # NOTE(review): substring match, so e.g. 'acm' also matches
        # 'acme.com'; kept as-is to preserve hits such as 'sciencedirect'.
        if any(k in domain for k in ('arxiv', 'nature', 'science', 'ieee', 'acm')):
            score += 30

        # Government/official sources: any 'gov' label (cdc.gov, gov.uk,
        # service.gov.uk) or who.int and its subdomains.
        if 'gov' in domain.split('.') or 'who.int' in self._domain_suffixes(domain):
            score += 25

        # Technical documentation
        if 'docs.' in domain or 'documentation' in title_lower:
            score += 20

        # Author credentials (if available)
        if author:
            author_lower = author.lower()
            if any(marker in author_lower for marker in ('dr.', 'phd', 'professor')):
                score += 15

        return min(score, 100.0)

    def _evaluate_bias(
        self,
        domain: str,
        title: str,
        content: Optional[str]
    ) -> float:
        """Evaluate potential bias (0-100, higher = more neutral).

        Starts at 70, subtracts 20 for a sensational title, adds 20 for
        academic keywords in the domain and 10 when the content uses
        balancing phrases. The result is clamped to 0-100.
        """
        score = 70.0  # Start neutral

        # Check for sensationalism in title
        sensational_indicators = (
            '!', 'shocking', 'unbelievable', 'you won\'t believe',
            'secret', 'they don\'t want you to know'
        )
        title_lower = (title or '').lower()
        if any(indicator in title_lower for indicator in sensational_indicators):
            score -= 20

        # Academic sources are typically less biased
        if any(k in domain for k in ('arxiv', 'nature', 'science', 'ieee')):
            score += 20

        # Check for balance in content (if available)
        if content:
            content_lower = content.lower()
            balanced_indicators = (
                'however', 'although', 'on the other hand', 'critics argue'
            )
            if any(indicator in content_lower for indicator in balanced_indicators):
                score += 10

        return min(max(score, 0), 100.0)

    def _identify_factors(
        self,
        domain: str,
        domain_score: float,
        recency_score: float,
        expertise_score: float,
        bias_score: float
    ) -> Dict[str, str]:
        """Identify key credibility factors.

        Returns a dict with an entry only for each component whose score
        is notably high or low; mid-range components are omitted.
        `domain` is accepted but not used.
        """
        factors = {}

        if domain_score >= 85:
            factors['domain'] = "High authority domain"
        elif domain_score <= 45:
            factors['domain'] = "Low authority domain - verify claims"

        if recency_score >= 85:
            factors['recency'] = "Recent information"
        elif recency_score <= 40:
            factors['recency'] = "Outdated information - verify currency"

        if expertise_score >= 80:
            factors['expertise'] = "Expert source"
        elif expertise_score <= 45:
            factors['expertise'] = "Limited expertise indicators"

        if bias_score >= 80:
            factors['bias'] = "Balanced perspective"
        elif bias_score <= 50:
            factors['bias'] = "Potential bias detected"

        return factors

    def _generate_recommendation(self, overall_score: float) -> str:
        """Map an overall score to a trust recommendation label."""
        if overall_score >= 80:
            return "high_trust"
        if overall_score >= 60:
            return "moderate_trust"
        if overall_score >= 40:
            return "low_trust"
        return "verify"
# Demo: score a few sample sources when the module is run as a script.
if __name__ == '__main__':
    # Three samples: a research article, a sensational blog post and
    # official technical documentation.
    samples = (
        dict(
            url='https://www.nature.com/articles/s41586-2025-12345',
            title='Breakthrough in Quantum Computing',
            publication_date='2025-10-15',
        ),
        dict(
            url='https://someblog.wordpress.com/shocking-discovery',
            title="SHOCKING! You Won't Believe This Discovery!",
            publication_date='2020-01-01',
        ),
        dict(
            url='https://docs.python.org/3/library/asyncio.html',
            title='asyncio — Asynchronous I/O',
            publication_date='2025-11-01',
        ),
    )

    scorer = SourceEvaluator()
    for sample in samples:
        result = scorer.evaluate_source(**sample)
        # One short report per source, preceded by a blank line.
        print(f"\nSource: {sample['title']}")
        print(f"URL: {sample['url']}")
        print(f"Overall Score: {result.overall_score}/100")
        print(f"Recommendation: {result.recommendation}")
        print(f"Factors: {result.factors}")