feat: sync full workspace including web modules, docs, and configurations to Gitea
Optimized the root .gitignore to exclude virtual environments, node modules, and temp folders to ensure clean and lightweight version tracking. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
330
axhub-make/skills/third-party/deep-research/scripts/md_to_html.py
vendored
Normal file
330
axhub-make/skills/third-party/deep-research/scripts/md_to_html.py
vendored
Normal file
@@ -0,0 +1,330 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Markdown to HTML converter for research reports
|
||||
Properly converts markdown sections to HTML while preserving structure and formatting
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Tuple
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def convert_markdown_to_html(markdown_text: str) -> Tuple[str, str]:
    """
    Convert a markdown report to HTML in two parts: content and bibliography.

    The text is split at the first line that begins with ``## Bibliography``.
    Everything before that heading is converted as report content; everything
    after it (up to the end of the text) is converted as the bibliography.

    Args:
        markdown_text: Full markdown report text

    Returns:
        Tuple of (content_html, bibliography_html). ``bibliography_html`` is
        an empty string when the report has no bibliography heading.
    """
    # Anchor the match to the start of a line so that a "### Bibliography"
    # sub-heading is not mistaken for the top-level heading, and split only
    # once so that text following a later repetition of the heading is kept
    # in the bibliography instead of being silently discarded.
    parts = re.split(
        r'^## Bibliography',
        markdown_text,
        maxsplit=1,
        flags=re.MULTILINE,
    )
    content_md = parts[0]
    bibliography_md = parts[1] if len(parts) > 1 else ""

    # Convert content (everything except bibliography)
    content_html = _convert_content_section(content_md)

    # Convert bibliography separately
    bibliography_html = _convert_bibliography_section(bibliography_md)

    return content_html, bibliography_html
|
||||
|
||||
|
||||
def _convert_content_section(markdown: str) -> str:
    """Convert the main (non-bibliography) markdown content to HTML.

    Steps, in order:
      1. Drop everything before the first line starting with ``## `` (title
         and front matter). If there is no such line the result is empty.
      2. Turn ``##`` / ``###`` / ``####`` headings into section, subsection
         and subsubsection markup. Each ``##`` heading opens a
         ``<div class="section">``.
      3. Convert ``**bold**``, ``*italic*`` and inline code spans.
      4. Convert lists, tables and paragraphs via the sibling helpers.
      5. Close the section divs and wrap the "Executive Summary" heading and
         its body in ``<div class="executive-summary">``.

    Args:
        markdown: Markdown text of the report without the bibliography.

    Returns:
        The HTML for the content sections.
    """
    # Keep only the text from the first "## " heading onwards.
    lines = markdown.split('\n')
    first_section = next(
        (idx for idx, line in enumerate(lines) if line.startswith('## ')),
        len(lines),
    )
    html = '\n'.join(lines[first_section:])

    # Headings. "## " cannot match "### " lines because the third character
    # of those is "#", not a space, so the order of the rules is not critical.
    heading_rules = (
        (r'^## (.+)$', r'<div class="section"><h2 class="section-title">\1</h2>'),
        (r'^### (.+)$', r'<h3 class="subsection-title">\1</h3>'),
        (r'^#### (.+)$', r'<h4 class="subsubsection-title">\1</h4>'),
    )
    for pattern, replacement in heading_rules:
        html = re.sub(pattern, replacement, html, flags=re.MULTILINE)

    # Convert **bold** text
    html = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', html)

    # Convert *italic* text. The opening "*" must not be followed by
    # whitespace and the closing "*" must not be preceded by whitespace;
    # otherwise the "* " marker of a list item would be paired with a later
    # "*" on the same line and the item would be mangled before list
    # conversion runs.
    html = re.sub(r'\*(?!\s)(.+?)(?<!\s)\*', r'<em>\1</em>', html)

    # Convert inline code `code`
    html = re.sub(r'`(.+?)`', r'<code>\1</code>', html)

    # Block-level conversions
    html = _convert_lists(html)
    html = _convert_tables(html)
    html = _convert_paragraphs(html)

    # Close all open sections
    html = _close_sections(html)

    # Wrap the executive summary (first occurrence) if present.
    summary_heading = '<h2 class="section-title">Executive Summary</h2>'
    summary_open = '<div class="executive-summary">'
    html = html.replace(summary_heading, summary_open + summary_heading, 1)

    start = html.find(summary_open)
    if start != -1:
        # The wrapper must be closed where the summary's section ends, i.e.
        # right before the next section div, or at the end of the document
        # when the summary is the last section. Searching for an "</h2>"
        # directly followed by a section div does not work here because the
        # sections have already been closed with "</div>" lines in between.
        next_section = html.find('\n<div class="section">', start)
        if next_section == -1:
            html += '</div>'
        else:
            html = html[:next_section] + '</div>' + html[next_section:]

    return html
|
||||
|
||||
|
||||
def _convert_bibliography_section(markdown: str) -> str:
|
||||
"""Convert bibliography section to HTML"""
|
||||
if not markdown.strip():
|
||||
return ""
|
||||
|
||||
html = markdown
|
||||
|
||||
# Convert each [N] citation to a proper bibliography entry
|
||||
# Look for patterns like [1] Title - URL
|
||||
html = re.sub(
|
||||
r'\[(\d+)\]\s*(.+?)\s*-\s*(https?://[^\s\)]+)',
|
||||
r'<div class="bib-entry"><span class="bib-number">[\1]</span> <a href="\3" target="_blank">\2</a></div>',
|
||||
html
|
||||
)
|
||||
|
||||
# Convert any remaining **bold** sections
|
||||
html = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', html)
|
||||
|
||||
# Wrap in bibliography content div
|
||||
html = f'<div class="bibliography-content">{html}</div>'
|
||||
|
||||
return html
|
||||
|
||||
|
||||
def _convert_lists(html: str) -> str:
|
||||
"""Convert markdown lists to HTML lists"""
|
||||
lines = html.split('\n')
|
||||
result = []
|
||||
in_list = False
|
||||
list_level = 0
|
||||
|
||||
for i, line in enumerate(lines):
|
||||
stripped = line.strip()
|
||||
|
||||
# Check for unordered list item
|
||||
if stripped.startswith('- ') or stripped.startswith('* '):
|
||||
if not in_list:
|
||||
result.append('<ul>')
|
||||
in_list = True
|
||||
list_level = len(line) - len(line.lstrip())
|
||||
|
||||
# Get the content after the marker
|
||||
content = stripped[2:]
|
||||
result.append(f'<li>{content}</li>')
|
||||
|
||||
# Check for ordered list item
|
||||
elif re.match(r'^\d+\.\s', stripped):
|
||||
if not in_list:
|
||||
result.append('<ol>')
|
||||
in_list = True
|
||||
list_level = len(line) - len(line.lstrip())
|
||||
|
||||
# Get the content after the number and period
|
||||
content = re.sub(r'^\d+\.\s', '', stripped)
|
||||
result.append(f'<li>{content}</li>')
|
||||
|
||||
else:
|
||||
# Not a list item
|
||||
if in_list:
|
||||
# Check if we're still in the list (indented continuation)
|
||||
current_level = len(line) - len(line.lstrip())
|
||||
if current_level > list_level and stripped:
|
||||
# Continuation of previous list item
|
||||
if result[-1].endswith('</li>'):
|
||||
result[-1] = result[-1][:-5] + ' ' + stripped + '</li>'
|
||||
continue
|
||||
else:
|
||||
# End of list
|
||||
result.append('</ul>' if '<ul>' in '\n'.join(result[-10:]) else '</ol>')
|
||||
in_list = False
|
||||
list_level = 0
|
||||
|
||||
result.append(line)
|
||||
|
||||
# Close any remaining open list
|
||||
if in_list:
|
||||
result.append('</ul>' if '<ul>' in '\n'.join(result[-10:]) else '</ol>')
|
||||
|
||||
return '\n'.join(result)
|
||||
|
||||
|
||||
def _convert_tables(html: str) -> str:
|
||||
"""Convert markdown tables to HTML tables"""
|
||||
lines = html.split('\n')
|
||||
result = []
|
||||
in_table = False
|
||||
|
||||
for i, line in enumerate(lines):
|
||||
if '|' in line and line.strip().startswith('|'):
|
||||
if not in_table:
|
||||
result.append('<table>')
|
||||
in_table = True
|
||||
# This is the header row
|
||||
cells = [cell.strip() for cell in line.split('|')[1:-1]]
|
||||
result.append('<thead><tr>')
|
||||
for cell in cells:
|
||||
result.append(f'<th>{cell}</th>')
|
||||
result.append('</tr></thead>')
|
||||
result.append('<tbody>')
|
||||
elif '---' in line:
|
||||
# Skip separator row
|
||||
continue
|
||||
else:
|
||||
# Data row
|
||||
cells = [cell.strip() for cell in line.split('|')[1:-1]]
|
||||
result.append('<tr>')
|
||||
for cell in cells:
|
||||
result.append(f'<td>{cell}</td>')
|
||||
result.append('</tr>')
|
||||
else:
|
||||
if in_table:
|
||||
result.append('</tbody></table>')
|
||||
in_table = False
|
||||
result.append(line)
|
||||
|
||||
if in_table:
|
||||
result.append('</tbody></table>')
|
||||
|
||||
return '\n'.join(result)
|
||||
|
||||
|
||||
def _convert_paragraphs(html: str) -> str:
|
||||
"""Wrap non-HTML lines in paragraph tags"""
|
||||
lines = html.split('\n')
|
||||
result = []
|
||||
in_paragraph = False
|
||||
|
||||
for line in lines:
|
||||
stripped = line.strip()
|
||||
|
||||
# Skip empty lines
|
||||
if not stripped:
|
||||
if in_paragraph:
|
||||
result.append('</p>')
|
||||
in_paragraph = False
|
||||
result.append(line)
|
||||
continue
|
||||
|
||||
# Skip lines that are already HTML tags
|
||||
if (stripped.startswith('<') and stripped.endswith('>')) or \
|
||||
stripped.startswith('</') or \
|
||||
'<h' in stripped or '<div' in stripped or '<ul' in stripped or \
|
||||
'<ol' in stripped or '<li' in stripped or '<table' in stripped or \
|
||||
'</div>' in stripped or '</ul>' in stripped or '</ol>' in stripped:
|
||||
if in_paragraph:
|
||||
result.append('</p>')
|
||||
in_paragraph = False
|
||||
result.append(line)
|
||||
continue
|
||||
|
||||
# Regular text line - wrap in paragraph
|
||||
if not in_paragraph:
|
||||
result.append('<p>' + line)
|
||||
in_paragraph = True
|
||||
else:
|
||||
result.append(line)
|
||||
|
||||
if in_paragraph:
|
||||
result.append('</p>')
|
||||
|
||||
return '\n'.join(result)
|
||||
|
||||
|
||||
def _close_sections(html: str) -> str:
|
||||
"""Close all open section divs"""
|
||||
# Count open and closed divs
|
||||
open_divs = html.count('<div class="section">')
|
||||
closed_divs = html.count('</div>')
|
||||
|
||||
# Add closing divs for sections
|
||||
# Each section should be closed before the next section starts
|
||||
lines = html.split('\n')
|
||||
result = []
|
||||
section_open = False
|
||||
|
||||
for i, line in enumerate(lines):
|
||||
if '<div class="section">' in line:
|
||||
if section_open:
|
||||
result.append('</div>') # Close previous section
|
||||
section_open = True
|
||||
result.append(line)
|
||||
|
||||
# Close final section if still open
|
||||
if section_open:
|
||||
result.append('</div>')
|
||||
|
||||
return '\n'.join(result)
|
||||
|
||||
|
||||
def main():
    """Convert the markdown file named on the command line and print a preview.

    Prints the first 1000 characters of the content HTML and the first 500
    characters of the bibliography HTML. Exits with status 1 when no file
    argument is given or the file does not exist.
    """
    import sys

    if len(sys.argv) < 2:
        print("Usage: python md_to_html.py <markdown_file>", file=sys.stderr)
        sys.exit(1)

    md_file = Path(sys.argv[1])
    if not md_file.exists():
        print(f"Error: File {md_file} not found", file=sys.stderr)
        sys.exit(1)

    # Read explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    markdown_text = md_file.read_text(encoding="utf-8")
    content_html, bib_html = convert_markdown_to_html(markdown_text)

    print("=== CONTENT HTML ===")
    print(content_html[:1000])
    print("\n=== BIBLIOGRAPHY HTML ===")
    print(bib_html[:500])


# Run the preview only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user