#!/usr/bin/env python3
"""
Markdown to HTML converter for research reports

Converts the body of a markdown report and its bibliography into two separate
HTML fragments while preserving section structure and inline formatting.
"""

import re
from typing import Tuple
from pathlib import Path

# NOTE(review): the CSS class names used in the generated markup (section,
# section-title, subsection-title, executive-summary, bib-entry, bib-number,
# bibliography-content) are assumed to match the report template's stylesheet.
# Confirm against the template before relying on them.
_SECTION_OPEN = '<div class="section">'
_SECTION_CLOSE = '</div>'
_EXEC_SUMMARY_HEADING = '<h2 class="section-title">Executive Summary</h2>'

_BIBLIOGRAPHY_MARKER = '## Bibliography'

# "1. item", "12. item" ...
_ORDERED_ITEM = re.compile(r'^\d+\.\s')

# One cell of a table divider row: "---", ":---", "---:", ":---:"
_TABLE_DIVIDER_CELL = re.compile(r':?-{3,}:?')

# Lines that begin with one of these tags are already block-level HTML and
# must not be wrapped in <p>. Inline tags (<strong>, <em>, <code>, <a>) are
# deliberately absent so that a fully bold line still becomes a paragraph.
_BLOCK_TAG = re.compile(
    r'</?(?:div|section|h[1-6]|ul|ol|li|table|thead|tbody|tfoot|tr|th|td'
    r'|p|pre|blockquote|hr)\b',
    re.IGNORECASE,
)


def convert_markdown_to_html(markdown_text: str) -> Tuple[str, str]:
    """
    Convert markdown to HTML in two parts: content and bibliography

    Args:
        markdown_text: Full markdown report text

    Returns:
        Tuple of (content_html, bibliography_html). ``bibliography_html`` is
        an empty string when the report has no ``## Bibliography`` heading.
    """
    # Everything before the first bibliography heading is content; everything
    # after it is bibliography (partition keeps later repeats of the marker
    # inside the bibliography instead of silently dropping text).
    content_md, _, bibliography_md = markdown_text.partition(_BIBLIOGRAPHY_MARKER)

    content_html = _convert_content_section(content_md)
    bibliography_html = _convert_bibliography_section(bibliography_md)

    return content_html, bibliography_html


def _convert_content_section(markdown: str) -> str:
    """
    Convert main content sections to HTML

    Args:
        markdown: Report markdown with the bibliography already removed

    Returns:
        HTML fragment in which every ``## `` heading opens a section div.
        Text before the first ``## `` heading (title, front matter) is
        dropped; if there is no such heading the result is an empty string.
    """
    lines = markdown.split('\n')

    # Remove title and front matter: keep from the first "## " heading on.
    first_section = next(
        (idx for idx, line in enumerate(lines) if line.startswith('## ')),
        None,
    )
    html = '' if first_section is None else '\n'.join(lines[first_section:])

    # ## Section Title -> opens a section div and emits an <h2>
    html = re.sub(
        r'^## (.+)$',
        _SECTION_OPEN + r'<h2 class="section-title">\1</h2>',
        html,
        flags=re.MULTILINE,
    )

    # ### Subsection -> <h3>
    html = re.sub(
        r'^### (.+)$',
        r'<h3 class="subsection-title">\1</h3>',
        html,
        flags=re.MULTILINE,
    )

    # #### Subsubsection -> <h4>
    html = re.sub(
        r'^#### (.+)$',
        r'<h4>\1</h4>',
        html,
        flags=re.MULTILINE,
    )

    # **bold** must be handled before *italic* so the double asterisks are
    # consumed first.
    html = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', html)

    # *italic*: the opening asterisk must be followed by a non-space and the
    # closing one preceded by a non-space, so a "* " bullet marker is never
    # mistaken for the start of emphasis.
    html = re.sub(r'\*(?=\S)(.+?)(?<=\S)\*', r'<em>\1</em>', html)

    # `inline code`
    html = re.sub(r'`(.+?)`', r'<code>\1</code>', html)

    html = _convert_lists(html)
    html = _convert_tables(html)
    html = _convert_paragraphs(html)
    html = _close_sections(html)

    # Give the executive summary its own wrapper class. Its closing </div>
    # was already emitted by _close_sections, so only the opening tag changes.
    html = html.replace(
        _SECTION_OPEN + _EXEC_SUMMARY_HEADING,
        '<div class="executive-summary">' + _EXEC_SUMMARY_HEADING,
    )

    return html


def _convert_bibliography_section(markdown: str) -> str:
    """
    Convert bibliography section to HTML

    Args:
        markdown: Text following the bibliography heading

    Returns:
        The text wrapped in a bibliography container div, with every
        ``[N] Title - URL`` entry turned into a linked entry, or an empty
        string when ``markdown`` is blank.
    """
    if not markdown.strip():
        return ""

    # [N] Title - URL  ->  numbered entry whose title links to the URL
    html = re.sub(
        r'\[(\d+)\]\s*(.+?)\s*-\s*(https?://[^\s\)]+)',
        r'<div class="bib-entry"><span class="bib-number">[\1]</span> '
        r'<a href="\3">\2</a></div>',
        markdown,
    )

    # Convert any remaining **bold** sections
    html = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', html)

    return f'<div class="bibliography-content">{html}</div>'


def _convert_lists(html: str) -> str:
    """
    Convert markdown lists to HTML lists

    Lines starting with ``- `` or ``* `` become ``<ul>`` items and lines
    starting with ``N. `` become ``<ol>`` items. A non-item line indented
    deeper than the list's first item is appended to the previous item; any
    other non-item line ends the list.

    Args:
        html: Text in which inline formatting has already been converted

    Returns:
        The text with list runs replaced by ``<ul>``/``<ol>`` markup.
    """
    result = []
    open_tag = None  # 'ul' or 'ol' while inside a list, else None
    list_level = 0   # indentation of the item that opened the current list

    for line in html.split('\n'):
        stripped = line.strip()
        indent = len(line) - len(line.lstrip())

        if stripped.startswith(('- ', '* ')):
            kind, content = 'ul', stripped[2:].lstrip()
        elif _ORDERED_ITEM.match(stripped):
            kind, content = 'ol', _ORDERED_ITEM.sub('', stripped).lstrip()
        else:
            kind, content = None, ''

        if kind is not None:
            if open_tag != kind:
                # Switching between bullet and numbered items closes the
                # running list so each list gets a matching pair of tags.
                if open_tag is not None:
                    result.append(f'</{open_tag}>')
                result.append(f'<{kind}>')
                open_tag = kind
                list_level = indent
            result.append(f'<li>{content}</li>')
            continue

        if open_tag is not None:
            if stripped and indent > list_level:
                # Indented continuation: fold it into the previous item.
                if result[-1].endswith('</li>'):
                    result[-1] = f"{result[-1][:-len('</li>')]} {stripped}</li>"
                continue
            # The open tag is tracked explicitly, so the closing tag is
            # correct regardless of how many items the list has.
            result.append(f'</{open_tag}>')
            open_tag = None
            list_level = 0

        result.append(line)

    if open_tag is not None:
        result.append(f'</{open_tag}>')

    return '\n'.join(result)


def _split_table_row(stripped: str) -> list:
    """Split one pipe-delimited row (already stripped) into trimmed cells."""
    row = stripped[1:] if stripped.startswith('|') else stripped
    # The trailing pipe is optional; remove at most one so an intentionally
    # empty last cell survives.
    if row.endswith('|'):
        row = row[:-1]
    return [cell.strip() for cell in row.split('|')]


def _convert_tables(html: str) -> str:
    """
    Convert markdown tables to HTML tables

    A run of consecutive lines starting with ``|`` is one table: the first
    line is the header row, divider rows (cells made only of dashes and
    optional colons) are skipped, and every other line is a data row.

    Args:
        html: Text possibly containing pipe tables

    Returns:
        The text with each table replaced by ``<table>`` markup, one tag or
        cell per output line.
    """
    result = []
    in_table = False

    for line in html.split('\n'):
        stripped = line.strip()

        if not stripped.startswith('|'):
            if in_table:
                result.append('</tbody></table>')
                in_table = False
            result.append(line)
            continue

        cells = _split_table_row(stripped)

        if not in_table:
            # First row of a table is the header row.
            in_table = True
            result.append('<table>')
            result.append('<thead><tr>')
            result.extend(f'<th>{cell}</th>' for cell in cells)
            result.append('</tr></thead>')
            result.append('<tbody>')
        elif all(_TABLE_DIVIDER_CELL.fullmatch(cell) for cell in cells):
            # Divider row such as |---|:---:| carries no content. Checked per
            # cell so a data row that merely contains "---" is not dropped.
            continue
        else:
            result.append('<tr>')
            result.extend(f'<td>{cell}</td>' for cell in cells)
            result.append('</tr>')

    if in_table:
        result.append('</tbody></table>')

    return '\n'.join(result)


def _convert_paragraphs(html: str) -> str:
    """
    Wrap non-HTML lines in paragraph tags

    Consecutive text lines form one paragraph: ``<p>`` is prefixed to the
    first line and ``</p>`` is emitted on its own line when a blank line, a
    block-level HTML line, or the end of the input is reached.

    Args:
        html: Text in which headings, lists and tables are already HTML

    Returns:
        The text with plain-text runs wrapped in ``<p>`` ... ``</p>``.
    """
    result = []
    in_paragraph = False

    for line in html.split('\n'):
        stripped = line.strip()

        # Blank lines and block-level HTML both terminate a paragraph and
        # are passed through untouched.
        if not stripped or _BLOCK_TAG.match(stripped):
            if in_paragraph:
                result.append('</p>')
                in_paragraph = False
            result.append(line)
            continue

        if in_paragraph:
            result.append(line)
        else:
            result.append('<p>' + line)
            in_paragraph = True

    if in_paragraph:
        result.append('</p>')

    return '\n'.join(result)


def _close_sections(html: str) -> str:
    """
    Close all open section divs

    Each section is closed immediately before the next section opens, and
    the last one is closed at the end of the text.

    Args:
        html: Text in which section headings already open a section div

    Returns:
        The text with a closing ``</div>`` line added for every section.
    """
    result = []
    section_open = False

    for line in html.split('\n'):
        if _SECTION_OPEN in line:
            if section_open:
                result.append(_SECTION_CLOSE)  # Close previous section
            section_open = True
        result.append(line)

    # Close final section if still open
    if section_open:
        result.append(_SECTION_CLOSE)

    return '\n'.join(result)


def main():
    """Test the converter with a sample markdown file"""
    import sys

    if len(sys.argv) < 2:
        print("Usage: python md_to_html.py <markdown_file>")
        sys.exit(1)

    md_file = Path(sys.argv[1])
    if not md_file.exists():
        print(f"Error: File {md_file} not found")
        sys.exit(1)

    markdown_text = md_file.read_text(encoding="utf-8")
    content_html, bib_html = convert_markdown_to_html(markdown_text)

    # Only a preview is printed; this entry point is a smoke test.
    print("=== CONTENT HTML ===")
    print(content_html[:1000])
    print("\n=== BIBLIOGRAPHY HTML ===")
    print(bib_html[:500])


if __name__ == "__main__":
    main()