feat: sync full workspace including web modules, docs, and configurations to Gitea

Optimized the root .gitignore to exclude virtual environments, node modules,
and temp folders to ensure clean and lightweight version tracking.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
王冕
2026-06-09 18:12:25 +08:00
parent 351688006e
commit a27e3b8e43
1510 changed files with 162044 additions and 1517 deletions

View File

@@ -0,0 +1,446 @@
import fs from 'fs';
import path from 'path';
import { spawnSync } from 'child_process';
import { tmpdir } from 'os';
/** Upper bound for a single imported file: 30 * 1024 * 1024 (presumably bytes, i.e. 30 MiB — the enforcing code is not in this section). */
export const DOC_IMPORT_MAX_FILE_SIZE = 30 * 1024 * 1024;
/** Upper bound on how many files one import may contain (presumably per import request — confirm at the call site). */
export const DOC_IMPORT_MAX_FILE_COUNT = 30;
/** Upper bound for the combined size of one import: 150 * 1024 * 1024 (presumably bytes, i.e. 150 MiB). */
export const DOC_IMPORT_MAX_TOTAL_SIZE = 150 * 1024 * 1024;
/** Lower-case, dot-prefixed file extensions that the document import accepts. */
export const DOC_IMPORT_SUPPORTED_EXTENSIONS = new Set([
'.md',
'.txt',
'.pdf',
'.docx',
'.pptx',
'.xlsx',
'.csv',
'.json',
'.html',
'.xml',
]);
/** One way of invoking the markitdown CLI that can be probed with `spawnSync`. */
type MarkitdownCommandCandidate = {
/** Executable name or path handed to `spawnSync`. */
command: string;
/** Arguments placed before any probe flag (`['-m', 'markitdown']` for an interpreter, empty for the direct binary). */
args: string[];
/** Human-readable label for this invocation; copied into `MarkitdownResolvedCommand.commandSource`. */
commandSource: string;
};
/** Outcome of running `<command> --version` for one Python interpreter candidate. */
type PythonRuntimeProbe = {
/** Interpreter command that was probed. */
command: string;
/** True when the process could be launched and exited with status 0. */
available: boolean;
/** Trimmed version output; empty string when the interpreter is unavailable. */
versionText: string;
/** Parsed major version; 0 when unavailable or when the output could not be parsed. */
major: number;
/** Parsed minor version; 0 when unavailable or when the output could not be parsed. */
minor: number;
/** True when major.minor satisfies the minimum Python version required for markitdown. */
meetsRequirement: boolean;
};
/** Result of `resolveMarkitdownCommand`: how to invoke markitdown, or why it cannot be used. */
export type MarkitdownResolvedCommand = {
/** True only when a working CLI was found and the optional-dependency probe reported ready. */
installed: boolean;
/** Executable to spawn; omitted when no working invocation was found. */
command?: string;
/** Arguments to place before the conversion arguments; omitted when no working invocation was found. */
args?: string[];
/** Interpreter used for the dependency probe; undefined when it could not be determined. */
pythonCommand?: string;
/** Label of the invocation that worked, or 'unavailable'. */
commandSource: string;
/** Version text reported by the CLI (with 'unknown' / 'available' fallbacks); '' when unavailable. */
version: string;
/** Shell commands suggested to install or repair markitdown; empty when nothing needs fixing. */
installHints: string[];
/** User-facing description of the problem; '' when there is none. */
error: string;
};
/** Python modules that must be importable for markitdown to handle one file extension. */
type MarkitdownOptionalFeature = {
/** Dot-prefixed file extension the feature covers. */
extension: string;
/** Short feature label; the values used in this file match the extras named in the pip install hint (presumably the markitdown extra name — confirm). */
feature: string;
/** Python module names whose presence is checked with `importlib.util.find_spec`. */
modules: string[];
};
/** Minimum Python major version accepted for running markitdown. */
const MARKITDOWN_MIN_PYTHON_MAJOR = 3;
/** Minimum Python minor version accepted when the major version equals the minimum. */
const MARKITDOWN_MIN_PYTHON_MINOR = 10;
/** The standalone `markitdown` executable; tried before any `<python> -m markitdown` fallback. */
const MARKITDOWN_DIRECT_COMMAND_CANDIDATE: MarkitdownCommandCandidate = {
command: 'markitdown',
args: [],
commandSource: 'markitdown',
};
/** Interpreter names probed in this order; the first supported one becomes the preferred interpreter. */
const MARKITDOWN_PYTHON_COMMAND_CANDIDATES = [
'python3.12',
'python3.11',
'python3.10',
'python3',
'python',
];
/** Optional dependencies verified once markitdown itself responds; a missing module marks the extension as not ready. */
const MARKITDOWN_REQUIRED_OPTIONAL_FEATURES: MarkitdownOptionalFeature[] = [
{ extension: '.pdf', feature: 'pdf', modules: ['pdfminer', 'pdfplumber'] },
{ extension: '.docx', feature: 'docx', modules: ['mammoth'] },
{ extension: '.pptx', feature: 'pptx', modules: ['pptx'] },
{ extension: '.xlsx', feature: 'xlsx', modules: ['pandas', 'openpyxl'] },
];
/**
 * Pulls the major and minor version numbers out of `python --version` style output.
 *
 * @param versionText - Text expected to contain `Python <major>.<minor>` (matched case-insensitively, anywhere in the text).
 * @returns The two numbers, or `null` when the pattern does not occur.
 */
function parsePythonVersionText(versionText: string): { major: number; minor: number } | null {
  const found = /Python\s+(\d+)\.(\d+)/i.exec(versionText);
  if (found === null) {
    return null;
  }
  const [, majorDigits, minorDigits] = found;
  return { major: Number(majorDigits || 0), minor: Number(minorDigits || 0) };
}
/**
 * Tells whether a Python version reaches the minimum required for markitdown.
 *
 * A newer major version always qualifies, an older one never does, and for
 * the minimum major version the minor version decides.
 *
 * @param major - Major version number.
 * @param minor - Minor version number.
 * @returns `true` when the version is at least the configured minimum.
 */
function isPythonVersionSupported(major: number, minor: number): boolean {
  const newerMajor = major > MARKITDOWN_MIN_PYTHON_MAJOR;
  const olderMajor = major < MARKITDOWN_MIN_PYTHON_MAJOR;
  return newerMajor || (!olderMajor && minor >= MARKITDOWN_MIN_PYTHON_MINOR);
}
/**
 * Runs `<command> --version` and reports whether that interpreter exists and
 * is recent enough for markitdown.
 *
 * @param command - Interpreter name or path to try.
 * @returns A probe record. `available` is false when the process could not be
 *   started or exited with a non-zero status; an unparseable version is
 *   recorded as 0.0 and therefore does not meet the requirement.
 */
function probePythonRuntime(command: string): PythonRuntimeProbe {
  const { error, status, stdout, stderr } = spawnSync(command, ['--version'], {
    encoding: 'utf8',
    timeout: 8000,
    maxBuffer: 1024 * 1024,
  });
  const ranCleanly = !error && status === 0;
  if (!ranCleanly) {
    return { command, available: false, versionText: '', major: 0, minor: 0, meetsRequirement: false };
  }

  // Falls back to stderr when stdout is empty.
  const versionText = String(stdout || stderr || '').trim();
  const parsed = parsePythonVersionText(versionText);
  const major = parsed?.major || 0;
  const minor = parsed?.minor || 0;
  const meetsRequirement = isPythonVersionSupported(major, minor);
  return { command, available: true, versionText, major, minor, meetsRequirement };
}
/**
 * Produces the shell commands shown to the user for installing markitdown
 * together with its pdf/docx/pptx/xlsx extras.
 *
 * @param preferredPythonCommand - Interpreter to install into. When omitted
 *   (or empty) the hints first install Python 3.11 through Homebrew and then
 *   use `python3.11`.
 * @returns One pip command, preceded by the Homebrew command when no interpreter was given.
 */
function buildMarkitdownInstallHints(preferredPythonCommand?: string): string[] {
  const pipInstallFor = (python: string): string =>
    `${python} -m pip install -U 'markitdown[pdf,docx,pptx,xlsx]'`;

  return preferredPythonCommand
    ? [pipInstallFor(preferredPythonCommand)]
    : ['brew install python@3.11', pipInstallFor('python3.11')];
}
/**
 * Checks whether a markitdown invocation responds, first to `--version` and,
 * if that fails, to `--help`.
 *
 * @param candidate - Executable plus leading arguments to probe.
 * @returns `success` with a version string when either flag exits with status 0
 *   ('unknown' / 'available' are used when no version text was printed);
 *   otherwise `success: false` with the combined output of both attempts in `details`.
 */
function isCommandWorking(candidate: MarkitdownCommandCandidate): {
  success: boolean;
  version: string;
  details: string;
} {
  // Runs the candidate with one extra flag and captures whichever stream produced text.
  const runWithFlag = (flag: string): { ok: boolean; output: string } => {
    const attempt = spawnSync(candidate.command, [...candidate.args, flag], {
      encoding: 'utf8',
      timeout: 8000,
      maxBuffer: 1024 * 1024,
    });
    return {
      ok: !attempt.error && attempt.status === 0,
      output: String(attempt.stdout || attempt.stderr || '').trim(),
    };
  };

  const versionRun = runWithFlag('--version');
  if (versionRun.ok) {
    return { success: true, version: versionRun.output || 'unknown', details: '' };
  }

  const helpRun = runWithFlag('--help');
  if (helpRun.ok) {
    // NOTE(review): the version reported here is whatever the failed
    // `--version` run printed, so it may be an error message — confirm intent.
    return { success: true, version: versionRun.output || 'available', details: '' };
  }

  const details = [versionRun.output, helpRun.output].filter(Boolean).join('\n');
  return { success: false, version: '', details };
}
/**
 * Works out which Python interpreter backs a markitdown invocation, so that
 * the interpreter can later be asked which optional modules it can import.
 *
 * For `<python> -m markitdown` the command itself is the interpreter. For a
 * standalone executable the function locates it with `which` and reads the
 * interpreter from the script's shebang line.
 *
 * @param candidate - The invocation whose interpreter is wanted.
 * @returns The interpreter command or path, or `undefined` when it cannot be
 *   determined: `which` is missing or fails, the file has no shebang, or the
 *   shebang does not name a Python interpreter (for example a `/bin/sh`
 *   wrapper script).
 */
function resolveMarkitdownPythonCommand(candidate: MarkitdownCommandCandidate): string | undefined {
  if (candidate.args[0] === '-m' && candidate.args[1] === 'markitdown') {
    return candidate.command;
  }

  const whichAttempt = spawnSync('which', [candidate.command], {
    encoding: 'utf8',
    timeout: 8000,
    maxBuffer: 1024 * 1024,
  });
  if (whichAttempt.error || whichAttempt.status !== 0) {
    return undefined;
  }

  const executablePath = String(whichAttempt.stdout || '').trim().split(/\r?\n/)[0]?.trim();
  if (!executablePath || !fs.existsSync(executablePath)) {
    return undefined;
  }

  try {
    const firstLine = fs.readFileSync(executablePath, 'utf8').split(/\r?\n/, 1)[0]?.trim() || '';
    if (!firstLine.startsWith('#!')) {
      return undefined;
    }

    const shebangParts = firstLine.slice(2).trim().split(/\s+/).filter(Boolean);
    const launcher = shebangParts[0];
    if (!launcher) {
      return undefined;
    }

    // With `env`, the interpreter is the first operand after any env options
    // (such as `-S`) and NAME=value assignments, wherever `env` itself lives.
    const interpreter = path.basename(launcher) === 'env'
      ? shebangParts.slice(1).find((part) => !part.startsWith('-') && !part.includes('='))
      : launcher;

    // Anything that is not a Python interpreter cannot run the dependency
    // probe, so report "unknown" instead of handing back an unrelated program.
    if (!interpreter || !/^(python|pypy)/i.test(path.basename(interpreter))) {
      return undefined;
    }
    return interpreter;
  } catch {
    // Unreadable executable: treat the interpreter as unknown.
    return undefined;
  }
}
/**
 * Checks whether the Python modules listed in
 * `MARKITDOWN_REQUIRED_OPTIONAL_FEATURES` can be imported by the given
 * interpreter.
 *
 * The check runs a short script through `<pythonCommand> -c` that looks each
 * module up with `importlib.util.find_spec` and prints the missing ones as JSON.
 *
 * @param pythonCommand - Interpreter to inspect. When omitted there is nothing
 *   to inspect, so the probe is skipped and reported as ready.
 * @returns `ready: true` when nothing is missing. Otherwise `ready: false`
 *   with a user-facing `error`, install commands in `installHints`, and the
 *   affected extensions in `missingExtensions` (empty when the probe itself
 *   failed or printed output that could not be parsed).
 */
function probeMarkitdownOptionalDependencies(pythonCommand?: string): {
  ready: boolean;
  missingExtensions: string[];
  error: string;
  installHints: string[];
} {
  if (!pythonCommand) {
    return { ready: true, missingExtensions: [], error: '', installHints: [] };
  }

  const installHints = buildMarkitdownInstallHints(pythonCommand);
  const checksJson = JSON.stringify(MARKITDOWN_REQUIRED_OPTIONAL_FEATURES);
  // Python is indentation-sensitive: the loop body sits one level in and the
  // body of the nested `if` one level deeper, otherwise the script is rejected
  // with an IndentationError and the probe can never succeed.
  const probeScript = [
    'import importlib.util, json',
    `checks = json.loads(${JSON.stringify(checksJson)})`,
    'missing = []',
    'for item in checks:',
    "    missing_modules = [name for name in item['modules'] if importlib.util.find_spec(name) is None]",
    '    if missing_modules:',
    "        missing.append({'extension': item['extension'], 'feature': item['feature'], 'missingModules': missing_modules})",
    "print(json.dumps({'missing': missing}, ensure_ascii=False))",
  ].join('\n');

  const probeAttempt = spawnSync(pythonCommand, ['-c', probeScript], {
    encoding: 'utf8',
    timeout: 8000,
    maxBuffer: 1024 * 1024,
  });
  if (probeAttempt.error || probeAttempt.status !== 0) {
    return {
      ready: false,
      missingExtensions: [],
      error: `markitdown 依赖检测失败,请执行 ${installHints[0]} 后重试。`,
      installHints,
    };
  }

  try {
    const payload = JSON.parse(String(probeAttempt.stdout || '{}')) as {
      missing?: Array<{ extension?: string; missingModules?: string[] }>;
    };
    const missing = Array.isArray(payload?.missing) ? payload.missing : [];
    if (missing.length === 0) {
      return { ready: true, missingExtensions: [], error: '', installHints: [] };
    }

    const missingDescriptions = missing.map((item) => {
      const extension = String(item?.extension || '').trim() || 'unknown';
      const missingModules = Array.isArray(item?.missingModules) ? item.missingModules.filter(Boolean) : [];
      // The parenthesis opened before the module list is closed after it.
      return missingModules.length > 0 ? `${extension}(缺少:${missingModules.join(', ')})` : extension;
    });
    return {
      ready: false,
      missingExtensions: missing.map((item) => String(item?.extension || '').trim()).filter(Boolean),
      error: `markitdown 已安装,但以下格式依赖不完整:${missingDescriptions.join('、')}。请先安装完整依赖后再导入非 .md 文档。`,
      installHints,
    };
  } catch {
    // stdout was not the JSON document the probe script prints.
    return {
      ready: false,
      missingExtensions: [],
      error: `markitdown 依赖检测结果无法解析,请执行 ${installHints[0]} 后重试。`,
      installHints,
    };
  }
}
/** Builds the "cannot be used" result shared by every failure path of `resolveMarkitdownCommand`. */
function buildUnavailableMarkitdownResult(installHints: string[], error: string): MarkitdownResolvedCommand {
  return { installed: false, commandSource: 'unavailable', version: '', installHints, error };
}

/**
 * Builds the result for an invocation that responded, folding in the outcome
 * of the optional-dependency probe for its interpreter.
 */
function buildRespondingMarkitdownResult(
  candidate: MarkitdownCommandCandidate,
  version: string,
  pythonCommand: string | undefined,
): MarkitdownResolvedCommand {
  const dependencyProbe = probeMarkitdownOptionalDependencies(pythonCommand);
  return {
    installed: dependencyProbe.ready,
    command: candidate.command,
    args: candidate.args,
    pythonCommand,
    commandSource: candidate.commandSource,
    version,
    installHints: dependencyProbe.installHints,
    error: dependencyProbe.error,
  };
}

/**
 * Finds a usable markitdown invocation on this machine.
 *
 * Resolution order:
 * 1. the standalone `markitdown` executable;
 * 2. `<python> -m markitdown` for every interpreter in
 *    `MARKITDOWN_PYTHON_COMMAND_CANDIDATES` that meets the minimum version.
 *
 * The first invocation that responds is returned; `installed` is then true
 * only if its optional dependencies are complete. When nothing responds the
 * result explains why: no supported Python, an old package without a CLI
 * entry point, the module missing, or an unspecified failure.
 *
 * @returns The resolved invocation, or an 'unavailable' result carrying
 *   `error` and `installHints`.
 */
export function resolveMarkitdownCommand(): MarkitdownResolvedCommand {
  const directCommandResult = isCommandWorking(MARKITDOWN_DIRECT_COMMAND_CANDIDATE);
  if (directCommandResult.success) {
    return buildRespondingMarkitdownResult(
      MARKITDOWN_DIRECT_COMMAND_CANDIDATE,
      directCommandResult.version,
      resolveMarkitdownPythonCommand(MARKITDOWN_DIRECT_COMMAND_CANDIDATE),
    );
  }

  // De-duplicate the names before probing so that no interpreter is spawned twice.
  const pythonRuntimeProbes = Array.from(new Set(MARKITDOWN_PYTHON_COMMAND_CANDIDATES))
    .map((command) => probePythonRuntime(command));
  const supportedPythonRuntimes = pythonRuntimeProbes.filter((probe) => probe.available && probe.meetsRequirement);
  const preferredPythonCommand = supportedPythonRuntimes[0]?.command;

  let sawLegacyPackage = false;
  let sawModuleMissing = false;
  for (const pythonRuntime of supportedPythonRuntimes) {
    const pythonCandidate: MarkitdownCommandCandidate = {
      command: pythonRuntime.command,
      args: ['-m', 'markitdown'],
      commandSource: `${pythonRuntime.command} -m markitdown`,
    };
    const candidateResult = isCommandWorking(pythonCandidate);
    if (candidateResult.success) {
      return buildRespondingMarkitdownResult(pythonCandidate, candidateResult.version, pythonRuntime.command);
    }

    // Remember why this interpreter failed so the final message can be specific.
    const details = candidateResult.details || '';
    if (/markitdown\.__main__|cannot be directly executed/i.test(details)) {
      sawLegacyPackage = true;
    } else if (/No module named markitdown/i.test(details)) {
      sawModuleMissing = true;
    }
  }

  if (supportedPythonRuntimes.length === 0) {
    const availableVersions = pythonRuntimeProbes
      .filter((probe) => probe.available)
      .map((probe) => `${probe.command} (${probe.versionText || 'unknown'})`)
      .join(', ');
    return buildUnavailableMarkitdownResult(
      buildMarkitdownInstallHints(),
      availableVersions
        ? `markitdown 需要 Python 3.10+。当前仅检测到:${availableVersions}`
        : 'markitdown 需要 Python 3.10+。当前未检测到可用的 Python 运行时。',
    );
  }

  const installHints = buildMarkitdownInstallHints(preferredPythonCommand);
  if (sawLegacyPackage) {
    return buildUnavailableMarkitdownResult(
      installHints,
      '检测到旧版 markitdown(例如 0.0.1a1),该版本没有 CLI 入口。请在 Python 3.10+ 环境重新安装最新版。',
    );
  }
  if (sawModuleMissing) {
    return buildUnavailableMarkitdownResult(
      installHints,
      `未在 ${preferredPythonCommand || 'Python 3.10+'} 环境中检测到 markitdown,请先安装后重试。`,
    );
  }
  return buildUnavailableMarkitdownResult(installHints, 'markitdown 不可用,请安装后重试。');
}
/**
 * Converts one file to Markdown by running the markitdown CLI.
 *
 * The CLI is invoked as
 * `<command> <args...> --keep-data-uris <sourcePath> -o <temp>/output.md`;
 * the output file is read back and the temporary directory is removed
 * whether or not the conversion succeeded.
 *
 * @param params.command - Executable to spawn.
 * @param params.args - Arguments placed before the conversion arguments.
 * @param params.sourcePath - Path of the file to convert.
 * @returns `{ success: true, content }` with the Markdown text, or
 *   `{ success: false, error }` when the process could not be run, exited
 *   with a non-zero status, or produced no output file.
 */
export function convertFileToMarkdownWithMarkitdown(params: {
  command: string;
  args: string[];
  sourcePath: string;
}): { success: true; content: string } | { success: false; error: string } {
  const workDir = fs.mkdtempSync(path.join(tmpdir(), 'axhub-doc-import-'));
  const markdownPath = path.join(workDir, 'output.md');
  const failure = (error: string): { success: false; error: string } => ({ success: false, error });

  try {
    const cliArgs = params.args.concat('--keep-data-uris', params.sourcePath, '-o', markdownPath);
    const run = spawnSync(params.command, cliArgs, {
      encoding: 'utf8',
      timeout: 120000,
      maxBuffer: 1024 * 1024 * 20,
    });

    if (run.error) {
      return failure(run.error.message || 'markitdown execution failed');
    }
    if (run.status !== 0) {
      // Prefer stderr, then stdout, then the bare exit code.
      const details =
        String(run.stderr || '').trim() || String(run.stdout || '').trim() || `exit code ${run.status}`;
      return failure(`markitdown convert failed: ${details}`);
    }
    if (!fs.existsSync(markdownPath)) {
      return failure('markitdown did not produce output file');
    }
    return { success: true, content: fs.readFileSync(markdownPath, 'utf8') };
  } finally {
    fs.rmSync(workDir, { recursive: true, force: true });
  }
}