Optimized the root .gitignore to exclude virtual environments, node modules, and temp folders to ensure clean and lightweight version tracking. Co-authored-by: Cursor <cursoragent@cursor.com>
447 lines
13 KiB
TypeScript
447 lines
13 KiB
TypeScript
import fs from 'fs';
|
||
import path from 'path';
|
||
import { spawnSync } from 'child_process';
|
||
import { tmpdir } from 'os';
|
||
|
||
/**
 * Byte limit named for a single imported file (30 MiB).
 * Enforcement happens in callers that are not part of this section.
 */
export const DOC_IMPORT_MAX_FILE_SIZE = 30 * 1024 * 1024;

/** Limit named for the number of files in one import (30). */
export const DOC_IMPORT_MAX_FILE_COUNT = 30;

/** Byte limit named for the combined size of an import (150 MiB). */
export const DOC_IMPORT_MAX_TOTAL_SIZE = 150 * 1024 * 1024;

/**
 * File extensions accepted for document import.
 * Entries are lower-case and include the leading dot.
 */
export const DOC_IMPORT_SUPPORTED_EXTENSIONS = new Set([
  // Plain-text formats.
  '.md',
  '.txt',
  // Office / rich document formats.
  '.pdf',
  '.docx',
  '.pptx',
  '.xlsx',
  // Structured data and markup formats.
  '.csv',
  '.json',
  '.html',
  '.xml',
]);
|
||
|
||
/** One way of invoking markitdown that can be probed with `--version` / `--help`. */
type MarkitdownCommandCandidate = {
  /** Executable handed to `spawnSync` (e.g. `markitdown` or a Python interpreter). */
  command: string;
  /** Arguments placed before any probe or conversion flags (e.g. `['-m', 'markitdown']`). */
  args: string[];
  /** Human-readable label for this invocation, copied into the resolved result. */
  commandSource: string;
};
|
||
|
||
/** Outcome of running `<command> --version` for one Python interpreter candidate. */
type PythonRuntimeProbe = {
  /** Interpreter command that was probed. */
  command: string;
  /** False when the process could not be spawned or exited with a non-zero status. */
  available: boolean;
  /** Trimmed stdout (or stderr) of the version call; empty when unavailable. */
  versionText: string;
  /** Parsed major version; 0 when unavailable or not parseable. */
  major: number;
  /** Parsed minor version; 0 when unavailable or not parseable. */
  minor: number;
  /** Whether the parsed version satisfies the minimum Python version markitdown needs. */
  meetsRequirement: boolean;
};
|
||
|
||
/** Result of resolving how (and whether) markitdown can be run on this machine. */
export type MarkitdownResolvedCommand = {
  /** True only when a working invocation was found and the optional-dependency probe passed. */
  installed: boolean;
  /** Executable to spawn; omitted when no working invocation was found. */
  command?: string;
  /** Leading arguments for `command`; omitted when no working invocation was found. */
  args?: string[];
  /** Python interpreter associated with the invocation, when it could be determined. */
  pythonCommand?: string;
  /** Label of the chosen invocation, or `'unavailable'`. */
  commandSource: string;
  /** Version text reported by the command; empty when unavailable. */
  version: string;
  /** Shell commands suggested to the user to install or repair markitdown. */
  installHints: string[];
  /** User-facing explanation of what is wrong; empty when everything is ready. */
  error: string;
};
|
||
|
||
/** Describes the Python modules that must be importable to convert one file format. */
type MarkitdownOptionalFeature = {
  /** File extension (with leading dot) that needs the modules below. */
  extension: string;
  /** Short feature name for the format (e.g. `pdf`). */
  feature: string;
  /** Python module names whose presence is checked in the target interpreter. */
  modules: string[];
};
|
||
|
||
/** Minimum Python version (major part) accepted for running markitdown. */
const MARKITDOWN_MIN_PYTHON_MAJOR = 3;
/** Minimum Python version (minor part) accepted when the major version equals the minimum. */
const MARKITDOWN_MIN_PYTHON_MINOR = 10;

/** The standalone `markitdown` executable; this is the first invocation that gets tried. */
const MARKITDOWN_DIRECT_COMMAND_CANDIDATE: MarkitdownCommandCandidate = {
  command: 'markitdown',
  args: [],
  commandSource: 'markitdown',
};

/**
 * Python interpreter names probed, in this order, for a `-m markitdown` fallback.
 * Version-specific names come before the generic ones.
 */
const MARKITDOWN_PYTHON_COMMAND_CANDIDATES = [
  'python3.12',
  'python3.11',
  'python3.10',
  'python3',
  'python',
];

/** Per-format Python modules that have to be importable before that format is considered ready. */
const MARKITDOWN_REQUIRED_OPTIONAL_FEATURES: MarkitdownOptionalFeature[] = [
  {
    extension: '.pdf',
    feature: 'pdf',
    modules: ['pdfminer', 'pdfplumber'],
  },
  {
    extension: '.docx',
    feature: 'docx',
    modules: ['mammoth'],
  },
  {
    extension: '.pptx',
    feature: 'pptx',
    modules: ['pptx'],
  },
  {
    extension: '.xlsx',
    feature: 'xlsx',
    modules: ['pandas', 'openpyxl'],
  },
];
|
||
|
||
/**
 * Pulls the major and minor numbers out of `python --version` style output.
 *
 * @param versionText Raw text such as `Python 3.11.4` (matching is case-insensitive).
 * @returns The numeric major/minor pair, or `null` when no `Python <major>.<minor>`
 *          pattern is present in the text.
 */
function parsePythonVersionText(versionText: string): { major: number; minor: number } | null {
  const found = /Python\s+(\d+)\.(\d+)/i.exec(versionText);
  if (found === null) {
    return null;
  }

  const [, majorDigits, minorDigits] = found;
  return {
    major: Number(majorDigits || 0),
    minor: Number(minorDigits || 0),
  };
}
|
||
|
||
/**
 * Checks a Python version against the minimum version required for markitdown.
 *
 * @param major Parsed major version number.
 * @param minor Parsed minor version number.
 * @returns True when `major.minor` is at least
 *          `MARKITDOWN_MIN_PYTHON_MAJOR.MARKITDOWN_MIN_PYTHON_MINOR`.
 */
function isPythonVersionSupported(major: number, minor: number): boolean {
  const newerMajor = major > MARKITDOWN_MIN_PYTHON_MAJOR;
  const olderMajor = major < MARKITDOWN_MIN_PYTHON_MAJOR;

  // A differing major version decides the outcome on its own.
  if (newerMajor || olderMajor) {
    return newerMajor;
  }

  // Same major version: the minor version is the tie-breaker.
  return minor >= MARKITDOWN_MIN_PYTHON_MINOR;
}
|
||
|
||
/**
 * Runs `<command> --version` and reports whether that interpreter exists and is new enough.
 *
 * @param command Interpreter name or path to probe.
 * @returns A probe record. When the process cannot be spawned or exits non-zero the
 *          record is marked unavailable with zeroed version fields. When the output
 *          cannot be parsed the interpreter is reported as available with version
 *          0.0, which does not meet the requirement.
 */
function probePythonRuntime(command: string): PythonRuntimeProbe {
  const { error, status, stdout, stderr } = spawnSync(command, ['--version'], {
    encoding: 'utf8',
    timeout: 8000,
    maxBuffer: 1024 * 1024,
  });

  const ranCleanly = !error && status === 0;
  if (!ranCleanly) {
    return {
      command,
      available: false,
      versionText: '',
      major: 0,
      minor: 0,
      meetsRequirement: false,
    };
  }

  // Falls back to stderr when stdout is empty.
  const versionText = String(stdout || stderr || '').trim();
  const parsed = parsePythonVersionText(versionText);
  const major = parsed ? parsed.major : 0;
  const minor = parsed ? parsed.minor : 0;

  return {
    command,
    available: true,
    versionText,
    major,
    minor,
    meetsRequirement: isPythonVersionSupported(major, minor),
  };
}
|
||
|
||
/**
 * Builds the shell commands suggested to the user for installing markitdown with
 * the pdf/docx/pptx/xlsx extras.
 *
 * @param preferredPythonCommand Interpreter to install into. When omitted (or empty)
 *        the hints target `python3.11` and start with a Homebrew command that
 *        installs that interpreter.
 * @returns One pip command when an interpreter is given; otherwise the Homebrew
 *          command followed by the pip command.
 */
function buildMarkitdownInstallHints(preferredPythonCommand?: string): string[] {
  const interpreter = preferredPythonCommand || 'python3.11';
  const pipInstall = `${interpreter} -m pip install -U 'markitdown[pdf,docx,pptx,xlsx]'`;

  return preferredPythonCommand ? [pipInstall] : ['brew install python@3.11', pipInstall];
}
|
||
|
||
/**
 * Checks whether a markitdown invocation can actually be executed.
 *
 * The candidate is first run with `--version`; if that fails it is retried with
 * `--help`, so builds without a version flag still count as working.
 *
 * @param candidate Executable plus leading arguments to probe.
 * @returns `success` tells whether either probe exited with status 0.
 *          `version` is the `--version` output (`'unknown'` when that output is
 *          empty), or `'available'` when only `--help` succeeded. The output of a
 *          failed `--version` call is never reported as a version, because it is
 *          an error or usage message rather than a version string.
 *          `details` carries the combined output of both failed probes and is
 *          empty on success.
 */
function isCommandWorking(candidate: MarkitdownCommandCandidate): {
  success: boolean;
  version: string;
  details: string;
} {
  // Runs the candidate with one extra flag and summarises the outcome.
  const runWithFlag = (flag: string): { ok: boolean; output: string } => {
    const attempt = spawnSync(candidate.command, [...candidate.args, flag], {
      encoding: 'utf8',
      timeout: 8000,
      maxBuffer: 1024 * 1024,
    });
    return {
      ok: !attempt.error && attempt.status === 0,
      output: String(attempt.stdout || attempt.stderr || '').trim(),
    };
  };

  const versionProbe = runWithFlag('--version');
  if (versionProbe.ok) {
    return {
      success: true,
      version: versionProbe.output || 'unknown',
      details: '',
    };
  }

  const helpProbe = runWithFlag('--help');
  if (helpProbe.ok) {
    return {
      success: true,
      // `--version` failed, so whatever it printed is not a version.
      version: 'available',
      details: '',
    };
  }

  return {
    success: false,
    version: '',
    details: [versionProbe.output, helpProbe.output].filter(Boolean).join('\n'),
  };
}
|
||
|
||
/**
 * Determines which Python interpreter backs a markitdown invocation.
 *
 * For `<python> -m markitdown` candidates the interpreter is the command itself.
 * Otherwise the executable is located with `which` and its shebang is inspected:
 *   - `#!/path/to/python3.x`          -> that path
 *   - `#!/usr/bin/env [flags] python3` -> the first non-flag, non-assignment token
 *   - `#!/bin/sh` followed by `'''exec' <python> "$0" "$@"` (the wrapper used when
 *     the interpreter path does not fit in a shebang) -> the exec'd path
 *
 * The result is only returned when its basename looks like a Python interpreter.
 * Returning e.g. `/bin/sh` would make the later dependency probe fail and mark a
 * working markitdown as not installed.
 *
 * @param candidate Invocation whose interpreter should be resolved.
 * @returns The interpreter command/path, or `undefined` when it cannot be
 *          determined (`which` unavailable or unsuccessful, unreadable file,
 *          no shebang, or a non-Python interpreter).
 */
function resolveMarkitdownPythonCommand(candidate: MarkitdownCommandCandidate): string | undefined {
  if (candidate.args[0] === '-m' && candidate.args[1] === 'markitdown') {
    return candidate.command;
  }

  const whichAttempt = spawnSync('which', [candidate.command], {
    encoding: 'utf8',
    timeout: 8000,
    maxBuffer: 1024 * 1024,
  });
  if (whichAttempt.error || whichAttempt.status !== 0) {
    return undefined;
  }

  const executablePath = String(whichAttempt.stdout || '').trim().split(/\r?\n/)[0]?.trim();
  if (!executablePath || !fs.existsSync(executablePath)) {
    return undefined;
  }

  let headLines: string[];
  try {
    // Only the first two lines matter: the shebang and, for sh wrappers, the exec line.
    headLines = fs.readFileSync(executablePath, 'utf8').split(/\r?\n/, 2);
  } catch {
    return undefined;
  }

  const firstLine = headLines[0]?.trim() || '';
  if (!firstLine.startsWith('#!')) {
    return undefined;
  }

  const shebangParts = firstLine.slice(2).trim().split(/\s+/).filter(Boolean);
  const launcher = shebangParts[0];
  if (!launcher) {
    return undefined;
  }

  let interpreter: string | undefined;
  const launcherName = path.basename(launcher);
  if (launcherName === 'env') {
    // Skip env options (e.g. `-S`) and `VAR=value` assignments.
    interpreter = shebangParts.slice(1).find((token) => !token.startsWith('-') && !token.includes('='));
  } else if (launcherName === 'sh') {
    const execMatch = /^'''exec'\s+(?:"([^"]+)"|(\S+))/.exec(headLines[1]?.trim() || '');
    interpreter = execMatch?.[1] || execMatch?.[2];
  } else {
    interpreter = launcher;
  }

  if (!interpreter) {
    return undefined;
  }

  const looksLikePython = /^(?:python|pypy)\d*(?:\.\d+)*(?:\.exe)?$/i.test(path.basename(interpreter));
  return looksLikePython ? interpreter : undefined;
}
|
||
|
||
/**
 * Checks, inside the given Python interpreter, whether the modules needed for the
 * non-Markdown formats (see `MARKITDOWN_REQUIRED_OPTIONAL_FEATURES`) are importable.
 *
 * The check runs a small Python script via `-c` that uses
 * `importlib.util.find_spec` and prints a JSON summary of what is missing.
 *
 * @param pythonCommand Interpreter to probe. When omitted the probe is skipped and
 *        the dependencies are assumed to be ready.
 * @returns `ready` is true when nothing is missing (or the probe was skipped).
 *          Otherwise `error` holds a user-facing message, `missingExtensions`
 *          lists the affected extensions (empty when the probe itself failed or
 *          its output could not be parsed) and `installHints` holds the commands
 *          to fix the installation.
 */
function probeMarkitdownOptionalDependencies(pythonCommand?: string): {
  ready: boolean;
  missingExtensions: string[];
  error: string;
  installHints: string[];
} {
  if (!pythonCommand) {
    return {
      ready: true,
      missingExtensions: [],
      error: '',
      installHints: [],
    };
  }

  const checksJson = JSON.stringify(MARKITDOWN_REQUIRED_OPTIONAL_FEATURES);
  // Python is indentation-sensitive: the loop body and the nested `if` body must
  // each be indented one level deeper than their header line.
  const probeScript = [
    'import importlib.util, json',
    // The JSON text is embedded as a double-quoted string literal and decoded in Python.
    `checks = json.loads(${JSON.stringify(checksJson)})`,
    'missing = []',
    'for item in checks:',
    "    missing_modules = [name for name in item['modules'] if importlib.util.find_spec(name) is None]",
    '    if missing_modules:',
    "        missing.append({'extension': item['extension'], 'feature': item['feature'], 'missingModules': missing_modules})",
    "print(json.dumps({'missing': missing}, ensure_ascii=False))",
  ].join('\n');

  const probeAttempt = spawnSync(pythonCommand, ['-c', probeScript], {
    encoding: 'utf8',
    timeout: 8000,
    maxBuffer: 1024 * 1024,
  });

  const installHints = buildMarkitdownInstallHints(pythonCommand);

  if (probeAttempt.error || probeAttempt.status !== 0) {
    return {
      ready: false,
      missingExtensions: [],
      error: `markitdown 依赖检测失败,请执行 ${installHints[0]} 后重试。`,
      installHints,
    };
  }

  try {
    const payload = JSON.parse(String(probeAttempt.stdout || '{}')) as {
      missing?: Array<{ extension?: string; missingModules?: string[] }>;
    };
    const missing = Array.isArray(payload?.missing) ? payload.missing : [];
    if (missing.length === 0) {
      return {
        ready: true,
        missingExtensions: [],
        error: '',
        installHints: [],
      };
    }

    // One entry per affected extension, e.g. ".pdf(缺少:pdfminer)".
    const missingDescriptions = missing.map((item) => {
      const extension = String(item?.extension || '').trim() || 'unknown';
      const missingModules = Array.isArray(item?.missingModules) ? item.missingModules.filter(Boolean) : [];
      return `${extension}${missingModules.length > 0 ? `(缺少:${missingModules.join(', ')})` : ''}`;
    });

    return {
      ready: false,
      missingExtensions: missing.map((item) => String(item?.extension || '').trim()).filter(Boolean),
      error: `markitdown 已安装,但以下格式依赖不完整:${missingDescriptions.join('、')}。请先安装完整依赖后再导入非 .md 文档。`,
      installHints,
    };
  } catch {
    // stdout was not valid JSON.
    return {
      ready: false,
      missingExtensions: [],
      error: `markitdown 依赖检测结果无法解析,请执行 ${installHints[0]} 后重试。`,
      installHints,
    };
  }
}
|
||
|
||
/**
 * Works out how markitdown can be invoked on this machine.
 *
 * Resolution order:
 *   1. The standalone `markitdown` executable.
 *   2. `<python> -m markitdown` for each available interpreter that meets the
 *      minimum Python version, in candidate-list order.
 *
 * When an invocation works, the optional format dependencies are probed and
 * `installed` reflects that probe. When nothing works, the result explains why:
 * no suitable Python, a legacy markitdown package without a CLI entry point, a
 * missing module, or a generic failure. Install hints are included in each case.
 *
 * @returns The resolved invocation or an `'unavailable'` result with an error message.
 */
export function resolveMarkitdownCommand(): MarkitdownResolvedCommand {
  const direct = MARKITDOWN_DIRECT_COMMAND_CANDIDATE;
  const directCheck = isCommandWorking(direct);
  if (directCheck.success) {
    const pythonCommand = resolveMarkitdownPythonCommand(direct);
    const directDependencies = probeMarkitdownOptionalDependencies(pythonCommand);
    return {
      installed: directDependencies.ready,
      command: direct.command,
      args: direct.args,
      pythonCommand,
      commandSource: direct.commandSource,
      version: directCheck.version,
      installHints: directDependencies.installHints,
      error: directDependencies.error,
    };
  }

  // Probe every interpreter candidate, keeping the first probe per command name.
  const seenCommands = new Set<string>();
  const runtimeProbes = MARKITDOWN_PYTHON_COMMAND_CANDIDATES
    .map((command) => probePythonRuntime(command))
    .filter((probe) => {
      if (seenCommands.has(probe.command)) {
        return false;
      }
      seenCommands.add(probe.command);
      return true;
    });
  const usableRuntimes = runtimeProbes.filter((probe) => probe.available && probe.meetsRequirement);
  const preferredPythonCommand = usableRuntimes[0]?.command;

  // Shared shape of every "could not resolve" answer.
  const unavailable = (error: string, hintCommand?: string): MarkitdownResolvedCommand => ({
    installed: false,
    commandSource: 'unavailable',
    version: '',
    installHints: buildMarkitdownInstallHints(hintCommand),
    error,
  });

  let legacyPackageSeen = false;
  let moduleMissingSeen = false;

  for (const { command } of usableRuntimes) {
    const moduleCandidate: MarkitdownCommandCandidate = {
      command,
      args: ['-m', 'markitdown'],
      commandSource: `${command} -m markitdown`,
    };
    const moduleCheck = isCommandWorking(moduleCandidate);

    if (!moduleCheck.success) {
      // Remember why this interpreter failed so the final message can be specific.
      const failureText = moduleCheck.details || '';
      if (/markitdown\.__main__|cannot be directly executed/i.test(failureText)) {
        legacyPackageSeen = true;
      } else if (/No module named markitdown/i.test(failureText)) {
        moduleMissingSeen = true;
      }
      continue;
    }

    const moduleDependencies = probeMarkitdownOptionalDependencies(command);
    return {
      installed: moduleDependencies.ready,
      command: moduleCandidate.command,
      args: moduleCandidate.args,
      pythonCommand: command,
      commandSource: moduleCandidate.commandSource,
      version: moduleCheck.version,
      installHints: moduleDependencies.installHints,
      error: moduleDependencies.error,
    };
  }

  if (usableRuntimes.length === 0) {
    const detectedVersions = runtimeProbes
      .filter((probe) => probe.available)
      .map((probe) => `${probe.command} (${probe.versionText || 'unknown'})`)
      .join(', ');
    return unavailable(
      detectedVersions
        ? `markitdown 需要 Python 3.10+。当前仅检测到:${detectedVersions}`
        : 'markitdown 需要 Python 3.10+。当前未检测到可用的 Python 运行时。',
    );
  }

  if (legacyPackageSeen) {
    return unavailable(
      '检测到旧版 markitdown(例如 0.0.1a1),该版本没有 CLI 入口。请在 Python 3.10+ 环境重新安装最新版。',
      preferredPythonCommand,
    );
  }

  if (moduleMissingSeen) {
    return unavailable(
      `未在 ${preferredPythonCommand || 'Python 3.10+'} 环境中检测到 markitdown,请先安装后重试。`,
      preferredPythonCommand,
    );
  }

  return unavailable('markitdown 不可用,请安装后重试。', preferredPythonCommand);
}
|
||
|
||
/**
 * Converts one file to Markdown by running markitdown synchronously.
 *
 * The command is invoked as
 * `<command> ...args --keep-data-uris <sourcePath> -o <tmp>/output.md`
 * with a 120 s timeout. The output is written into a fresh temporary directory,
 * which is removed again before the function returns.
 *
 * @param params.command Executable to spawn.
 * @param params.args Leading arguments for the executable (e.g. `['-m', 'markitdown']`).
 * @param params.sourcePath Path of the document to convert.
 * @returns `{ success: true, content }` with the Markdown text, or
 *          `{ success: false, error }` when spawning fails, the process exits
 *          non-zero, or no output file was produced.
 */
export function convertFileToMarkdownWithMarkitdown(params: {
  command: string;
  args: string[];
  sourcePath: string;
}): { success: true; content: string } | { success: false; error: string } {
  const workDir = fs.mkdtempSync(path.join(tmpdir(), 'axhub-doc-import-'));
  const markdownPath = path.join(workDir, 'output.md');
  const failure = (error: string): { success: false; error: string } => ({ success: false, error });

  try {
    const cliArgs = [...params.args, '--keep-data-uris', params.sourcePath, '-o', markdownPath];
    const run = spawnSync(params.command, cliArgs, {
      encoding: 'utf8',
      timeout: 120000,
      maxBuffer: 1024 * 1024 * 20,
    });

    // Spawn-level failure (missing executable, timeout, ...).
    if (run.error) {
      return failure(run.error.message || 'markitdown execution failed');
    }

    if (run.status !== 0) {
      // Prefer stderr, then stdout, then the bare exit code.
      const errText = String(run.stderr || '').trim();
      const outText = String(run.stdout || '').trim();
      const details = errText || outText || `exit code ${run.status}`;
      return failure(`markitdown convert failed: ${details}`);
    }

    if (!fs.existsSync(markdownPath)) {
      return failure('markitdown did not produce output file');
    }

    return { success: true, content: fs.readFileSync(markdownPath, 'utf8') };
  } finally {
    fs.rmSync(workDir, { recursive: true, force: true });
  }
}
|