import tempfile
import dataclasses

from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from marker.config.parser import ConfigParser

import cocoindex


class PdfToMarkdown(cocoindex.op.FunctionSpec):
    """Convert a PDF to markdown."""


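# The executor class below does the actual work declared by the PdfToMarkdown spec.
# In the decorator, gpu=True marks the op as GPU-bound, cache=True caches results
# across runs, and bumping behavior_version invalidates previously cached results.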
@cocoindex.op.executor_class(gpu=True, cache=True, behavior_version=1)
class PdfToMarkdownExecutor:
    """Executor for PdfToMarkdown."""

    spec: PdfToMarkdown
    _converter: PdfConverter

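    # prepare() is the one-time setup hook: marker's models are loaded here once
    # and reused for every document instead of being reloaded per call.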
    def prepare(self):
        config_parser = ConfigParser({})
        self._converter = PdfConverter(
            create_model_dict(), config=config_parser.generate_config_dict()
        )

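    # Marker's converter takes a file path, so the incoming PDF bytes are written to a
    # temporary file first (re-opening it by name while still open works on POSIX systems).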
    def __call__(self, content: bytes) -> str:
        with tempfile.NamedTemporaryFile(delete=True, suffix=".pdf") as temp_file:
            temp_file.write(content)
            temp_file.flush()
            text, _, _ = text_from_rendered(self._converter(temp_file.name))
            return text


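# The dataclasses below define the structure the LLM is asked to fill in;
# ModuleInfo is used as the output_type for ExtractByLlm in the flow further down.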
@dataclasses.dataclass
class ArgInfo:
    """Information about an argument of a method."""

    name: str
    description: str


@dataclasses.dataclass
class MethodInfo:
    """Information about a method."""

    name: str
    args: list[ArgInfo]
    description: str


@dataclasses.dataclass
class ClassInfo:
    """Information about a class."""

    name: str
    description: str
    methods: list[MethodInfo]


@dataclasses.dataclass
class ModuleInfo:
    """Information about a Python module."""

    title: str
    description: str
    classes: list[ClassInfo]
    methods: list[MethodInfo]


@dataclasses.dataclass
class ModuleSummary:
    """Summary info about a Python module."""

    num_classes: int
    num_methods: int


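# A plain function decorated with @cocoindex.op.function() becomes a CocoIndex op;
# its type hints describe the input and output schema.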
@cocoindex.op.function()
def summarize_module(module_info: ModuleInfo) -> ModuleSummary:
    """Summarize a Python module."""
    return ModuleSummary(
        num_classes=len(module_info.classes),
        num_methods=len(module_info.methods),
    )


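# The flow below ties everything together: read PDFs from the local "manuals"
# directory, convert each to Markdown, extract a structured ModuleInfo with an LLM,
# summarize it, and export the collected rows to Postgres.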
@cocoindex.flow_def(name="ManualExtraction")
def manual_extraction_flow(
    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
):
    """
    Define an example flow that converts PDF manuals to Markdown and extracts
    Python module information from them.
    """
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="manuals", binary=True)
    )

    modules_index = data_scope.add_collector()

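    # For each document: PDF bytes -> Markdown -> structured ModuleInfo -> summary.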
    with data_scope["documents"].row() as doc:
        doc["markdown"] = doc["content"].transform(PdfToMarkdown())
        doc["module_info"] = doc["markdown"].transform(
            cocoindex.functions.ExtractByLlm(
                llm_spec=cocoindex.LlmSpec(
                    api_type=cocoindex.LlmApiType.BEDROCK,
                    model="anthropic.claude-3-haiku-20240307-v1:0",
                ),
                # Replace llm_spec with one of the specs below to use a different provider:
                # OpenAI:
                # llm_spec=cocoindex.LlmSpec(
                #     api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
                # Gemini:
                # llm_spec=cocoindex.LlmSpec(
                #     api_type=cocoindex.LlmApiType.GEMINI, model="gemini-2.0-flash"),
                # Anthropic:
                # llm_spec=cocoindex.LlmSpec(
                #     api_type=cocoindex.LlmApiType.ANTHROPIC, model="claude-3-5-sonnet-latest"),
                # Ollama:
                # llm_spec=cocoindex.LlmSpec(
                #     api_type=cocoindex.LlmApiType.OLLAMA, model="llama3.2"),
                output_type=ModuleInfo,
                instruction="Please extract Python module information from the manual.",
            )
        )
        doc["module_summary"] = doc["module_info"].transform(summarize_module)
        modules_index.collect(
            filename=doc["filename"],
            module_info=doc["module_info"],
            module_summary=doc["module_summary"],
        )

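    # Write the collected rows into the Postgres table "modules_info", keyed by filename.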
    modules_index.export(
        "modules",
        cocoindex.targets.Postgres(table_name="modules_info"),
        primary_key_fields=["filename"],
    )
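
# To run the flow (an assumption for illustration: this file is saved as main.py and
# the CocoIndex CLI is installed; exact commands may differ between versions):
#   cocoindex setup main.py    # create the target Postgres table
#   cocoindex update main.py   # build / refresh the extracted index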