Skip to content

Commit

Permalink
feat: type strategy output (#216)
Browse files Browse the repository at this point in the history
* feat: create first format modules

* add: example file

* add: structured output formatter

* fix: all parsers outputs list of elements & compatibility formatters

* feat: new basemodel for document

* add: structured output

* fix: test

* fix: add uncategorized text handling

* add: skip on flaky pdf

* add: section block

* fix: change load logic & create page element

* fix: add pages

* add: split onnxtr det and reco

* feat: Doctr in MegaParse

* fix : Update ReadMe

* fix: add config as constructor parameters

* add: to_numpy to bbox
  • Loading branch information
chloedia authored Jan 16, 2025
1 parent a2170d7 commit deb8765
Show file tree
Hide file tree
Showing 20 changed files with 994 additions and 556 deletions.
15 changes: 3 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,34 +41,25 @@ pip install megaparse

4. If you are on a Mac, you also need to install libmagic: ```brew install libmagic```


Use MegaParse as is:
```python
from megaparse import MegaParse
from langchain_openai import ChatOpenAI
from megaparse.parser.unstructured_parser import UnstructuredParser

parser = UnstructuredParser()
megaparse = MegaParse(parser)
megaparse = MegaParse()
response = megaparse.load("./test.pdf")
print(response)
megaparse.save("./test.md")
```

### Use MegaParse Vision

* Change the parser to MegaParseVision

```python
from megaparse import MegaParse
from langchain_openai import ChatOpenAI
from megaparse.parser.megaparse_vision import MegaParseVision

model = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY")) # type: ignore
parser = MegaParseVision(model=model)
megaparse = MegaParse(parser)
response = megaparse.load("./test.pdf")
response = parser.convert("./test.pdf")
print(response)
megaparse.save("./test.md")

```
**Note**: The models supported by MegaParse Vision are multimodal ones such as Claude 3.5, Claude 4, GPT-4o and GPT-4.
Expand Down
4 changes: 2 additions & 2 deletions benchmark/process_single_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ async def process_file(megaparse: MegaParse, file_path: str | Path):


async def test_process_file(file: str | Path):
parser = UnstructuredParser(strategy=StrategyEnum.HI_RES)
megaparse = MegaParse(parser=parser)
# parser = UnstructuredParser(strategy=StrategyEnum.HI_RES)
megaparse = MegaParse()
task = []
for _ in range(N_TRY):
task.append(process_file(megaparse, file))
Expand Down
3 changes: 2 additions & 1 deletion evaluations/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from megaparse.parser.llama import LlamaParser
from megaparse.parser.megaparse_vision import MegaParseVision
from megaparse.parser.unstructured_parser import UnstructuredParser
from megaparse_sdk.schema.parser_config import StrategyEnum

if __name__ == "__main__":
print("---Launching evaluations script---")
Expand All @@ -29,7 +30,7 @@

for method, parser in parser_dict.items():
print(f"Method: {method}")
megaparse = MegaParse(parser=parser)
megaparse = MegaParse()
result = megaparse.load(file_path=base_pdf_path)
score_dict[method] = difflib.SequenceMatcher(None, base_md, result).ratio()
print(f"Score for method {method}: {score_dict[method]}")
Expand Down
22 changes: 10 additions & 12 deletions libs/megaparse/src/megaparse/api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,16 +89,16 @@ async def parse_file(
else:
raise HTTPModelNotSupported()

parser_config = ParseFileConfig(
method=method,
strategy=strategy,
model=model if model and check_table else None,
language=language,
parsing_instruction=parsing_instruction,
)
# parser_config = ParseFileConfig( #FIXME
# method=method,
# strategy=strategy,
# llm_model_name=SupportedModel(model_name) if model_name and check_table else None,
# language=language,
# parsing_instruction=parsing_instruction,
# )
try:
parser = parser_builder.build(parser_config)
megaparse = MegaParse(parser=parser)
# parser = parser_builder.build(parser_config)
megaparse = MegaParse()
if not file.filename:
raise HTTPFileNotFound("No filename provided")
_, extension = os.path.splitext(file.filename)
Expand Down Expand Up @@ -136,9 +136,7 @@ async def upload_url(
with tempfile.NamedTemporaryFile(delete=False, suffix="pdf") as temp_file:
temp_file.write(response.content)
try:
megaparse = MegaParse(
parser=UnstructuredParser(strategy=StrategyEnum.AUTO)
)
megaparse = MegaParse()
result = await megaparse.aload(temp_file.name)
return {"message": "File parsed successfully", "result": result}
except ParsingException:
Expand Down
17 changes: 12 additions & 5 deletions libs/megaparse/src/megaparse/configs/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ class TextDetConfig(BaseModel):


class AutoStrategyConfig(BaseModel):
auto_page_threshold: float = 0.6
auto_document_threshold: float = 0.2
page_threshold: float = 0.6
document_threshold: float = 0.2


class TextRecoConfig(BaseModel):
Expand All @@ -29,6 +29,14 @@ class DeviceEnum(str, Enum):
COREML = "coreml"


# Pydantic model grouping the doctr-related settings; MegaParseConfig exposes
# it as its `doctr_config` field with a default-constructed instance.
class DoctrConfig(BaseModel):
# Three boolean switches, all off (False) by default.
# NOTE(review): names match docTR/onnxtr predictor options of the same name —
# presumably forwarded to the predictor; confirm in the parser that consumes this.
straighten_pages: bool = False
detect_orientation: bool = False
detect_language: bool = False
# Nested text-detection / text-recognition settings; each defaults to a
# default-constructed instance of its config model.
text_det_config: TextDetConfig = TextDetConfig()
text_reco_config: TextRecoConfig = TextRecoConfig()


class MegaParseConfig(BaseSettings):
"""
Configuration for Megaparse.
Expand All @@ -41,7 +49,6 @@ class MegaParseConfig(BaseSettings):
extra="ignore",
use_enum_values=True,
)
text_det_config: TextDetConfig = TextDetConfig()
text_reco_config: TextRecoConfig = TextRecoConfig()
auto_parse_config: AutoStrategyConfig = AutoStrategyConfig()
doctr_config: DoctrConfig = DoctrConfig()
auto_config: AutoStrategyConfig = AutoStrategyConfig()
device: DeviceEnum = DeviceEnum.CPU
17 changes: 8 additions & 9 deletions libs/megaparse/src/megaparse/examples/parse_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from megaparse.parser.doctr_parser import DoctrParser
from megaparse.parser.unstructured_parser import UnstructuredParser
from megaparse_sdk.schema.extensions import FileExtension
from megaparse_sdk.schema.parser_config import StrategyEnum
from pydantic import BaseModel, Field


Expand All @@ -22,18 +23,16 @@ class MyCustomFormat(BaseModel):
solution: str = Field(description="The solution statement.")


async def main():
# Parse a file
parser = DoctrParser()
model = ChatOpenAI(name="gpt-4o")
formatter_1 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat)
def main():
# model = ChatOpenAI(name="gpt-4o")
# formatter_1 = CustomStructuredFormatter(model=model, output_model=MyCustomFormat)

megaparse = MegaParse(ocr_parser=parser, formatters=[formatter_1])
megaparse = MegaParse()

file_path = Path("./tests/pdf/sample_pdf.pdf")
result = await megaparse.aload(file_path=file_path)
file_path = Path("./tests/pdf/native/0168011.pdf")
result = megaparse.load(file_path=file_path)
print(result)


if __name__ == "__main__":
asyncio.run(main())
main()
Loading

0 comments on commit deb8765

Please sign in to comment.