Update docs

snexus · Oct 8, 2024 · d2453ba · d2453ba
1 parent f8b1350
commit d2453ba
Show file tree

Hide file tree

Showing 4 changed files with 18 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -16,6 +16,8 @@ The purpose of this package is to offer a convenient question-answering (RAG) sy
     * Other common formats are supported by `Unstructured` pre-processor:
         * List of formats see [here](https://unstructured-io.github.io/unstructured/core/partition.html).
 
+* Support for table parsing via open-source gmft (https://github.com/conjuncts/gmft) or Azure Document Intelligence.
+
 * Supports multiple collections of documents, and filtering the results by a collection.
 
 * An ability to update the embeddings incrementally, without a need to re-index the entire document base.

diff --git a/docs/index.rst b/docs/index.rst
@@ -19,6 +19,8 @@ Features
     * Other common formats are supported by `Unstructured` pre-processor:
         * List of formats https://unstructured-io.github.io/unstructured/core/partition.html
 
+* Support for table parsing via open-source gmft (https://github.com/conjuncts/gmft) or Azure Document Intelligence.
+
 * Supports multiple collections of documents, and filtering the results by a collection.
 
 * An ability to update the embeddings incrementally, without a need to re-index the entire document base.

diff --git a/sample_templates/generic/config_template.yaml b/sample_templates/generic/config_template.yaml
@@ -31,6 +31,19 @@ embeddings:
         merge_sections: False # Merge # headings if possible, can be turned on and off depending on document structure
         remove_images: True # Remove image links
 
+    # Optional setting
+    pdf_table_parser: gmft # azuredoc
+
+    # Optional setting
+    pdf_image_parser:
+        image_parser: gemini-1.5-pro # gemini-1.5-flash
+        system_instructions: |
+            You are a research assistant. You analyze the image to extract detailed information. Response must be a Markdown string in the following format:
+            - First line is a heading with image caption, starting with '# '
+            - Second line is empty
+            - From the third line on - detailed data points and related metadata, extracted from the image, in Markdown format. Don't use Markdown tables.
+
+    
     passage_prefix: "passage: " # Often, specific prefix needs to be included in the source text, for embedding models to work properly
     label: "documment-collection-1" # Add a label to the current collection
 

diff --git a/sample_templates/test-templates/pdf_library.yaml b/sample_templates/test-templates/pdf_library.yaml
@@ -17,7 +17,7 @@ embeddings:
       - epub
       - md
       - pdf
-    pdf_table_parser: azuredoc # gmft
+    pdf_table_parser: azuredoc # gmft # azuredoc # gmft
     # pdf_image_parser:
     #     image_parser: gemini-1.5-pro # gemini-1.5-flash
     #     system_instructions: |