# Databricks notebook source
# MAGIC %md
# MAGIC # Advanced Parsing & Chunking
# MAGIC
# MAGIC In order to build better query engines, we need to improve our extraction and chunking process:
# MAGIC - Are we extracting all the information coherently?
# MAGIC - Are we splitting the document into sensible chunks?
# MAGIC - What size chunks do we need to make sure that we can fit our model context and provide sufficient extracts?
# MAGIC
# MAGIC There are two steps to the process: Parse and Chunk. \
# MAGIC With Parse we need to extract all the text and associated metadata that we can \
# MAGIC With Chunk we take the parsed output and break it down into digestible sections for LLM prompting
# MAGIC
# MAGIC The default methods are naive and tend to split just on character limits or words
# MAGIC
# MAGIC We will leverage the `unstructured` library, but there are many other options out there
# MAGIC
# COMMAND ----------
# MAGIC %sh
# MAGIC # we need to do this for poppler to work in many cases
# MAGIC apt-get install -y poppler-utils
# COMMAND ----------
# MAGIC %pip install pymupdf4llm databricks-langchain llama_index==0.12.12 langchain==0.3.14 langchain-community==0.3.14 llama-index-llms-langchain llama-index-embeddings-langchain unstructured[pdf,txt]==0.16.5 databricks-vectorsearch
# MAGIC dbutils.library.restartPython()
# COMMAND ----------
# DBTITLE 1,Setup
# MAGIC %run ./utils
# COMMAND ----------
# DBTITLE 1,Config
import os
from langchain.document_loaders import PyMuPDFLoader
sample_file_to_load = f'/Volumes/{db_catalog}/{db_schema}/{db_volume}/2302.06476.pdf'
print(f'We will use {sample_file_to_load} to review chunking - open it alongside to see how the different algorithms work')
print('You can access it here: https://arxiv.org/pdf/2302.06476.pdf')
# COMMAND ----------
# MAGIC %md
# MAGIC # Basic File Loading
# MAGIC We will just use the basic pymupdf loader for this stage. \
# MAGIC The load_and_split function will handle all the config.
# COMMAND ----------
loader = PyMuPDFLoader(sample_file_to_load)
docu_split = loader.load_and_split()
docu_split
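# COMMAND ----------
# A quick sanity check against the chunk size question from the intro: how big
# are the chunks the default splitter produced? This just reuses docu_split.
chunk_lengths = [len(doc.page_content) for doc in docu_split]
print(f'{len(chunk_lengths)} chunks, from {min(chunk_lengths)} to {max(chunk_lengths)} characters')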
# COMMAND ----------
# We can see that the first page has all been concatenated into a single chunk
# If you search for "reasonable performance" you will see that the footer has been merged into the paragraph
Intro = docu_split[0].page_content
Intro
# COMMAND ----------
# It looks like we have the last bit of a paragraph and the footer from page 1 here
Weird_snippet = docu_split[1].page_content
Weird_snippet
# COMMAND ----------
# Our table has been partially picked up, with \n separations
# We probably want the table with descriptor as one chunk and the rest split out
Table = docu_split[36].page_content
Table
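# COMMAND ----------
# load_and_split defaults to a character-based splitter. As a sketch, we can set
# the naive split parameters explicitly and compare against the defaults above.
# The chunk_size and chunk_overlap values here are illustrative, not tuned.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
manual_split = loader.load_and_split(text_splitter=text_splitter)
print(f'{len(manual_split)} chunks with explicit settings vs {len(docu_split)} with defaults')
manual_split[0].page_content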
# COMMAND ----------
# MAGIC %md
# MAGIC ## Manually loading and parsing the PDF
# MAGIC
# MAGIC Exploring PDF Parse Primitives \
# MAGIC We could experiment with using the raw PDF parse primitives, but this will be slow
# COMMAND ----------
import fitz
doc = fitz.open(sample_file_to_load)
for page in doc:
    page_dict = page.get_text("dict")
    blocks = page_dict["blocks"]
    print(blocks)
    break
# COMMAND ----------
# MAGIC %md
# MAGIC We can see that raw PyMuPDF has a lot more information stored on each text block \
# MAGIC We have information on the location of the text, the fonts, and the font sizes
# COMMAND ----------
# let's see what is in these objects
print(page_dict.keys())
# let's see how many blocks there are:
print(len(page_dict['blocks']))
# let's see what is in the blocks
print(page_dict['blocks'])
# COMMAND ----------
# Title
page_dict['blocks'][0]
# COMMAND ----------
# First Line authors
page_dict['blocks'][1]
# COMMAND ----------
# 2nd Line authors
page_dict['blocks'][2]
# COMMAND ----------
# The image
page_dict['blocks'][5]
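# COMMAND ----------
# A minimal sketch of putting the span metadata to use: each text span carries a
# font size, so one rough heuristic is to flag larger fonts as potential
# headings. The 14pt threshold here is an illustrative assumption, not a rule.
for block in page_dict['blocks']:
    if block['type'] != 0:  # type 0 is a text block, type 1 is an image
        continue
    for line in block['lines']:
        for span in line['spans']:
            if span['size'] > 14:
                print(f"Possible heading ({span['size']:.1f}pt): {span['text']}")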
# COMMAND ----------
# MAGIC %md
# MAGIC What will it take to keep the context info and make use of it?
# MAGIC Depending on our docs, we will have to write custom logic to be able to parse and understand the structure of the document
# MAGIC
# MAGIC See [PyMuPDF Docs](https://pymupdf.readthedocs.io/en/latest/tutorial.html) for extra details on how to parse
# MAGIC
# MAGIC Alternative methods:
# MAGIC - Use a document scanning model e.g. LayoutLM
# MAGIC - Use a PDF to HTML converter then parse the HTML tags
# MAGIC - e.g. \<p>, \<h1>, \<h2> etc. Each PDF to HTML converter would work a bit differently though
# MAGIC
# MAGIC With our improved parser, we could then:
# MAGIC - write it as a pyspark pandas_udf and parse the pdf docs to a standard Delta table that we could then embed with Databricks VectorSearch (a rough sketch follows below)
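# COMMAND ----------
# DBTITLE 1,Sketch of a pandas_udf parser
# A rough sketch of the pandas_udf route, assuming the pdfs are loaded as binary
# content (e.g. via spark.read.format('binaryFile')). Illustrative scaffolding
# only - a real parser would carry over the custom block logic from above.
import fitz
import pandas as pd
from pyspark.sql.functions import pandas_udf

@pandas_udf('string')
def parse_pdf_bytes(pdf_contents: pd.Series) -> pd.Series:
    def parse_one(raw_bytes):
        # open from bytes rather than a path and join the page-level text
        with fitz.open(stream=raw_bytes, filetype='pdf') as pdf_doc:
            return '\n'.join(page.get_text() for page in pdf_doc)
    return pdf_contents.apply(parse_one)

# hypothetical usage:
# df = spark.read.format('binaryFile').load(f'/Volumes/{db_catalog}/{db_schema}/{db_volume}/')
# display(df.withColumn('text', parse_pdf_bytes('content')))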
# COMMAND ----------
# MAGIC %md
# MAGIC # Advanced Parsing of PDFs
# MAGIC We can try newer, more advanced parsers instead of manual coding
# MAGIC
# MAGIC Unstructured is one option. The OSS Unstructured Library has two modes of operation \
# MAGIC A basic parse that reads the raw pdf structure, analyses it for headings, paragraphs etc then tries to group them logically \
# MAGIC An OCR mode that applies a computer vision model to help with data extraction.
# MAGIC - nltk is required and libs should be pre-installed
# MAGIC - OCR extraction involves correctly setting up computer vision (PyTorch) based libraries
# MAGIC See [Unstructured Docs](https://unstructured-io.github.io/unstructured/installation/full_installation.html) for more information on installing
# COMMAND ----------
# MAGIC %md
# MAGIC ## Unstructured PDF Reader
# MAGIC
# MAGIC Let's use the reader on its own first to see what it extracts, before we integrate it with Llama_index to parse
# COMMAND ----------
from unstructured.partition.pdf import partition_pdf
from collections import Counter
# COMMAND ----------
elements = partition_pdf(sample_file_to_load)
# You can see that the underlying structures have been classified
display(Counter(type(element) for element in elements))
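# COMMAND ----------
# partition_pdf also takes a strategy argument ('auto' by default, or 'fast',
# 'hi_res', 'ocr_only'). 'fast' skips the vision model entirely; 'hi_res' uses
# it for layout detection. A quick comparison with the lightweight option:
elements_fast = partition_pdf(sample_file_to_load, strategy='fast')
display(Counter(type(element) for element in elements_fast))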
# COMMAND ----------
# Authors on the front page are appearing as Title sections.
display(*[(type(element), element.text) for element in elements[0:13]])
# COMMAND ----------
# Sections are being extracted as narrative text
display(*[(type(element), element.text) for element in elements[400:410]])
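# COMMAND ----------
# Unstructured can also regroup the classified elements into coherent chunks.
# A minimal sketch with chunk_by_title, which starts a new chunk at each Title
# element - the character limits below are illustrative assumptions.
from unstructured.chunking.title import chunk_by_title

title_chunks = chunk_by_title(elements, max_characters=1500, combine_text_under_n_chars=200)
print(f'{len(elements)} elements grouped into {len(title_chunks)} chunks')
print(title_chunks[0].text)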
# COMMAND ----------
# MAGIC %md
# MAGIC # Using Unstructured with Llama_index
# MAGIC
# MAGIC Improving the way we parse by adding more custom logic is one route to better performance \
# MAGIC Compared with 2023, models are increasingly able to digest weird chunks and disjointed paragraphs though. \
# MAGIC
# MAGIC Another route to better performance is to leverage more intelligent structuring of chunks with a library like Llama_index
# MAGIC
# MAGIC Llama Index can structure chunks (`Nodes` in Llama_index jargon) so that it has an understanding of their spatial relationships \
# MAGIC See: [Llama Index Types](https://docs.llamaindex.ai/en/stable/module_guides/indexing/index_guide/)
# COMMAND ----------
# DBTITLE 1,Setting Up Llama_index default models
from databricks_langchain import ChatDatabricks, DatabricksEmbeddings
from llama_index.core import Settings
from llama_index.llms.langchain import LangChainLLM
from llama_index.embeddings.langchain import LangchainEmbedding
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
import nltk
nltk.download('averaged_perceptron_tagger')
model_name = 'databricks-meta-llama-3-3-70b-instruct'
embedding_model = 'databricks-gte-large-en'
llm_model = ChatDatabricks(
    target_uri='databricks',
    endpoint=model_name,
    temperature=0.1
)
embeddings = DatabricksEmbeddings(endpoint=embedding_model)
llama_index_chain = LangChainLLM(llm=llm_model)
llama_index_embeddings = LangchainEmbedding(embeddings)
Settings.llm = llama_index_chain
Settings.embed_model = llama_index_embeddings
# COMMAND ----------
# DBTITLE 1,Data loaders
# Note that this can take a while to run, as it downloads a computer vision model
# in case it needs to do OCR analysis
from llama_index.core import VectorStoreIndex
from pathlib import Path
from llama_index.readers.file.unstructured import UnstructuredReader
unstruct_loader = UnstructuredReader()
unstructured_document = unstruct_loader.load_data(sample_file_to_load)
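# COMMAND ----------
# DBTITLE 1,Inspecting Nodes
# Before building the index, a quick look at how Llama_index turns documents
# into Nodes. A sketch with the standard SentenceSplitter (the chunk sizes are
# illustrative); note each node keeps relationship links to its neighbours.
from llama_index.core.node_parser import SentenceSplitter

node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=64)
nodes = node_parser.get_nodes_from_documents(unstructured_document)
print(f'{len(nodes)} nodes')
print(nodes[1].relationships)  # SOURCE / PREVIOUS / NEXT links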
# COMMAND ----------
# DBTITLE 1,Generate Index
unstructured_index = VectorStoreIndex.from_documents(unstructured_document)
unstructured_query = unstructured_index.as_query_engine()
# COMMAND ----------
# DBTITLE 1,Query
question = 'Are there any weak points in ChatGPT for Zero Shot Learning?'
unstructured_result = unstructured_query.query(question)
print(unstructured_result.response)
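# COMMAND ----------
# DBTITLE 1,Inspect retrieved chunks
# The response object also carries the source nodes that were retrieved to
# answer the question - useful for judging whether the chunking worked.
for node in unstructured_result.source_nodes:
    print(node.score, node.text[:200])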
# COMMAND ----------
# MAGIC %md Try out other types of indices too and, as an extension, see how well it performs with multiple documents \
# MAGIC We have just looked at a single document for now; identifying the best document to use in a multi-document situation is a different problem
# COMMAND ----------
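# As a starting point for the extension above, here is a minimal sketch of an
# alternative index type: a SummaryIndex iterates over all nodes at query time
# rather than doing embedding retrieval, trading cost for recall.
from llama_index.core import SummaryIndex

summary_index = SummaryIndex.from_documents(unstructured_document)
summary_query = summary_index.as_query_engine()
print(summary_query.query(question).response)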