Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add gpu image #56

Merged
merged 5 commits into from
Jun 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion .github/workflows/docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,20 @@ jobs:
username: ${{ secrets.ACR_USER }}
password: ${{ secrets.ACR_PASSWORD }}

- name: Build and push image
# Build the CPU/base image from the default Dockerfile, then mirror it to a
# second (Hangzhou) registry before pushing to both.
- name: Build and push base image
env:
IMAGE_TAG: 0.0.1
run: |
docker build -t ${{ env.REGISTRY }}/mybigpai/pairag:$IMAGE_TAG .
docker tag ${{ env.REGISTRY }}/mybigpai/pairag:$IMAGE_TAG ${{ env.REGISTRY_HZ }}/mybigpai/pairag:$IMAGE_TAG
docker push ${{ env.REGISTRY }}/mybigpai/pairag:$IMAGE_TAG
docker push ${{ env.REGISTRY_HZ }}/mybigpai/pairag:$IMAGE_TAG

# Same flow for the GPU variant: separate Dockerfile (Dockerfile_gpu) and a
# distinct "_gpu" tag suffix so both images can coexist under one repository.
- name: Build and push GPU image
env:
IMAGE_TAG: 0.0.1_gpu
run: |
docker build -t ${{ env.REGISTRY }}/mybigpai/pairag:$IMAGE_TAG -f Dockerfile_gpu .
docker tag ${{ env.REGISTRY }}/mybigpai/pairag:$IMAGE_TAG ${{ env.REGISTRY_HZ }}/mybigpai/pairag:$IMAGE_TAG
docker push ${{ env.REGISTRY }}/mybigpai/pairag:$IMAGE_TAG
docker push ${{ env.REGISTRY_HZ }}/mybigpai/pairag:$IMAGE_TAG
26 changes: 26 additions & 0 deletions Dockerfile_gpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Builder stage: resolve dependencies with Poetry into an in-project
# virtualenv (/app/.venv) that is later copied into the runtime image.
FROM python:3.10-slim AS builder

RUN pip3 install poetry

# Non-interactive Poetry; keep the venv inside the project so its path
# (/app/.venv) is predictable for the COPY --from=builder below.
ENV POETRY_NO_INTERACTION=1 \
    POETRY_VIRTUALENVS_IN_PROJECT=1 \
    POETRY_VIRTUALENVS_CREATE=1 \
    POETRY_CACHE_DIR=/tmp/poetry_cache

WORKDIR /app
COPY . .
# Swap in the GPU dependency manifest. NOTE(review): deleting poetry.lock
# means dependencies are re-resolved at every build, so GPU builds are not
# reproducible — consider committing a pyproject_gpu-specific lock file.
RUN mv pyproject_gpu.toml pyproject.toml \
    && rm poetry.lock

# Install the project and its deps, then drop the Poetry cache to keep the
# builder layer (and build cache exports) small.
RUN poetry install && rm -rf $POETRY_CACHE_DIR

# Runtime stage: slim base plus only the native libs needed at runtime
# (libgl1/libglib2.0-0 are required by opencv-python / easyocr).
FROM python:3.10-slim AS prod
ENV VIRTUAL_ENV=/app/.venv \
    PATH="/app/.venv/bin:$PATH"

# --no-install-recommends and removing the apt lists keep the final image
# smaller; the original left /var/lib/apt/lists baked into the layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends libgl1 libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app
COPY . .
COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
# pai_rag is the console script Poetry installs from [tool.poetry.scripts].
ENTRYPOINT ["pai_rag", "run"]
75 changes: 75 additions & 0 deletions pyproject_gpu.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# GPU-variant project manifest: renamed to pyproject.toml inside
# Dockerfile_gpu before `poetry install`.
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "pai_rag"
version = "0.1.0"
description = "Open source RAG framework built on Aliyun PAI"
authors = []
readme = "README.md"

[tool.poetry.dependencies]
python = ">=3.10.0,<3.12"
# Web service layer
fastapi = "^0.110.1"
uvicorn = "^0.29.0"
# llama-index core and provider integrations (embeddings / LLMs / readers /
# vector stores)
llama-index-core = ">=0.10.29,<=0.10.39"
llama-index-embeddings-openai = "^0.1.7"
llama-index-embeddings-azure-openai = "^0.1.7"
llama-index-embeddings-dashscope = "^0.1.3"
llama-index-llms-openai = "^0.1.15"
llama-index-llms-azure-openai = "^0.1.6"
llama-index-llms-dashscope = "^0.1.2"
llama-index-readers-database = "^0.1.3"
llama-index-vector-stores-chroma = "^0.1.6"
llama-index-vector-stores-faiss = "^0.1.2"
llama-index-vector-stores-analyticdb = "^0.1.1"
llama-index-vector-stores-elasticsearch = "^0.2.0"
llama-index-vector-stores-milvus = "^0.1.10"
gradio = "3.41.0"
# NOTE(review): CPU faiss build even in the GPU manifest — confirm whether
# faiss-gpu is intended here.
faiss-cpu = "^1.8.0"
hologres-vector = "^0.0.9"
dynaconf = "^3.2.5"
docx2txt = "^0.8"
click = "^8.1.7"
pydantic = "^2.7.0"
pytest = "^8.1.1"
llama-index-retrievers-bm25 = "^0.1.3"
jieba = "^0.42.1"
llama-index-embeddings-huggingface = "^0.2.0"
llama-index-postprocessor-flag-embedding-reranker = "^0.1.3"
flagembedding = "^1.2.10"
sentencepiece = "^0.2.0"
oss2 = "^2.18.5"
asgi-correlation-id = "^4.3.1"
openinference-instrumentation-llama-index = "1.3.0"
# Pinned exactly; torch/torchvision versions must stay in lockstep.
torch = "2.2.2"
torchvision = "0.17.2"
# Document parsing / OCR stack
openpyxl = "^3.1.2"
pdf2image = "^1.17.0"
llama-index-storage-chat-store-redis = "^0.1.3"
easyocr = "^1.7.1"
opencv-python = "^4.9.0.80"
llama-parse = "0.4.2"
pypdf2 = "^3.0.1"
pdfplumber = "^0.11.0"
pdfminer-six = "^20231228"
openinference-semantic-conventions = "0.1.6"
llama-index-tools-google = "^0.1.5"
llama-index-tools-duckduckgo = "^0.1.1"
openinference-instrumentation = "^0.1.7"
llama-index-llms-huggingface = "^0.2.0"
pytest-asyncio = "^0.23.7"
pytest-cov = "^5.0.0"
xlrd = "^2.0.1"
markdown = "^3.6"
chardet = "^5.2.0"

# Console scripts installed into the venv; `pai_rag` is the container
# ENTRYPOINT in Dockerfile_gpu.
[tool.poetry.scripts]
pai_rag = "pai_rag.main:main"
load_data = "pai_rag.data.rag_datapipeline:run"
load_easyocr_model = "pai_rag.utils.download_easyocr_models:download_easyocr_models"
evaluation = "pai_rag.evaluations.batch_evaluator:run"

[tool.pytest.ini_options]
asyncio_mode = "auto"
2 changes: 1 addition & 1 deletion src/pai_rag/app/web/view_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ def to_app_config(self):
config["postprocessor"]["rerank_model"] = "bge-reranker-large"
else:
config["postprocessor"]["rerank_model"] = "no-reranker"
config["postprocessor"]["top_n"] = 3
config["postprocessor"]["top_n"] = self.similarity_top_k

config["synthesizer"]["type"] = self.synthesizer_type
config["synthesizer"]["text_qa_template"] = self.text_qa_template
Expand Down
6 changes: 6 additions & 0 deletions src/pai_rag/data/rag_dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ def _extract_file_type(self, metadata: Dict[str, Any]):
async def load(self, file_directory: str, enable_qa_extraction: bool):
data_reader = self.datareader_factory.get_reader(file_directory)
docs = data_reader.load_data()
logger.info(f"[DataReader] Loaded {len(docs)} docs.")

nodes = []

doc_cnt_map = {}
Expand All @@ -78,6 +80,8 @@ async def load(self, file_directory: str, enable_qa_extraction: bool):
else:
nodes.extend(self.node_parser.get_nodes_from_documents([doc]))

logger.info(f"[DataReader] Split into {len(nodes)} nodes.")

# QA metadata extraction
if enable_qa_extraction:
qa_nodes = []
Expand All @@ -103,6 +107,8 @@ async def load(self, file_directory: str, enable_qa_extraction: bool):
node.excluded_llm_metadata_keys.append("question")
nodes.extend(qa_nodes)

logger.info("[DataReader] Start inserting to index.")

self.index.insert_nodes(nodes)
self.index.storage_context.persist(persist_dir=store_path.persist_path)
logger.info(f"Inserted {len(nodes)} nodes successfully.")
Expand Down
2 changes: 1 addition & 1 deletion src/pai_rag/integrations/readers/pai_csv_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def load_data(
with fs.open(file) as f:
encoding = chardet.detect(f.read(100000))["encoding"]
f.seek(0)
if encoding.upper() in ["GB18030", "GBK"]:
if "GB" in encoding.upper():
self._pandas_config["encoding"] = "GB18030"
try:
df = pd.read_csv(f, **self._pandas_config)
Expand Down
10 changes: 8 additions & 2 deletions src/pai_rag/modules/embedding/embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,15 @@ def _create_new_instance(self, new_params: Dict[str, Any]):
elif source == "huggingface":
model_dir = config.get("model_dir", DEFAULT_MODEL_DIR)
model_name = config.get("model_name", DEFAULT_HUGGINGFACE_EMBEDDING_MODEL)
embed_batch_size = config.get("embed_batch_size", DEFAULT_EMBED_BATCH_SIZE)

model_path = os.path.join(model_dir, model_name)
embed_model = HuggingFaceEmbedding(model_name=model_path)
logger.info("Initialized HuggingFace embedding model.")
embed_model = HuggingFaceEmbedding(
model_name=model_path, embed_batch_size=embed_batch_size
)
logger.info(
f"Initialized HuggingFace embedding model {model_name} with {embed_batch_size} batch size."
)

elif source == "dashscope":
embed_model = DashScopeEmbedding(
Expand Down
Loading