Add dspy for efficient token usage and more reliable agent responses [WIP] #11

Draft · wants to merge 3 commits into base: main
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,2 +1,3 @@
.vscode/
.notebook/
.notebook/
*.man
113 changes: 113 additions & 0 deletions docker/agent.py
@@ -0,0 +1,113 @@
import dspy
from typing import Optional, AsyncGenerator, List
import aiohttp
import configparser

# Load configuration
config = configparser.ConfigParser()
config.read('docker/research.config')

USE_OLLAMA = config.getboolean('Settings', 'use_ollama')
OLLAMA_BASE_URL = config.get('LocalAI', 'ollama_base_url')
OPENAI_URL = config.get('API', 'openai_url')
OPENAI_COMPAT_API_KEY = config.get('API', 'openai_compat_api_key')
DEFAULT_MODEL = config.get('Settings', 'default_model')

# Initialize dspy based on configuration
if USE_OLLAMA:
    # Use Ollama for local inference
    lm = dspy.LM('ollama_chat/' + DEFAULT_MODEL, api_base=OLLAMA_BASE_URL, api_key='')
else:
    # Use OpenRouter/OpenAI compatible API
    lm = dspy.LM('openai/' + DEFAULT_MODEL, api_key=OPENAI_COMPAT_API_KEY, api_base=OPENAI_URL)

dspy.configure(lm=lm)

class ReportInstructionExtraction(dspy.Signature):
    """
    Extract components from a natural language instruction for generating a report.

    This extraction includes:
    - writing_style: instructions for the writing style,
    - searching_instruction: guidance on how to search,
    - local_doc_dir: local directory for documents,
    - online_url_include: list of online URLs to include,
    - online_url_avoid: list of online URLs to avoid,
    - main_query: the primary query to search for.
    """
    instruction: str = dspy.InputField(desc="A natural language instruction that includes all necessary details")
    writing_style: str = dspy.OutputField(desc="Writing style instruction (e.g., formal, informal, technical)")
    searching_instruction: str = dspy.OutputField(desc="Instructions on how to perform the search (e.g., search keywords, filters)")
    local_doc_dir: str = dspy.OutputField(desc="Local document directory path to be used for reference")
    online_url_include: List[str] = dspy.OutputField(desc="List of online URLs that must be included in the search")
    online_url_avoid: List[str] = dspy.OutputField(desc="List of online URLs to avoid in the search")
    main_query: str = dspy.OutputField(desc="The primary search query")

class WebpageAnalyzer(dspy.Signature):
    """Analyze webpage content for usefulness and extract relevant information."""

    user_query: str = dspy.InputField()
    search_query: str = dspy.InputField()
    page_url: str = dspy.InputField()
    page_content: str = dspy.InputField()

    is_useful: bool = dspy.OutputField()
    reason: str = dspy.OutputField(desc="Reasoning for usefulness decision")
    extracted_context: Optional[str] = dspy.OutputField(desc="Relevant extracted content if page is useful")

async def process_link_dspy(session: aiohttp.ClientSession, link: str, user_query: str, search_query: str,
                            page_text: str, create_chunk=None) -> AsyncGenerator[tuple[str, str, Optional[str]], None]:
    """
    Process a single link using dspy to determine usefulness and extract context in one pass.
    Returns a generator that yields tuples of (usefulness, reason, context).
    """
    try:
        # Initialize and run the analyzer
        analyzer = dspy.Predict(WebpageAnalyzer)
        result = analyzer(
            user_query=user_query,
            search_query=search_query,
            page_url=link,
            page_content=page_text[:20000]  # Limit content length similar to original
        )

        usefulness = "Yes" if result.is_useful else "No"
        reason = result.reason
        context = None

        if result.is_useful and result.extracted_context:
            context = result.extracted_context

        yield (usefulness, reason, context)

    except Exception as e:
        print(f"Error processing {link} with dspy: {e}")
        yield ("No", f"Error: {str(e)}", None)
        return

async def is_page_useful_dspy(session: aiohttp.ClientSession, user_query: str, page_text: str, page_url: str) -> tuple[str, str]:
    """
    Use dspy to determine if a page is useful and provide reasoning.
    Returns a tuple of (decision, reasoning).
    """
    analyzer = dspy.Predict(WebpageAnalyzer)
    result = analyzer(
        user_query=user_query,
        search_query="",  # Empty since we're only checking usefulness
        page_url=page_url,
        page_content=page_text[:20000]
    )
    return ("Yes" if result.is_useful else "No", result.reason)

async def extract_report_instructions_async(session: aiohttp.ClientSession, system_message: str) -> Optional[ReportInstructionExtraction]:
    """
    Extract report components from a system message using dspy.
    Returns the extracted ReportInstructionExtraction object or None if extraction fails.
    """
    try:
        extractor = dspy.Predict(ReportInstructionExtraction)
        result = extractor(instruction=system_message)
        return result
    except Exception as e:
        print(f"Error extracting report instructions: {e}")
        return None
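
As a quick sanity check, the helpers above can be exercised on their own. A minimal sketch, assuming the config file is present and dspy is configured by the module import; the URL, queries, and page text below are placeholders:

import asyncio
import aiohttp
from agent import process_link_dspy, extract_report_instructions_async

async def demo():
    async with aiohttp.ClientSession() as session:
        # Placeholder inputs; real page text would come from the fetch step in main.py.
        async for usefulness, reason, context in process_link_dspy(
            session,
            link="https://example.com/article",
            user_query="What is DSPy?",
            search_query="dspy framework overview",
            page_text="DSPy is a framework for programming language models ...",
        ):
            print(usefulness, reason, (context or "")[:80])

        # Pull report components out of a free-form system message.
        extracted = await extract_report_instructions_async(
            session, "Write a formal report on DSPy; prefer arxiv.org and avoid reddit.com."
        )
        if extracted:
            print(extracted.main_query, extracted.writing_style)

asyncio.run(demo())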
38 changes: 34 additions & 4 deletions docker/main.py
@@ -16,6 +16,7 @@
from fastapi.responses import StreamingResponse, JSONResponse
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Any
from agent import process_link_dspy

# FastAPI app
app = FastAPI(title="Deep Researcher API")
@@ -91,6 +92,7 @@ class ChatCompletionChunk(BaseModel):
USE_OLLAMA = config.getboolean('Settings', 'use_ollama')
USE_JINA = config.getboolean('Settings', 'use_jina')
WITH_PLANNING = config.getboolean('Settings', 'with_planning')
USE_DSPY = config.getboolean('Settings', 'use_dspy')
DEFAULT_MODEL = config.get('Settings', 'default_model')
REASON_MODEL = config.get('Settings', 'reason_model')

@@ -723,18 +725,46 @@ async def process_link(session, link, user_query, search_query, create_chunk=Non
        print(status_msg)

    try:
        # Create fetch task immediately
        # Create fetch task immediately to maintain concurrency
        fetch_task = asyncio.create_task(fetch_webpage_text_async(session, link))

        # Wait for fetch to complete
        page_text = await fetch_task
        if not page_text:
            return

        # Create usefulness task immediately
        # Use dspy-based processing if enabled
        if USE_DSPY:
            # Stream usefulness decision and reasoning first
            status_msg = f"Processing {link} with dspy...\n\n"
            if create_chunk:
                yield create_chunk(status_msg)
            else:
                print(status_msg)

            async for result in process_link_dspy(session, link, user_query, search_query, page_text, create_chunk):
                if isinstance(result, tuple) and len(result) == 3:  # (usefulness, reason, context)
                    usefulness, reason, context = result
                    # Show usefulness decision and reasoning
                    status_msg = f"Page usefulness for {link}: {usefulness}\nReason: {reason}\n\n"
                    if create_chunk:
                        yield create_chunk(status_msg)
                    else:
                        print(status_msg)

                    # If useful, show and yield the context
                    if usefulness == "Yes" and context:
                        status_msg = f"Extracted context from {link} (first 200 chars): {context[:200]}\n\n"
                        if create_chunk and VERBOSE_WEB_PARSE:
                            yield create_chunk(status_msg)
                        else:
                            print(status_msg)
                        context_with_url = "url:" + link + "\ncontext:" + context
                        yield context_with_url
            return

        # Original processing flow
        usefulness_task = asyncio.create_task(is_page_useful_async(session, user_query, page_text, link))

        # Create context task but don't await it yet
        context_task = asyncio.create_task(extract_relevant_context_async(session, user_query, search_query, page_text, link))

        # Wait for usefulness check and stream its result
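
For context on how the streamed values might be consumed: the dspy branch yields pre-formatted status chunks plus one final "url:...\ncontext:..." string per useful page, so a hypothetical caller (not part of this PR) could filter on that prefix:

async def collect_contexts(session, link, user_query, search_query, create_chunk=None):
    # Hypothetical driver: drain process_link for one URL and keep only the
    # extracted-context payloads; everything else is a status chunk for the stream.
    contexts = []
    async for item in process_link(session, link, user_query, search_query, create_chunk):
        if isinstance(item, str) and item.startswith("url:"):
            contexts.append(item)
    return contexts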
1 change: 1 addition & 0 deletions docker/research.config
@@ -14,6 +14,7 @@ searxng_url = http://localhost:4000/search
use_jina = true
use_ollama = false
with_planning = true
use_dspy = false
default_model = anthropic/claude-3.5-haiku
reason_model = deepseek/deepseek-r1-distill-qwen-32b

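For reference, a research.config layout that satisfies every lookup in docker/agent.py plus the new use_dspy flag in docker/main.py might look like the sketch below; the section names ([Settings], [LocalAI], [API]) come from the code, while the endpoint URLs and API key are placeholder values:

[Settings]
use_jina = true
use_ollama = false
with_planning = true
use_dspy = false
default_model = anthropic/claude-3.5-haiku
reason_model = deepseek/deepseek-r1-distill-qwen-32b

[LocalAI]
# Placeholder: a typical local Ollama endpoint
ollama_base_url = http://localhost:11434

[API]
# Placeholders for any OpenAI-compatible provider (e.g. OpenRouter)
openai_url = https://openrouter.ai/api/v1
openai_compat_api_key = sk-placeholder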