diff --git a/bifrost/app/blog/blogs/ai-agent-builders/metadata.json b/bifrost/app/blog/blogs/ai-agent-builders/metadata.json index 08d46fe5ab..b8aaece532 100644 --- a/bifrost/app/blog/blogs/ai-agent-builders/metadata.json +++ b/bifrost/app/blog/blogs/ai-agent-builders/metadata.json @@ -1,8 +1,8 @@ { - "title": "6 Awesome Platforms & Frameworks for Building AI Agents (Open-Source & More)", - "title1": "6 Awesome Platforms & Frameworks for Building AI Agents (Open-Source & More)", - "title2": "6 Awesome Platforms & Frameworks for Building AI Agents (Open-Source & More)", - "description": "Today, we are covering 6 of our favorite platforms for building AI agents — whether you need complex multi-agent systems or a simple no-code solution. ", + "title": "7 Awesome Platforms & Frameworks for Building AI Agents (Open-Source & More)", + "title1": "7 Awesome Platforms & Frameworks for Building AI Agents (Open-Source & More)", + "title2": "7 Awesome Platforms & Frameworks for Building AI Agents (Open-Source & More)", + "description": "Today, we are covering 7 of our favorite platforms for building AI agents — whether you need complex multi-agent systems or a simple no-code solution. ", "images": "/static/blog/ai-agent-builders/cover.webp", "time": "12 minute read", "author": "Lina Lam", diff --git a/bifrost/app/blog/blogs/ai-agent-builders/src.mdx b/bifrost/app/blog/blogs/ai-agent-builders/src.mdx index da00e215d8..32c13e06f1 100644 --- a/bifrost/app/blog/blogs/ai-agent-builders/src.mdx +++ b/bifrost/app/blog/blogs/ai-agent-builders/src.mdx @@ -2,32 +2,32 @@ Lately, there's been a lot of chatter about AI agents—some enthusiastic, some ![Open-Source AI Agent Builder Cover](/static/blog/ai-agent-builders/cover.webp) -### AI Agents — Hype vs. Reality +## AI Agents — Hype vs. Reality For some, AI agents are groundbreaking ways to solve big problems by **breaking them down into smaller, manageable tasks**. AI Agents are particularly great at handling repetitive or complex work. Critics, on the other hand, often imagine **“Super Agents”** that could fully replace humans—but we're not quite there yet. However, this doesn't mean that AI agents aren’t incredibly useful today. Many platforms are already making meaningful strides in building robust, practical AI agents that empower teams, streamline processes, and unlock new possibilities across industries. -Today, we are covering our favorite **open-source frameworks & platforms for building AI agents, how they compare, and our recommendations for different use cases** — whether you need complex multi-agent systems or a simple no-code solution, these will help you get started. +Today, we are covering our favorite **open-source frameworks & platforms** for building AI agents, how they compare, and our recommendations for different use cases — whether you need complex multi-agent systems or a simple no-code solution, these will help you get started. 1. Dify `freemium` `no-code` `open-source` 2. AutoGen `free` `open-source` 3. LlamaIndex `freemium` `open-source` 4. LangChain `freemium` `open-source` -5. crewAI `free` `waitlist` `open-source` -6. (Bonus) Wordware `freemium` `not open-source` +5. crewAI `freemium` `open-source` +6. Pydantic AI `free` `open-source` +7. (Bonus) Wordware `freemium` `not open-source` --- -### Plug 🧊 - -Once you build an AI agent, how do you know if it’s performing well? Introducing **Helicone's Sessions feature** to help you analyze complex multi-step workflows. 
- -**Watch:** [Trace and debug your AI agents with Sessions](https://www.linkedin.com/posts/colegottdank_re-launching-helicone-day-23-sessions-activity-7231793935285714944-bKI8/). - -Helicone has integrations with many platforms we will mention soon. Try it, and let us know what you think! - ---- + ## First, what are AI agents anyway? @@ -61,7 +61,7 @@ Platforms like **Perplexity** and **Dify: The No-Code Platform for Building Agents **Best for**: Rapid prototyping and non-technical team collaboration @@ -71,7 +71,9 @@ Platforms like **Perplexity** and **Dify with CrewAI. + --- -### 2. [AutoGen](https://microsoft.github.io/autogen/): The Multitasking Framework +## 2. AutoGen: The Multitasking Framework **Best for**: Building complex, multi-agent systems with high customization needs @@ -120,9 +124,11 @@ AutoGen is Microsoft’s open-source framework providing developers with tools t | • **Deep NLP Integration**: Flexibility in integrating and fine-tuning various LLMs. | • **Dependency on input quality**: The output is contingent on the quality of input data. To get optimal results, users must meticulously choose and verify their inputs. | | • **Scalability**: Integrates with cloud services like Azure for handling large-scale operations. | | +Compare AutoGen with CrewAI. + --- -### 3. [LlamaIndex](https://www.llamaindex.ai/): Build Your Ideal AI Team +## 3. LlamaIndex: Build Your Ideal AI Team **Best for**: Data-intensive applications requiring efficient indexing and retrieval @@ -148,13 +154,15 @@ AutoGen is Microsoft’s open-source framework providing developers with tools t | • **Focused on indexing and efficient retrieval**: its advanced indexing techniques ensure fast and accurate retrieval of relevant information. | • **Limited functionality:** Primarily focused on data retrieval, less suitable for complex NLP tasks. | | • **Customizable workflows:** tailor the RAG process to specific use cases, including model options, retrieval strategy, etc. | | +Compare LlamaIndex with LangChain. + --- -### 4. [crewAI](https://www.crewai.com/): Build Your Ideal AI Team +## 4. crewAI: Build Your Ideal AI Team **Best for**: Creating role-based AI agent teams for specific workflows -`free` `waitlist` `open-source` +`freemium` `open-source` ![crewAI: AI Agent Builder](/static/blog/ai-agent-builders/crewai.webp) @@ -180,9 +188,11 @@ A structured platform for creating and managing AI agents, allowing users to def | • **Quick prototype & deployment:** fast set-up and highly modular design that lets you test complex agent interaction and deploy them easily. | • **Lack advanced features:** Less suited for highly specialized or computation-heavy tasks. | | • **Integration with LangChain**: Integrates well with LangChain and its related tools. | • **Limited customization:** Less flexibility compared to more developer-centric platforms. | +Compare CrewAI vs. AutoGen or CrewAI vs. Dify. + --- -### 5. [LangChain](https://www.langchain.com/): A Modular Approach to AI Development +## 5. LangChain: A Modular Approach to AI Development **Best for**: Modular, flexible AI application development @@ -210,9 +220,41 @@ LangChain is an open-source framework designed for building applications powered | • **Scalability:** Useful for prototyping complex AI solutions and large datasets efficiently. | • **Poor documentation:** often outdated or unclear, however, has a large active community for support. | | • **Integrations:** Has many integrations which requires more coding effort, but offers flexibility for custom models. 
| • **Not suitable for production environments**: due to instability and frequent changes. | +Compare LangChain vs. LlamaIndex. + +--- + +## 6. Pydantic AI: Simplify Building AI Apps for Production + +**Best for**: production-grade AI apps that require structured output and type safety. + +`free` `open-source` + +![Pydantic AI: AI Agent Builder](/static/blog/ai-agent-builders/pydantic-ai.webp) + +[![Pydantic AI](https://img.shields.io/github/stars/pydantic/pydantic-ai.svg?stylAe=social)](https://github.com/pydantic/pydantic-ai) + +Pydantic AI is a powerful Python agent framework that simplifies the development of production-grade applications with Generative AI. Pydantic is created by the team behind the popular Pydantic data validation library. As an open-source project, it follows the same model as the core Pydantic library, allowing developers to use it freely for both personal and commercial projects. + +### Key features of Pydantic AI + +- 🔄 **Model-Agnostic:** Pydantic AI supports various models, including OpenAI, Anthropic, Gemini, Ollama, Groq, and Mistral, with the flexibility to implement support for other models. +- 🚀 **Ease of Use:** Creating an agent with Pydantic AI requires just a few lines of code to define an agent with a specified model and system prompt. +- 🏗️ **Structured Responses:** The framework leverages Pydantic's capabilities to validate and structure model outputs, making sure the output is consistent across runs. +- 📊 **Observability:** Pydantic AI integrates seamlessly with Pydantic Logfire for real-time debugging, usage and cost monitoring, and behavior tracking of AI applications. +- ✅ **Type Safety:** Pydantic AI works well with static type checkers like mypy and pyright, which improves code reliability and maintainability. + +### Pros and Cons of Pydantic AI + +| **Pros** | **Cons** | +| ------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| • **Built on Vanilla Python:** full control and visibility into the agent's inner workings. | • **Steep learning curve:** While designed for ease of use, developers still need to familiarize themselves with Pydantic's specific syntax and behavior. | +| • **Cost Tracking:** built-in monitoring of token usage and associated costs. | • **Exception Handling:** Some developers find that Pydantic's exceptions can sometimes be difficult to debug. | +| • **Function Calling and Structured Output:** Excels in generating structured output and defining object models for agent responses. | • **Dependency on External Library:** Using Pydantic AI introduces a dependency on an external library, and may not be desirable for all projects. | + --- -### 6. [Wordware](https://www.wordware.ai/): The Versatile AI Toolkit +## 7. Wordware: The Versatile AI Toolkit **Best for**: Rapid AI agent development with collaborative features @@ -243,41 +285,72 @@ Wordware is an AI toolkit designed to streamline the creation and deployment (vi ## Choosing the Right Platform -**Dify, AutoGen, LlamaIndex, crewAI, LangChain, Wordware — which is better for your use case?** We have a short and long answer. +**Dify, AutoGen, LlamaIndex, crewAI, LangChain, Pydantic AI, Wordware — which is better for your use case?** We have a short and long answer. 
### The short answer -| Criteria | Dify | AutoGen | LlamaIndex | crewAI | LangChain | Wordware | -| ------------------- | ------ | ------- | ---------- | ------ | --------- | -------- | -| Rapid development | ✔️✔️✔️ | ✔️ | ✔️✔️ | ✔️✔️✔️ | ✔️✔️ | ✔️✔️✔️ | -| Deep customization | ✔️ | ✔️✔️✔️ | ✔️✔️ | ✔️✔️ | ✔️✔️✔️ | ✔️✔️ | -| Data integration | ✔️✔️ | ✔️✔️ | ✔️✔️✔️ | ✔️ | ✔️✔️ | ✔️✔️ | -| Scalability | ✔️✔️ | ✔️✔️✔️ | ✔️✔️✔️ | ✔️✔️ | ✔️✔️ | ✔️✔️ | -| Ease of use | ✔️✔️✔️ | ✔️ | ✔️ | ✔️✔️ | ✔️ | ✔️✔️✔️ | -| Multi-agent support | ✔️ | ✔️✔️✔️ | ✔️ | ✔️✔️✔️ | ✔️✔️ | ✔️ | -| Code execution | ✔️ | ✔️✔️✔️ | ✔️ | ✔️ | ✔️✔️ | ✔️✔️ | -| Community support | ✔️✔️ | ✔️✔️ | ✔️✔️ | ✔️ | ✔️✔️✔️ | ✔️ | +| Criteria | Dify | AutoGen | LlamaIndex | crewAI | LangChain | Pydantic | Wordware | +| ------------------- | ------ | ------- | ---------- | ------ | --------- | -------- | -------- | +| Rapid development | ✔️✔️✔️ | ✔️ | ✔️✔️ | ✔️✔️✔️ | ✔️✔️ | ✔️✔️✔️ | ✔️✔️✔️ | +| Deep customization | ✔️ | ✔️✔️✔️ | ✔️✔️ | ✔️✔️ | ✔️✔️✔️ | ✔️✔️✔️ | ✔️✔️ | +| Data integration | ✔️✔️ | ✔️✔️ | ✔️✔️✔️ | ✔️ | ✔️✔️ | ✔️✔️✔️ | ✔️✔️ | +| Scalability | ✔️✔️ | ✔️✔️✔️ | ✔️✔️✔️ | ✔️✔️ | ✔️✔️ | ✔️✔️ | ✔️✔️ | +| Ease of use | ✔️✔️✔️ | ✔️ | ✔️ | ✔️✔️ | ✔️ | ✔️✔️✔️ | ✔️✔️✔️ | +| Multi-agent support | ✔️ | ✔️✔️✔️ | ✔️ | ✔️✔️✔️ | ✔️✔️ | ✔️ | ✔️ | +| Code execution | ✔️ | ✔️✔️✔️ | ✔️ | ✔️ | ✔️✔️ | ✔️✔️ | ✔️✔️ | +| Community support | ✔️✔️ | ✔️✔️ | ✔️✔️ | ✔️ | ✔️✔️✔️ | ✔️ | ✔️ | + + ### The long answer… Picking the right AI agent platform comes down to understanding your needs and matching them with the right tools. Consider the following factors: -- **For rapid prototyping**, go for **Dify** or **Wordware** for their no-code/low-code platform. -- **For deep customization**, go for **AutoGen** or **LangChain** for their extensive customization capabilities. +- **For rapid prototyping**, go for **Dify**, **Pydantic AI** or **Wordware** for their no-code/low-code platform. +- **For deep customization**, **AutoGen**, **LangChain** and **Pydantic AI** are strong choices for their extensive customization options. Especially Pydantic AI with its vanilla Python approach. - **For teams with mixed technical backgrounds**, **Dify** is ideal due to its user-friendly interface for both developers and non-technical users. -- **For highly technical teams**, **AutoGen** or **LangChain** may be better choices as they provide the depth and flexibility advanced developers often require. -- **If your project involves complex data integration**, **LlamaIndex** does well for handling diverse data sources and efficient retrieval. -- **For projects focused on language processing**, **LangChain** offers more robust NLP capabilities and integrations with various language models. -- **For enterprise-grade scalability**, **AutoGen** integrates well with cloud services for large-scale operations. +- **For highly technical teams**, **AutoGen**, **LangChain** and **Pydantic AI** may be better choices as they provide the depth and flexibility advanced developers need. +- **If your project involves complex data integration**, **LlamaIndex** does well for handling diverse data sources and efficient retrieval. **Pydantic AI** is also a great option that supports robust data validation and structuring. +- **For projects focused on language processing**, both **LangChain** and **Pydantic AI** support various language models, including those specialized in NLP tasks. 
+- **For enterprise-grade scalability**, **AutoGen** and **Pydantic AI** are strong choices as they integrate well with cloud services. However all the tools mentioned can be integrated with **Helicone** for observability and monitoring. - **For efficient handling of large datasets**, **LlamaIndex** provides optimized indexing and retrieval mechanisms. - **For building conversational AI or chatbots**, **Dify** or **LangChain** have strong support for dialogue management and NLP tasks. -- **For data analysis and research applications**, **LlamaIndex** or **AutoGen** will be better suited for their robust data processing and analysis capabilities. +- **For data analysis and research applications**, **LlamaIndex**, **AutoGen** and **Pydantic AI** will be better suited for their robust data processing and analysis capabilities. - **For workflow automation with distinct AI roles**, **crewAI** specializes in creating teams of AI agents with defined roles and collaboration patterns. Don’t forget to consider the learning curves of each platform and the kind of support available, especially if your team is new to AI development. --- -### Questions or feedback? - -Are the information out of date? Do you have additional platforms to add? Please raise an issue and we’d love to share your insights! +## You might find these useful: + +- + Optimizing AI Agents: Replaying LLM Sessions to Improve Performance + +- + Debugging RAG Chatbot and AI Agents with Sessions + +- + The Emerging LLM Stack + + + diff --git a/bifrost/app/blog/blogs/ai-best-practices/src.mdx b/bifrost/app/blog/blogs/ai-best-practices/src.mdx index af5d14bc72..394f3526fa 100644 --- a/bifrost/app/blog/blogs/ai-best-practices/src.mdx +++ b/bifrost/app/blog/blogs/ai-best-practices/src.mdx @@ -124,16 +124,12 @@ LLMs can be manipulated into convincing the user to input sensitive information, **How Helicone can help you:** -Helicone provides moderation and LLM security features to help you check whether the user message is potentially harmful, and enhance OpenAI chat completions with automated security checks, which include user messages for threads, block injection threats and threat details back to you. +Helicone provides moderation and LLM security features to help you check whether the user message is potentially harmful, and enhance OpenAI chat completions with automated security checks, which include user messages for threads, block injection threats and threat details back to you. --- -### Conclusion +## Bottom Line Keeping your AI app reliable hinges on effective observability and performance monitoring. This means defining important performance metrics, setting up thorough logging, monitoring your outputs regularly, and ensuring safety and security measures are in place. By following these best practices, you can boost the performance and reliability of your LLM deployments and accelerate your AI development. ---- - -### Try Helicone for free. - - Sign up or contact us. + diff --git a/bifrost/app/blog/blogs/ai-safety/src.mdx b/bifrost/app/blog/blogs/ai-safety/src.mdx index 2fbff8c61b..35d55b8038 100644 --- a/bifrost/app/blog/blogs/ai-safety/src.mdx +++ b/bifrost/app/blog/blogs/ai-safety/src.mdx @@ -33,3 +33,5 @@ Last but not least, we are actively developing Drift Detection. Understanding th ## Ethical Observability as a Cornerstone for the Future In a world increasingly dependent on AI, it's crucial that we deploy these powerful technologies responsibly and ethically. 
Helicone addresses this need by providing not just robust performance tracking but also specialized features for ethical observability, including Two-Way Door Auditing, Segmentation and ETL, and Drift Detection. These features serve as ethical cornerstones, ensuring that AI systems adhere to legal standards and societal values. As we navigate the complexities of AI integration into various sectors, tools like Helicone become not just an operational necessity but a social imperative, empowering organizations to deploy AI both efficiently and ethically. + + diff --git a/bifrost/app/blog/blogs/autoGPT/src.mdx b/bifrost/app/blog/blogs/autoGPT/src.mdx index cd25226462..aa86a6b40b 100644 --- a/bifrost/app/blog/blogs/autoGPT/src.mdx +++ b/bifrost/app/blog/blogs/autoGPT/src.mdx @@ -1,12 +1,11 @@ - -AutoGPT is diligently developing their [Auto-GPT-Benchmarks repository](https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks). Their goal? To construct the optimal evaluation pipeline for comparing different agents. +AutoGPT is diligently developing their Auto-GPT-Benchmarks repository. Their goal? To construct the optimal evaluation pipeline for comparing different agents. ![AutoGPT x Helicone: Optimizing Evaluation Pipelines](/static/blog/autogpt.webp) AutoGPT is fully leveraging the capabilities of Helicone without modifying a single line of code. Here are the key features that facilitate this synergy: -- **Proxy Integration:** Helicone's role as a proxy allows AutoGPT to maintain their codebase intact. Learn more about this feature in our [MITM Proxy documentation](https://docs.helicone.ai/tools/mitm-proxy). -- **Caching:** For minor code modifications that don't necessitate re-calling the LLM for an entire CI pipeline, requests can be cached on edge servers. This feature saves AutoGPT over $10 per PR! You can read more about this in our [Caching documentation](https://docs.helicone.ai/features/advanced-usage/caching). +- **Proxy Integration:** Helicone's role as a proxy allows AutoGPT to maintain their codebase intact. Learn more about this feature in our MITM Proxy documentation. +- **Caching:** For minor code modifications that don't necessitate re-calling the LLM for an entire CI pipeline, requests can be cached on edge servers. This feature saves AutoGPT over $10 per PR! You can read more about this in our Caching documentation. - **GraphQL:** Our data extraction API enables AutoGPT to generate custom reports upon the completion of a CI job. ## AutoGPT's Workflow with Helicone @@ -22,9 +21,7 @@ bash -c "$(curl -fsSL https://raw.githubusercontent.com/Helicone/helicone/main/m ``` -Within the benchmarks, AutoGPT implemented a Python library where they could set specific custom properties for detailed measurements, as shown here. You can learn more about this in our [Custom Properties documentation](https://docs.helicone.ai/features/advanced-usage/custom-properties). - - +Within the benchmarks, AutoGPT implemented a Python library where they could set specific custom properties for detailed measurements, as shown here. You can learn more about this in our Custom Properties documentation. ```python HeliconeLockManager.write_custom_property("job_id", "1") @@ -39,41 +36,36 @@ export HELICONE_CACHE_ENABLED="true" The total integration process required at most 5 lines of code, which enabled AutoGPT to immediately get rich dashboards and save costs on their CI jobs. - ### Data Ingest AutoGPT can track how different agents are impacting their costs. 
![Agent Comparisons](/static/blog/agentComparisons.webp) -*AutoGPT's agent comparison dashboard* +_AutoGPT's agent comparison dashboard_ If they wish to examine specific requests, they can do this by using the filter feature. ![Agent Filters](/static/blog/agentFilters.webp) -*Filtering feature for examining specific requests* - +_Filtering feature for examining specific requests_ ### Determining Cost Savings For scenarios where testing the agent's functionality is needed but calling the API is not, such as small code changes, they can monitor their cache usage effortlessly through the dashboards. Here's an example: - ![Cache Page Stats](/static/blog/cachePageStats.webp) -*Dashboard showing cache usage statistics* +_Dashboard showing cache usage statistics_ We also maintain a log of each cached request, ensuring that caching is effective and marking agents as "Cacheable Agents". ![Cache Request Table](/static/blog/cacheRequestTable.webp) -*Log of each cached request* - +_Log of each cached request_ ## The Road Ahead We are currently developing a suite of GraphQL endpoints that will allow AutoGPT to easily ingest some of their data and add it directly to the reports after a run. ![GraphQL](/static/blog/graphQL.webp) -*GraphQL endpoints in development* - +_GraphQL endpoints in development_ This development is being paired with deep links so that we can have a tight integration between report generation and Helicone. Here is a preview of what a benchmark report will look like: @@ -87,7 +79,7 @@ Challenge go-to-market Number of OpenAI calls: 231 Total Cache hits: 231 Cache $ saved: $231 - Link: https://helicone.ai/requests?propertyFilters=%5B%7B%22key%22%3A%22challenge%22%2C%22value%22%3A%22got-to-market%22%7D%5D + Link: https://helicone.ai/requests?propertyFilters=%5B%7B%22key%22%3A%22challenge%22%2C%22value%22%3A%22got-to-market%22%7D%5D Model breakdown | | gpt4 | claude | gpt3.5 | @@ -103,7 +95,7 @@ Challenge send-email Number of OpenAI calls: 231 Total Cache hits: 231 Cache $ saved: $231 - Link: https://helicone.ai/requests?propertyFilters=%5B%7B%22key%22%3A%22challenge%22%2C%22value%22%3A%22send-email%22%7D%5D + Link: https://helicone.ai/requests?propertyFilters=%5B%7B%22key%22%3A%22challenge%22%2C%22value%22%3A%22send-email%22%7D%5D Model breakdown | | gpt4 | claude | gpt3.5 | @@ -125,4 +117,6 @@ Challenge send-email ## Thank You for Reading! -We appreciate your time in reading our first blog post. We are excited to be partnering with AutoGPT to enable rich logging for them and deliver value using Helicone. If you are interested in learning more about Helicone or would like to meet the team, please email me at justin@helicone.ai or join our discord! \ No newline at end of file +We appreciate your time in reading our first blog post. We are excited to be partnering with AutoGPT to enable rich logging for them and deliver value using Helicone. If you are interested in learning more about Helicone or would like to meet the team, please email me at justin@helicone.ai or join our discord! + + diff --git a/bifrost/app/blog/blogs/best-arize-alternatives/src.mdx b/bifrost/app/blog/blogs/best-arize-alternatives/src.mdx index 25012ab5c6..616e1d2e63 100644 --- a/bifrost/app/blog/blogs/best-arize-alternatives/src.mdx +++ b/bifrost/app/blog/blogs/best-arize-alternatives/src.mdx @@ -1,10 +1,10 @@ ![Arize AI vs. 
Helicone, which one is better?](/static/blog/arize-alternatives/helicone-vs-arize.webp) -### Introduction +## Introduction As the adoption of Large Language Models (LLMs) continues to grow, the need for robust observability tools has become paramount. These tools help developers and data scientists monitor, analyze, and optimize their LLM applications. In this comparison, we'll explore two leading platforms in the LLM observability space: Helicone and Arize Phoenix. Both offer unique features and capabilities, but choosing the right tool can significantly impact your AI development workflow. -### Overview: Helicone vs. Arize Phoenix +## Overview: Helicone vs. Arize Phoenix | Feature | Helicone | Arize Phoenix | | -------------------- | -------- | ------------- | @@ -24,45 +24,13 @@ As the adoption of Large Language Models (LLMs) continues to grow, the need for | User Tracking | ✅ | ❌ | | User Feedback | ✅ | ❌ | ---- - -### Use Case Scenarios - -Different tools excel in different scenarios. Here's a quick guide to help you choose the right tool for your specific needs: - -1. **Small Startup with Limited Budget** - - - Best Tool: Helicone - - Why: Offers a free tier and flexible pricing, making it accessible for startups - -2. **Large Enterprise with Complex Workflows** - - - Best Tool: Helicone - - Why: Robust evaluation capabilities and scalability for enterprise-level needs - -3. **Research Team Focused on Experimentation** - - - Best Tool: Helicone - - Why: Comprehensive experiment features and prompt management - -4. **Solo Developer Working on Side Projects** - - - Best Tool: Helicone - - Why: Easy integration and user-friendly interface - -5. **AI-Focused Company with High Volume LLM Usage** - - Best Tool: Helicone - - Why: Advanced caching, cost analysis, and scalability features - ---- - -# **1. Helicone** +## 1. Helicone **Designed for: developers & analysts** ![Helicone Dashboard Image](/static/blog/arize-alternatives/helicone-dashboard.webp) -## What is Helicone? +### What is Helicone? Helicone is a comprehensive LLM observability platform designed for developers of all skill levels. It offers a wide range of features including request logging, caching, prompt management, and advanced analytics. With its open-source nature and self-hosting options, Helicone provides flexibility and control over your data. @@ -74,19 +42,19 @@ Helicone is a comprehensive LLM observability platform designed for developers o 4. **User Tracking** - Gain insights into user interactions and behaviors within your LLM-powered applications. 5. **Cost Analysis** - Monitor and optimize your LLM usage costs with detailed analytics. -## How does Helicone compare to Arize Phoenix? +### How does Helicone compare to Arize Phoenix? While both tools offer strong observability features, Helicone stands out with its user-friendly approach and comprehensive feature set. Unlike Arize Phoenix, Helicone provides self-hosting options, user tracking, and user feedback collection. Its flexible pricing model and free tier make it more accessible for smaller teams and individual developers. --- -# **2. Arize Phoenix** +## 2. Arize Phoenix **Designed for: Data scientists & ML engineers** ![Arize Phoenix Dashboard Image](/static/blog/arize-alternatives/arize-ai-dashboard.webp) -## What is Arize Phoenix? +### What is Arize Phoenix? Arize Phoenix is an open-source LLM observability tool that focuses on providing robust evaluation and monitoring capabilities for LLM applications. 
It offers features like tracing, prompt management, and performance analytics, making it suitable for data scientists and ML engineers working on complex AI projects. @@ -96,13 +64,43 @@ Arize Phoenix is an open-source LLM observability tool that focuses on providing 2. **Evaluation** - Comprehensive tools for assessing LLM performance and output quality. 3. **Agent Tracing** - Visualize and analyze multi-step LLM interactions and workflows. -## How does Arize Phoenix compare to Helicone? +### How does Arize Phoenix compare to Helicone? Arize Phoenix excels in its evaluation capabilities and is well-suited for data scientists and ML engineers working on complex LLM projects. However, it lacks some of the developer-friendly features that Helicone offers, such as self-hosting options, user tracking, and user feedback collection. Arize Phoenix's pricing model may also be less flexible compared to Helicone's tiered approach. --- -### So, which LLM observability tool suits you better? +## Arize vs. Helicone: which LLM observability tool suits you better? + +Different tools excel in different scenarios. Here's a quick guide to help you choose the right tool for your specific needs: + +1. **Small Startup with Limited Budget** + + - Best Tool: Helicone + - Why: Offers a free tier and flexible pricing, making it accessible for startups + +2. **Large Enterprise with Complex Workflows** + + - Best Tool: Helicone + - Why: Robust evaluation capabilities and scalability for enterprise-level needs + +3. **Research Team Focused on Experimentation** + + - Best Tool: Helicone + - Why: Comprehensive experiment features and prompt management + +4. **Solo Developer Working on Side Projects** + + - Best Tool: Helicone + - Why: Easy integration and user-friendly interface + +5. **AI-Focused Company with High Volume LLM Usage** + - Best Tool: Helicone + - Why: Advanced caching, cost analysis, and scalability features + +--- + +## Bottom Line Both Helicone and Arize Phoenix offer powerful features for LLM observability, but they cater to slightly different audiences. Helicone's user-friendly approach, comprehensive feature set, and flexible pricing make it an excellent choice for a wide range of users, from solo developers to small and medium-sized teams. Its self-hosting options and advanced features like user tracking and feedback collection give it an edge in many scenarios. @@ -110,9 +108,30 @@ Arize Phoenix, on the other hand, shines in its evaluation capabilities and may Ultimately, the choice between Helicone and Arize Phoenix depends on your specific needs, team size, and the complexity of your LLM applications. For most users, especially those looking for an all-in-one solution with a gentle learning curve, Helicone appears to be the more versatile and accessible option. + + +### You might be interested in + +- + Comparing Langsmith vs Helicone + +- + Comparing Braintrust vs Helicone + +- + Comparing Langfuse vs Helicone + + --- -### Frequently Asked Questions +## Frequently Asked Questions 1. **Q: What is the main difference between Helicone and Arize Phoenix?** A: The main difference lies in their target audience and feature set. Helicone is more developer-friendly with features like self-hosting and user tracking, while Arize Phoenix focuses on robust evaluation tools for data scientists and ML engineers. @@ -128,3 +147,5 @@ Ultimately, the choice between Helicone and Arize Phoenix depends on your specif 5. 
**Q: How do these tools handle data privacy and security?** A: Both tools take data privacy seriously. Helicone offers self-hosting options for complete data control, while Arize Phoenix, being open-source, allows for scrutiny of its security practices. Always review the latest security features and compliance certifications when making your decision. + + diff --git a/bifrost/app/blog/blogs/best-datadog-alternative-for-llm/src.mdx b/bifrost/app/blog/blogs/best-datadog-alternative-for-llm/src.mdx index b3dca197c5..d03e852b63 100644 --- a/bifrost/app/blog/blogs/best-datadog-alternative-for-llm/src.mdx +++ b/bifrost/app/blog/blogs/best-datadog-alternative-for-llm/src.mdx @@ -4,17 +4,17 @@ **Datadog** has long been a favourite among developers for its monitoring and observability capabilities. It is most known for its logging, analytics, and performance data visualization across the entire stack of applications and infrastructure, and can handle incredible amounts of data at scale. -But recently, Large Language Models (LLM) developers have been exploring open-source observability options. Why? We have some answers. +But recently, Large Language Models (LLM) developers have been exploring open-source observability options. Why? We have some answers. -### 1. Datadog Pricing Surge with Scale +## 1. Datadog Pricing Surge with Scale -Developers working in large companies have been complaining about the **increasing costs and bill shocks with Datadog** as their companies scale. Many are turning to open-source alternatives to control their overall costs on monitoring infrastructure. +Developers working in large companies have been complaining about the **increasing costs and bill shocks with Datadog** as their companies scale. Many are turning to open-source alternatives to control their overall costs on monitoring infrastructure. -### 2. The Need for Specialized LLM Observability Tools +## 2. The Need for Specialized LLM Observability Tools -**Speed is key in the GenAI space.** As LLMs continue to make their way into different industries, traditional monitoring is no longer enough. New problems require new solutions in order to facilitate LLM workflows, manage and experiment with prompts, and fine-tune models. Among the LLM observability tools, **Helicone is the one of the best option that developers today are choosing. ** +**Speed is key in the GenAI space.** As LLMs continue to make their way into different industries, traditional monitoring is no longer enough. New problems require new solutions in order to facilitate LLM workflows, manage and experiment with prompts, and fine-tune models. Among the LLM observability tools, **Helicone is the one of the best option that developers today are choosing. ** -**Customers love us for:** +### Customers love Helicone for: - our dead easy integration - our flat learning curve @@ -23,10 +23,9 @@ Developers working in large companies have been complaining about the **sign up. +To integrate with Helicone, you only need to modify two lines of code (we weren't joking earlier). Here’s where you can sign up. ![2-lines of code to integrate with Helicone](/static/blog/datadog/code.webp) @@ -34,23 +33,23 @@ Don't worry, We walk you through this step during onboarding. Let's dive into th ## 3 Most Useful Features in LLM Observability Tools -### 1. Create Prompt Templates, Run Experiments & Compare LLM Outputs +### 1. 
Prompt Templates, Experiments and LLM Evaluations -**Testing prompts is hard, but critical.** Consider this Tweet on how even minimal changes can significantly affect the accuracy of your output with **GPT-3.5 Turbo** and **Llama-2-70b 5-shot**. Helicone makes it easy to maintain and version your prompts, run experiments on prompt variations, and compare key metrics (token count, cost, accuracy, etc.) based on the LLM outputs. +**Testing prompts is hard, but critical.** Consider this tweet on how even minimal changes can significantly affect the accuracy of your output with **GPT-3.5 Turbo** and **Llama-2-70b 5-shot**. Helicone makes it easy to maintain and version your prompts, run experiments on prompt variations, and compare key metrics (token count, cost, accuracy, etc.) based on the LLM outputs. ![Create Prompt Templates, Run Experiments and Compare LLM Outputs](/static/blog/datadog/prompt_template.webp) -**Why our developers love this feature:** +#### Why developers love this feature: - Create as many prompt versions as you want **without impacting production data**. - Evaluate the LLM outputs of new prompts (**and have data to back you up** 📈) - Test a prompt with specific datasets to save costs by making fewer calls to providers like OpenAI 🤑 - Have **full ownership of your prompts**. Unlike many other prompt evaluation tools that require you to store the prompt with them, prompts are stored in your code when using Helicone. -**You might find this useful:** +#### You might find this useful: -- Docs: How to create prompt templates in Helicone. -- Guide: How to debug prompts in the Playground. +- Docs: How to create prompt templates in Helicone. +- Guide: How to debug prompts in the Playground. ### 2. Use Custom Properties to Segment Requests @@ -58,60 +57,76 @@ Don't worry, We walk you through this step during onboarding. Let's dive into th ![Add custom properties to requests](/static/blog/datadog/request.webp) -**Why our developers love this feature:** +#### Why developers love this feature: - Get the “**unit economics**”. For example, the average cost of a conversation. - Slice and dice your requests and metrics by any custom property to pinpoint errors. - Get the the total cost or latency for a group of requests in a prompt chain - Segment requests by different environments or different use-cases -**You might find this useful:** +#### You might find this useful: -- Docs: Using Custom Properties in Helicone. -- Blog: How to understand your users better with Custom Properties. +- Docs: Using Custom Properties in Helicone. +- Blog: How to understand your users better with Custom Properties. ### 3. Save LLM Costs by Caching on the Edge 🧗 -Helicone uses Cloudflare Workers in order to live as close to the edge as possible - guaranteeing minimal latency impact for features like caching. We also precompute results or frequently accessed data to reduce the load on backend resources. +Helicone uses Cloudflare Workers in order to live as close to the edge as possible - guaranteeing minimal latency impact for features like caching. We also precompute results or frequently accessed data to reduce the load on backend resources. **By using cached responses, one of our customer was able to save 386 hours in response time**. ![Caching on the edge](/static/blog/datadog/caching.webp) -**Why our developers love this feature:** +#### Why developers love this feature: - Faster response, low latency means you can develop your app more efficiently. 
- You save money by making fewer calls to OpenAI and other models. - **Better user experience for your users** -**You might find this useful:** -- Docs: How to Cache Responses in Helicone. +#### You might find this useful: + +- **How to Cache Responses in Helicone**. +- **5 Powerful Techniques to Slash Your LLM Costs by Up to 90%**. ## We use Datadog, too -Datadog is designed for infrastructure and application performance monitoring, rather than specifically for LLM observability needs. Our engineers at Helicone use and love it for error detection, setting up alerts, and aggregating metrics across our infrastructure. One key difference is that Datadog is ideal for technical users, whereas Helicone is intuitive to both technical and non-technical users. +Datadog is designed for infrastructure and application performance monitoring, rather than specifically for LLM observability needs. Our engineers at Helicone use and love it for error detection, setting up alerts, and aggregating metrics across our infrastructure. One key difference is that Datadog is ideal for technical users, whereas Helicone is intuitive to both technical and non-technical users. -### Comparing Helicone vs. Datadog Pricing +### Comparing Helicone vs. Datadog Pricing -We’ve observed that some customers find Datadog to be expensive, especially as usage scales up or if they're not utilizing all features efficiently. At Helicone, we take pride in our scalable solutions where you only pay for what you use. +We’ve observed that some customers find Datadog to be expensive, especially as usage scales up or if they're not utilizing all features efficiently. At Helicone, we take pride in our scalable solutions where you only pay for what you use. ## We believe in making your life easy -**Simplified Integration** +### 1-line Integration -Our platform is simple to set up and configure. No installation is required. +Helicone's platform is simple to set up and configure. No installation is required, integrate in seconds with just 1 line of code. -**Open-Source, Open Transparency** +### Open-Source, Open Transparency -Helicone is open-source, which allows us to stay attuned to the needs of the developer community and address a broad range of use cases and challenges. Being open sources also allows us to build Helicone that integrates seamlessly with your existing tech stack. +Helicone is open-source, which allows us to stay attuned to the needs of the developer community and address a broad range of use cases and challenges. Being open sources also allows us to build Helicone that integrates seamlessly with your existing tech stack. We've open-sourced our LLM stats and API pricing calculator. -**Gateway** +### Gateway -Helicone is a Gateway that gives you access to caching, rate limiting, API key management, and many middleware and advanced features. +Helicone is a Gateway that gives you access to caching, rate limiting, API key management, and many middleware and advanced features. -**Developer Experience** +### Developer Experience We've worked hard to minimize the learning curve for new users, and we are committed to providing the best customer support. ## Could we (Helicone) be a good fit for you? -We’re always happy to answer your questions. Want to chat with the founders? Schedule a call with us. \ No newline at end of file +We’re always happy to answer your questions. Want to chat with the founders? Schedule a call with us. 
+ +### You might be interested in + +- + Comparing Langsmith vs Helicone + +- + Comparing Braintrust vs Helicone + +- + Comparing Arize AI vs Helicone + + + diff --git a/bifrost/app/blog/blogs/best-langfuse-alternatives/src.mdx b/bifrost/app/blog/blogs/best-langfuse-alternatives/src.mdx index 35a07c004d..7e632c531f 100644 --- a/bifrost/app/blog/blogs/best-langfuse-alternatives/src.mdx +++ b/bifrost/app/blog/blogs/best-langfuse-alternatives/src.mdx @@ -151,8 +151,29 @@ However, as data volume increases, PostgreSQL may face performance limitations. Choosing the right LLM observability tool depends on your specific needs and priorities. **Helicone** offers a scalable, feature-rich platform ideal for applications ranging from startups to large enterprises, especially where high performance and advanced analytics are required. **Langfuse** provides a simpler, self-hosted solution suitable for smaller teams or low volume projects that prioritize managing their own infrastructure. ---- - -## Other Helicone vs Langfuse Comparisons - -- Langfuse has its own comparison against Helicone live on [their website](https://langfuse.com/faq/all/best-helicone-alternative) + + +### You might be interested in + +- + Comparing Langsmith vs Helicone + +- + Comparing Braintrust vs Helicone + +- + Comparing Arize AI vs Helicone + + +### Other Helicone vs Langfuse Comparisons + +- Langfuse has its own comparison against Helicone live on their website. + + diff --git a/bifrost/app/blog/blogs/best-langsmith-alternatives/src.mdx b/bifrost/app/blog/blogs/best-langsmith-alternatives/src.mdx index 8c05e3d2a0..caea5c7201 100644 --- a/bifrost/app/blog/blogs/best-langsmith-alternatives/src.mdx +++ b/bifrost/app/blog/blogs/best-langsmith-alternatives/src.mdx @@ -1,21 +1,17 @@ - -Observability tools like LangSmith allow developers to monitor, analyze, and optimize AI model performance, which helps overcome the "black box" nature of LLMs. +Observability tools like LangSmith allow developers to monitor, analyze, and optimize AI model performance, which helps overcome the "black box" nature of LLMs. However, as LangSmith's on-premise version becomes expensive, users are seeking more flexible alternatives. **But which LangSmith alternative is the best in 2024?** We will shed some light. ![Comparing LangSmith Alternatives in July 2024](/static/blog/best-langsmith-alternatives/langsmith-cover.webp) +## Top LLM observability tools (updated July 2024) - -### Top LLM observability tools (updated July 2024) 1. Helicone 2. Phoenix by Arize 3. Langfuse 4. HoneyHive 5. OpenLLMetry by Traceloop ---- - ## LangSmith Competitors Overview | Feature | LangSmith | Helicone | Phoenix by Arize AI | Langfuse | HoneyHive | OpenLLMetry by Traceloop | @@ -38,7 +34,7 @@ However, as LangSmith's on-premise version becomes expensive, users are seeking --- -# 1. Helicone +## 1. Helicone **Designed for: developers & analysts** @@ -46,86 +42,86 @@ However, as LangSmith's on-premise version becomes expensive, users are seeking ![Track requests and agent workflows in real-time on Helicone's Request page](/static/blog/best-langsmith-alternatives/helicone-request-table.webp) +### What is Helicone? -## What is Helicone? -Helicone is an open-source LLM observability and monitoring platform purpose-built for developers to monitor, debug, and optimize their LLM applications. With the flexibility to be self-hosted or used as a gateway with a simple **1-line integration**, it provides instant insights into latency, costs, time to first tokens (TTFT) and more. 
- +Helicone is an open-source LLM observability and monitoring platform purpose-built for +developers to monitor, debug, and optimize their LLM applications. With the flexibility +to be self-hosted or used as a gateway with a simple **1-line integration**, it provides instant insights into latency, costs, time to first tokens +(TTFT) and more. ### Top features -1. **Sessions** - Group, track and visualize multi-step agent workflows and LLM interactions. -2. **Prompts & Experiments** - Version and test prompts, then compare outputs before going in production. -3. **Custom properties** - Segment data to understand your users better. -## How does Helicone compare to LangSmith? +1. **Sessions** - Group, track and visualize multi-step agent workflows and LLM interactions. +2. **Experiments** - Prevent prompt regression by testing prompts variations with production data and evaluating outputs. +3. **Prompts** - Version and test prompts, then compare outputs before pushing to production. +4. **Custom properties** - Segment data to understand your users better. + +### How does Helicone compare to LangSmith? | | LangSmith | Helicone | | :--------------------- | :-------- | :------- | | Open-source | - | ✅ | | Self-hosted | - | ✅ | | Prompt Templating | ✔ | ✅ | -| Agent Tracing | ✔ | ✅ | -| Experiments | ✔ | ✅ | -| Cost Analysis | ✔ | ✅ | -| Evaluation | ✔ | - | -| User Tracking | ✔ | ✅ | -| Feedback Tracking | ✔ | ✅ | -| LangChain Integration | ✔ | ✅ | -| Flexible Pricing | - | ✅ | -| Image support | - | ✅ | -| No payload limitations | - | ✅ | +| Agent Tracing | ✔ | ✅ | +| Experiments | ✔ | ✅ | +| Cost Analysis | ✔ | ✅ | +| Evaluation | ✔ | - | +| User Tracking | ✔ | ✅ | +| Feedback Tracking | ✔ | ✅ | +| LangChain Integration | ✔ | ✅ | +| Flexible Pricing | - | ✅ | +| Image support | - | ✅ | +| No payload limitations | - | ✅ | | Dashboard | ✔ | ✅ | -| Data Export | ✔ | ✅ | - +| Data Export | ✔ | ✅ | LangSmith currently primarily focuses on text-based LLM applications, with extensive tools for testing, monitoring, and debugging these applications, while Helicone offers support for **text and image inputs and outputs.** - -## Why are companies choosing Helicone? +### Why are companies choosing Helicone? **Open-Source & Self-Hosting** -Helicone is fully open-source and free to start. Companies can also self-host Helicone within their infrastructure. This ensures that you have full control over the application, flexibility and customization tailored to specific business needs. On the other hand, the self-host option is only available for users on enterprise plan for LangSmith. - +Helicone is fully open-source and free to start. Companies can also self-host Helicone within their infrastructure. This ensures that you have full control over the application, flexibility and customization tailored to specific business needs. Compare to LangSmith, the self-host option is only available for users on enterprise plan. **Cost-Effective** -Helicone is also more cost-effective than LangSmith as it operates on a volumetric pricing model. This means companies only pay for what they use (**while the first 100k requests every month are free),** which makes Helicone an easy and flexible platform for businesses to get started and scale their applications. +Helicone is also more cost-effective than LangSmith as it operates on a volumetric pricing model. 
This means companies only pay for what they use **(while the first 10k requests every month are free),** which makes Helicone an easy and flexible platform for businesses to get started and scale their applications. **Scalable & Reliable** -Helicone can also handle a large volume of requests, making it a dependable option for businesses with high traffic. Acting as a Gateway, Helicone offers a suite of both middleware and advanced features such as caching, prompt thread detection and vaults to securely share API keys. +Helicone can also handle billions of requests, making it a dependable option for businesses with high traffic. Acting as a Gateway, Helicone offers a suite of both middleware and advanced features such as caching, prompt thread detection and vaults to securely share API keys. +Companies that are highly responsive to market changes or opportunities often use Helicone to achieve production quality faster. -Companies that are highly responsive to market changes or opportunities often use Helicone to achieve production quality faster. - - -## Bottom Line - -If you need something that "just works" so you can get back to shipping features, Helicone has the core functionality to help you get started instantly. +### Bottom Line +If you need something that "just works" so you can get back to shipping features, Helicone has the core functionality to help you get started instantly. -**→ Start for free** +**→ Start for free** --- -# 2. Phoenix by Arize AI +## 2. Phoenix by Arize AI **Designed for: ML engineers & ML Ops team** ![Arize AI dashboard](/static/blog/best-langsmith-alternatives/arize-ai-dashboard.webp) +### What is Arize AI? -## What is Arize AI? - -Phoenix by Arize AI is known for its strong focus on machine learning model monitoring and explainability. If your company prioritizes understanding model performance in production, detecting model drift, and getting detailed explanations of model predictions, Arize AI might be the better choice. +Phoenix by Arize AI is known for its strong focus on machine learning model monitoring and explainability. +If your company prioritizes understanding model performance in production, detecting +model drift, and getting detailed explanations of model predictions, Arize AI might +be the better choice. -**Top features** +### Top features 1. **Evaluations** - Judge the quality of your LLM outputs on relevance, hallucination %, and latency. 2. **Traces** - Get visibility into the lifecycle of predictions, monitor and analyze performance, identify root cause for machine learning models. 3. **Datasets & Experiments** - Understand how a change will affect performance, and test on a specific dataset. -## How does Arize AI compare to LangSmith? +### How does Arize AI compare to LangSmith? | Feature | LangSmith | Phoenix by Arize AI | | :--------------------- | :-------- | :------------------ | @@ -145,8 +141,7 @@ If you need something that "just works" so you can get back to shipping features | Dashboard | ✔ | ✔ | | Data Export | ✔ | ✔ | - -## Why are companies choosing Arize AI? +### Why are companies choosing Arize AI? **Machine Learning Observability** @@ -154,13 +149,12 @@ Arize AI specializes in real-time monitoring and performance optimization of mac **Ease of Integration** -Arize AI stands out for its support for integration with various machine learning frameworks to help streamline the process of setting up and monitoring models. It also provides visualizations and analytics to help understand model behaviors and impact. 
+Arize AI stands out for its support for integration with various machine learning frameworks to help streamline the process of setting up and monitoring models. It also provides visualizations and analytics to help understand model behaviors and impact. **Designed for ML Engineers & ML Ops Team** Arize AI attracts users who need robust ML monitoring, explainability, and scalability, primarily data scientists, ML engineers, and ML ops teams, whereas LangSmith appeals to software engineers, content creators, and researchers who are focused on building and applying language models in different contexts. - ### Bottom Line For developers focused on enhancing model performance, Arize AI stands out due to its capabilities in monitoring and analyzing model performance. However, it's worth noting that Arize AI's emphasis may not include traditional user feedback tracking, such as gathering user comments or sentiment. @@ -173,17 +167,20 @@ For developers focused on enhancing model performance, Arize AI stands out due t ![Langfuse Traces view](/static/blog/best-langsmith-alternatives/langfuse-traces.webp) -## What is Langfuse? +### What is Langfuse? -Langfuse is an open-source LLM Engineering Platform that helps to trace & debug LLM models. It provides observability, metrics, evals, prompt management and a playground and to debug and improve LLM apps. +Langfuse is an open-source LLM Engineering Platform that helps to trace & debug LLM models. +It provides observability, metrics, evals, prompt management and a playground and +to debug and improve LLM apps. -**Top features** +### Top features 1. **Tracing** - made for agents & LLM chains. You can trace unlimited nested actions and get a detailed view of the entire request, including non-LLM actions such as database queries, API calls that lead to the response for optimal visibility into issues. 2. **Scoring production traces** - measuring quality with user feedback, model-based evaluation, manual labelling and others. 3. **Montioring and Logging** - detailed logging to track all interactions with the language model, error tracking for debugging, and usage analytics to optimize deployment. -## How does Langfuse compare to LangSmith? +### How does Langfuse compare to LangSmith? + | Feature | LangSmith | Langfuse | | :--------------------- | :-------- | :------- | | Open-source | - | ✔ | @@ -202,52 +199,49 @@ For developers focused on enhancing model performance, Arize AI stands out due t | Dashboard | ✔ | ✔ | | Data Export | ✔ | ✔ | - -## Why are companies choosing Langfuse? +### Why are companies choosing Langfuse? **Open-Source Flexibility** - -Langfuse is open-source, which means it offers flexibility for customization and adaptation to specific organizational needs without vendor lock-in. - + +Langfuse is open-source, which means it offers flexibility for customization and adaptation to specific organizational needs without vendor lock-in. + **Cost-Effectiveness** - + Langfuse can be more cost-effective compared to LangSmith, which requires investment in enterprise plans for full feature access and support. - + **Framework-agnostic tracing capabilities** - -Langfuse offers comprehensive tracing capabilities that are model and framework agnostic. 
It allows for capturing the full context of LLM applications, including complex and chained calls, which simplifies debugging and pinpointing issues across extended control flows, while specific features like automated instrumentation for frameworks may require additional setup or integration effort using LangSmith. - -### Bottom Line +Langfuse offers comprehensive tracing capabilities that are model and framework agnostic. It allows for capturing the full context of LLM applications, including complex and chained calls, which simplifies debugging and pinpointing issues across extended control flows, while specific features like automated instrumentation for frameworks may require additional setup or integration effort using LangSmith. -Langfuse is a good choice for teams looking to improve their LLM applications with a simple and cost-effective tool, but may be limited for larger teams who want a scalable solution or enterprise features. +### Bottom Line +Langfuse is a good choice for teams looking to improve their LLM applications with a simple and cost-effective tool, but may be limited for larger teams who want a scalable solution or enterprise features. --- - # 4. HoneyHive **Designed for: developers & analysts** ![HoneyHive AI Dashboard](/static/blog/best-langsmith-alternatives/honeyhive-dashboard.webp) +### What is HoneyHive? -## What is HoneyHive? - -HoneyHive AI evaluates, debugs, and monitors production LLM applications. It lets you trace execution flows, customize event feedback, and create evaluation or fine-tuning datasets from production logs. +HoneyHive AI evaluates, debugs, and monitors production LLM applications. It lets you trace +execution flows, customize event feedback, and create evaluation or fine-tuning datasets +from production logs. It is built for teams who want to build reliable LLM products because it focuses on observability through performance tracking. -**Top features** +### Top features 1. **Trace** - Log all AI application data to debug execution steps as you iterate. -2. **Evaluate** - Evaluations SDK for flexible offline evaluations across various LLM applications +2. **Evaluate** - Evaluations SDK for flexible offline evaluations across various LLM applications 3. **Annotate Logs** - Involve domain experts to review and annotate logs. HoneyHive's tracing functionality includes support for multi-modal data, which encompasses image processing. This feature allows you to trace functions that handle various types of data, including images. -## How does HoneyHive compare to LangSmith? +### How does HoneyHive compare to LangSmith? | Feature | LangSmith | HoneyHive | | :--------------------- | :-------- | :-------- | @@ -267,33 +261,33 @@ HoneyHive's tracing functionality includes support for multi-modal data, which e | Dashboard | ✔ | ✔ | | Data Export | ✔ | ✔ | +### Bottom Line -## Bottom Line - -HoneyHive provides access to 100+ open-source models in their Playground through integrations for testing purpose. However, if you want a solution that allows you to plug and play, it's worth it to look into other solutions. - +HoneyHive provides access to 100+ open-source models in their Playground through integrations for testing purpose. However, if you want a solution that allows you to plug and play, it's worth it to look into other solutions. --- - -# 5. OpenLLMetry by Traceloop +## 5. 
OpenLLMetry by Traceloop **Designed for: developers & analysts** ![Traceloop Traces view](/static/blog/best-langsmith-alternatives/traceloop-traces.webp) -## What is OpenLLMetry? +### What is OpenLLMetry? -OpenLLMetry is an open-source framework developed by **Traceloop**, that simplifies the process of monitoring and debugging Large Language Models. It is built on top of OpenTelemetry, ensuring non-intrusive tracing and seamless integration with leading observability platforms and backends like [KloudMate](https://docs.kloudmate.com/openllmetry-opentelemetry-based-observability-for-llms). +OpenLLMetry is an open-source framework developed by **Traceloop**, that simplifies the +process of monitoring and debugging Large Language Models. It is built on top of +OpenTelemetry, ensuring non-intrusive tracing and seamless integration with leading +observability platforms and backends like KloudMate. OpenLLMetry aims to standardize the collection of mission-critical LLM metrics, spans, traces, and logs through OpenTelmetry. -**Top features** +### Top features 1. **Tracing** - Traceloop SDK supports several ways to annotate workflows, tasks, agents and tools in your code to get a more complete picture of your app structure. 2. **Prompt versioning** - User feedback → Simply log a user feedback on the result of your LLM workflow by calling Traceloop's Python SDK or Typescript SDK to. -## How does Traceloop compare to LangSmith? +### How does Traceloop compare to LangSmith? | Feature | LangSmith | OpenLLMetry by Traceloop | | :--------------------- | :-------- | :----------------------- | @@ -313,8 +307,10 @@ OpenLLMetry aims to standardize the collection of mission-critical LLM metrics, | Dashboard | ✔ | ✔ | | Data Export | ✔ | - | -## Bottom Line +### Bottom Line + +Traceloop focuses on evaluation and the pricing reflects that, thus can become expensive as your application scale to log more traces. -Traceloop focuses on evaluation and the pricing reflects that, thus can become expensive as your application scale to log more traces. +#### So, which LangSmith alternatives suits you better? -#### So, which LangSmith alternatives suits you better? \ No newline at end of file + diff --git a/bifrost/app/blog/blogs/braintrust-alternatives/src.mdx b/bifrost/app/blog/blogs/braintrust-alternatives/src.mdx index ab26b53896..f16faa5b84 100644 --- a/bifrost/app/blog/blogs/braintrust-alternatives/src.mdx +++ b/bifrost/app/blog/blogs/braintrust-alternatives/src.mdx @@ -170,16 +170,33 @@ While Braintrust shines as an advanced evaluation suite, it lacks a dashboard an Helicone provides a one-line integration and supports various popular tools. Braintrust may require more effort due to its lack of simple integration methods and potentially confusing UI. ---- - ## Conclusion -Choosing between Helicone and Braintrust depends on your project's priorities. If you require a scalable, **feature-rich platform** with comprehensive observability, advanced analytics, and an intuitive interface that supports all team sizes and use cases, **Helicone** is the superior choice. While Helicone is in the process of adding built-in advanced evaluation features, **it currently allows you to post custom evaluation results via its API**, accommodating bespoke evaluation needs. For enterprise projects that prioritize advanced evaluations, **Braintrust** offers robust tools but may not be fully comprehensive for all needs. +Choosing between Helicone and Braintrust depends on your priorities. 
---- +If you require a scalable, **feature-rich platform with comprehensive observability**, and an intuitive interface that supports all team sizes and use cases, **Helicone** is the superior choice. -For further reading, check out our previous comparison: [Langfuse Alternatives? Langfuse vs Helicone](/blog/best-langfuse-alternatives). +For enterprise projects that prioritize advanced evaluations, **Braintrust** offers robust tools but may not be fully comprehensive for all needs. While Helicone is in the process of adding built-in advanced evaluation features, it currently allows you to post custom evaluation results via its API, accommodating bespoke evaluation needs. ---- + + +### You might be interested in + +- + Comparing Langfuse vs Helicone + +- + Comparing Portkey vs Helicone + +- + Comparing Langsmith vs Helicone + -**Ready to enhance your LLM observability and scalability? [Get started with Helicone](https://helicone.ai/signup) for free today**. + diff --git a/bifrost/app/blog/blogs/building-an-llm-stack/src.mdx b/bifrost/app/blog/blogs/building-an-llm-stack/src.mdx index ef8b859e38..35aba1d605 100644 --- a/bifrost/app/blog/blogs/building-an-llm-stack/src.mdx +++ b/bifrost/app/blog/blogs/building-an-llm-stack/src.mdx @@ -2,16 +2,13 @@ Figuring out the right tech stack can be challenging. This simplified guide illu At Helicone, we've observed thousands of LLM applications at various scales. This article generalizes the different stages that most applications typically go through. -We also wrote a [complementary blog](/blog/llm-stack-guide) that delves deeper into the LLM Stack and Helicone's role within it. +We also wrote a complementary blog that delves deeper into the LLM Stack and Helicone's role within it. -## Example: Evolution of a Chatbot +## Example: Evolution of a Chatbot - - Let's consider a simple internal chatbot designed to help employees of a small - business manage their inbox. - +Let's consider a simple internal chatbot designed to help employees of a small business manage their inbox. -### `Stage 1` - Basics +### Stage 1: The Basics Initially, you can simply copy and paste the last 10 emails into the context. @@ -35,37 +32,37 @@ User: What is the status of the order with the id 123456? ``` -### `Stage 2` - Observability +### Stage 2: Observability As your app gains popularity, you may find yourself spending $100 a day on OpenAI. At this stage, basic observability becomes essential. ![LLM Stack Example - Stage 2](/static/pasted_images/llm-stack-ex-stage-2.png) -### `Stage 3` - Scaling +### Stage 3: Scaling Users may complain that the chatbot only considers the last 10 emails. To address this, implement a Vector DB to store all emails and use embeddings to retrieve the 10 most relevant ones. ![LLM Stack Example - Stage 3](/static/pasted_images/llm-stack-ex-stage-3.png) -### `Stage 4` - Gateway +### Stage 4: Gateway To manage costs, you may need to rate-limit users and add a caching layer. This is where a gateway comes into play. ![LLM Stack Example - Stage 4](/static/pasted_images/llm-stack-ex-stage-4.png) -### `Stage 5` - Tools +### Stage 5: Tools Enhance functionality by adding tools that perform actions on behalf of users, such as marking emails as read or adding events to a calendar. ![LLM Stack Example - Stage 5](/static/pasted_images/llm-stack-ex-stage-5.png) -### `Stage 6` - Prompting +### Stage 6: Prompting Implement a robust prompt management solution to handle prompt versions for testing and observability. 
![LLM Stack Example - Stage 6](/static/pasted_images/llm-stack-ex-stage-6.png) -### `Stage 7` - Agents +### Stage 7: Agents Some actions may require multiple tool calls in a loop, where tools decide on the next action. This is where Agents come into play. @@ -73,20 +70,22 @@ Some actions may require multiple tool calls in a loop, where tools decide on th Agents are advanced integrations that operate within complex environments, allowing for sophisticated interactions through prompts instead of direct provider calls. -### `Stage 8` - Model Load Balancer +### Stage 8: Model Load Balancer As your application grows, different models may be better suited for specific tasks. A model load balancer can help distribute the workload effectively. ![LLM Stack Example - Stage 8](/static/pasted_images/llm-stack-ex-stage-8.png) -### `Stage 9` - Testing +### Stage 9: Testing To make data actionable, implement a testing framework that provides insights and evaluators to assess the quality of your model's outputs. ![LLM Stack Example - Stage 9](/static/pasted_images/llm-stack-ex-stage-9.png) -### `Stage 10` - Fine Tuning +### Stage 10: Fine Tuning Fine-tuning is typically employed for workloads requiring significant customization, especially when optimizing for specific problems or cost savings. ![LLM Stack Example - Stage 10](/static/pasted_images/llm-stack-ex-stage-10.png) + + diff --git a/bifrost/app/blog/blogs/claude-3.5-sonnet-vs-openai-o1/src.mdx b/bifrost/app/blog/blogs/claude-3.5-sonnet-vs-openai-o1/src.mdx index 9ff75758d0..06aa1aa9e8 100644 --- a/bifrost/app/blog/blogs/claude-3.5-sonnet-vs-openai-o1/src.mdx +++ b/bifrost/app/blog/blogs/claude-3.5-sonnet-vs-openai-o1/src.mdx @@ -1,8 +1,8 @@ -The market is more crowded than ever, with models offering a dizzying array of capabilities. If you’re building tools for coding, tackling complex reasoning problems, or optimizing workflows, two advanced models stand out: **Claude 3.5 Sonnet** and **OpenAI o1**. +The market is more crowded than ever, with models offering a dizzying array of capabilities. If you’re building tools for coding, tackling complex reasoning problems, or optimizing workflows, two advanced models stand out: **Claude 3.5 Sonnet** and **OpenAI o1**. ![A comprehensive comparison of Claude 3.5 Sonnet and OpenAI o1](/static/blog/claude-3.5-sonnet-vs-openai-o1/cover.webp) -In this guide, we will help you understand which model suits your needs best. We’ll break down their **performance, pricing, coding abilities, and special features** to give you a clear picture of what each model brings to the table. After all, the right choice can make a significant difference in both your productivity and budget. +In this guide, we will help you understand which model suits your needs best. We’ll break down their **performance, pricing, coding abilities, and special features** to give you a clear picture of what each model brings to the table. After all, the right choice can make a significant difference in both your productivity and budget. ## Why Compare Claude 3.5 Sonnet and OpenAI o1? @@ -21,7 +21,7 @@ Choosing the right model can mean the difference between saving hours on coding --- -# Core Differences Between Claude 3.5 Sonnet and o1 +## Key Differences Between Claude 3.5 Sonnet and o1 **OpenAI o1 is built for complex reasoning and problem-solving**. Its deep, thoughtful responses make it perfect for developers working on intricate issues or requiring detailed explanations. 
o1 excels in tasks that demand precision and depth, such as advanced mathematics or scientific analysis. @@ -29,7 +29,7 @@ In contrast, **Claude 3.5 Sonnet focuses on spe Let’s dive deeper into the performance, cost and speed. -## 1. Performance: Coding, Debugging, and Advanced Reasoning +### 1. Performance: Coding, Debugging, and Advanced Reasoning In general, Claude’s strength lies in rapid generation and its simplicity. OpenAI’s o1 is better for deep reasoning and debugging. @@ -40,7 +40,7 @@ In general, Claude’s strength lies in rapid generation and its simplicity. Ope | **Advanced Reasoning** | ✅ Handles general reasoning tasks effectively but focuses more on productivity and speed. | ✅ Excels in deep reasoning and problem-solving, good for scientific research and advanced mathematics. | | **Example Use Cases** | 1. Generating setup files for 50+ APIs in short time.

2. Detecting overlooked security vulnerabilities and debugging nested functions. | 1. Debugging React state management issues and legacy code refactoring.

2. Using o1's to explain algorithmic issues for debugging. | -## 2. Cost Efficiency +### 2. Cost Efficiency **Claude 3.5 Sonnet is 4x cheaper than OpenAI o1**. It’s ideal for budget-conscious users who need reliable performance for everyday coding tasks, whereas o1 is best for users tackling high-value projects where advanced reasoning and context retention justify the cost. @@ -59,7 +59,7 @@ In general, Claude’s strength lies in rapid generation and its simplicity. Ope secondaryButtonLink="https://www.helicone.ai/llm-cost/provider/anthropic/model/claude-3-5-sonnet-20241022" /> -## 3. Context Window and Speed +### 3. Context Window and Speed **Claude 3.5 Sonnet can handle about 150% more tokens compared to OpenAI o1**, giving it an edge for tasks requiring extensive context retention. The size of a context window is essential in determining how well AI models manage large inputs or extended conversations. @@ -72,13 +72,13 @@ In general, Claude’s strength lies in rapid generation and its simplicity. Ope --- -# What’s new in the upgraded Claude 3.5 Sonnet - October 2024 +## What’s new in the upgraded Claude 3.5 Sonnet - October 2024 On October 22, 2024, Anthropic released a new version of Claude 3.5 Sonnet, improving its SWE-bench Verified benchmark score from `33.4%` to `49.0%`, surpassing all publicly available models including OpenAI's o1-preview at 41.0%. The upgraded model shows wide-ranging improvements across various industry benchmarks, particularly in coding and tool use tasks. Anthropic also introduced Computer Use capability that allows Claude 3.5 Sonnet to perform tasks by interacting with user interfaces (UI), such as generating keystrokes and mouse clicks. -## Model Benchmarks +### Model Benchmarks Benchmarks help measure speed, accuracy, and efficiency between models. Let's look at real-world benchmarks to understand how o1 and Claude 3.5 Sonnet performs in practical applications. @@ -97,13 +97,13 @@ _💡 Keep in mind that o1 uses more compute to achieve its advanced reasoning c --- -# Comparing Claude 3.5 Sonnet and o1 on Coding Tasks +## Comparing Claude 3.5 Sonnet and o1 on Coding Tasks We will now look at some examples of how Claude 3.5 Sonnet and o1 perform on generating simple functions, debugging, and writing unit tests. ![Comparing Claude 3.5 Sonnet and o1 on Coding Tasks](/static/blog/claude-3.5-sonnet-vs-openai-o1/benchmark-comparisons.webp) -## Example 1: Generating a Simple Function +### Example 1: Generating a Simple Function **Prompt:** Write a Python function that takes a list of integers and returns the sum of all even numbers in the list. The function should handle empty lists and lists containing only one element. @@ -114,11 +114,11 @@ We will now look at some examples of how Claude 3.5 Sonnet and o1 perform on gen | **Claude 3.5 Sonnet** | Used a simple loop to iterate through the list, with clear variable names and minimal comments. Clean and straightforward solution. | | **OpenAI o1** | Addressed all the requirements and handled edge cases well (i.e.empty lists and lists with one element). The code was well-structured. | -### Which model performed better? +#### Which model performed better? Claude 3.5 Sonnet provided a simpler and more elegant solution, while OpenAI o1's approach was more thorough with edge case handling. -## Example 2: Debugging Code +### Example 2: Debugging Code **Prompt:** Debug this JavaScript function to remove all vowels from a string. 
@@ -135,11 +135,11 @@ function removeVowels(str) { | **Claude 3.5 Sonnet** | Clearly pinpointed the mistake and provided a straightforward and simple-to-understand solution. | | **OpenAI o1** | Quickly identified the issue and offered an efficient solution that addresses the problem directly without adding unnecessary complexity. | -### Which model performed better? +#### Which model performed better? Both models showed strengths in debugging. OpenAI o1 is direct, while Claude 3.5 Sonnet provided a more user-friendly solution. This aligns with reports that o1 excels in complex problem-solving while Claude is preferred for simpler tasks. -## Example 3: Writing Unit Tests +### Example 3: Writing Unit Tests **Prompt:** Develop a set of unit tests for a function that takes a list of strings as input and returns a new list containing only the strings that are palindromes. The function should handle empty lists and lists containing only one element. @@ -150,7 +150,7 @@ Both models showed strengths in debugging. OpenAI o1 is direct, while Claude 3.5 | **Claude 3.5 Sonnet** | Produced a comprehensive set of unit tests covering all edge cases such as empty lists and single-element lists. | | **OpenAI o1** | o1 also generated a solid set of unit tests, but lacked depth in addressing critical edge cases such as lists with multiple palindrome and non-palindrome strings. | -### Which model performed better? +#### Which model performed better? This test indicates that Claude 3.5 Sonnet is more efficient in generating thorough unit tests, while o1's tests are functional but could be improved. @@ -164,7 +164,7 @@ This test indicates that Claude 3.5 Sonnet is more efficient in generating thoro /> --- -# Choosing the Right AI Model +## Choosing the Right AI Model To select your ideal model, consider how their unique features help you meet your requirements. @@ -185,20 +185,16 @@ OpenAI's - O1 (and ChatGPT Pro) — here's everything you need to know +- + OpenAI Unveils New O3 Model: What Is It and How Is It Different from O1? - ---- - -## Questions or feedback? - -Are the information out of date? Please raise an issue and we'd love to hear your insights! + diff --git a/bifrost/app/blog/blogs/cole-github-copilot/src.mdx b/bifrost/app/blog/blogs/cole-github-copilot/src.mdx index 05bfb361d5..2e1160411e 100644 --- a/bifrost/app/blog/blogs/cole-github-copilot/src.mdx +++ b/bifrost/app/blog/blogs/cole-github-copilot/src.mdx @@ -1,37 +1,39 @@ -GitHub Copilot is a quintessential development tool that our team absolutely loves. This week, Helicone's co-founder, Cole, shares how he uses Copilot in daily operations, and the most counterintuitive principle he learned while running a startup. - -**No BS, no affiliations.** Just genuine opinions from Helicone's co-founder. - + + GitHub Copilot + is a quintessential development tool that our team absolutely loves. This week, +Helicone's co-founder, Cole, shares how he uses Copilot in daily operations, and +the most counterintuitive principle he learned while running a startup.{" "} + +**No BS, no affiliations.** Just genuine opinions from Helicone's co-founder. ![Our co-founder's Take on GitHub Copilot](/static/blog/cole-copilot.webp) - ## An Intro -Two weeks ago, on Microsoft's earnings call, CEO Satya Nadella announced that GitHub Copilot has 1.8 million paying subscribers. But it doesn't stop there - Copilot's growth is accelerating. In just a year, it has doubled its user base, **from 45% of Fortune 100 companies that were using GitHub Copilot to now 90%**. 
- +Two weeks ago, on Microsoft's earnings call, CEO Satya Nadella announced that GitHub Copilot has 1.8 million paying subscribers. But it doesn't stop there - Copilot's growth is accelerating. In just a year, it has doubled its user base, **from 45% of Fortune 100 companies that were using GitHub Copilot to now 90%**. +## Why do you love GitHub Copilot? -## Why do you love GitHub Copilot? +As a developer, I need tools to be intuitive, fast, and most importantly, built into my workflow. GitHub Co-pilot significantly speeds up my development time. In addition: -As a developer, I need tools to be intuitive, fast, and most importantly, built into my workflow. GitHub Co-pilot significantly speeds up my development time. In addition: -1. I don't need to remember all syntax specifics - more language-agnostic. -2. I don't need to type out boilerplate code. +1. I don't need to remember all syntax specifics - more language-agnostic. +2. I don't need to type out boilerplate code. 3. I find it helpful for ideating. If I feel stuck, it often suggests ideas that get my brain going. +## Share the backstory of how Helicone discovered Copilot. -## Share the backstory of how Helicone discovered Copilot. - -As a developer, GitHub Co-pilot naturally became part of my suite of tools. Some companies have concerns about protecting their intellectual property and are hesitant to store their code externally. However, as an open-source company, this is not a concern for us. - +As a developer, GitHub Co-pilot naturally became part of my suite of tools. Some companies have concerns about protecting their intellectual property and are hesitant to store their code externally. However, as an open-source company, this is not a concern for us. ## How does Helicone use Copilot day-to-day? -We use GitHub Copilot frequently as it is integrated into our daily workflow. As a company, we provide access to Copilot for every developer on our team. The efficiency gains far outweigh the cost. +We use GitHub Copilot frequently as it is integrated into our daily workflow. As a company, we provide access to Copilot for every developer on our team. The efficiency gains far outweigh the cost. If you're working in a startup, I definitely recommend purchasing a GitHub Copilot subscription for your team. - ## How has Copilot transformed Helicone's workflow? - **Automate boilerplate code**. Or find the correct syntax for different programming languages, which significantly speeds up my development time. @@ -39,39 +41,35 @@ If you're working in a startup, I definitely recommend purchasing a GitHub Copil This feature would be a perfect addition to almost all products. For instance, having similar suggestions while typing in Notion would be amazing. - ## What are your tips to get the most out of Copilot? - **Accept shorter recommendations.** GitHub Copilot is best at handling boilerplate code that is quite obvious and seen often but may struggle with longer and more complicated logic that requires an understanding of different contexts. -- **Learn to prompt suggestions.** Sometimes GitHub Copilot doesn't always show suggestions when you need one, try typing a few characters to prompt it. For example, typing ` con ` will suggest ` console.log `, and adding `'` to `console.log('` will prompt it to write a message for the log and print out important variables. - - +- **Learn to prompt suggestions.** Sometimes GitHub Copilot doesn't always show suggestions when you need one, try typing a few characters to prompt it. 
For example, typing `con` will suggest `console.log`, and adding `'` to `console.log('` will prompt it to write a message for the log and print out important variables. ## Who would you recommend Copilot to? I recommend GitHub Copilot to every company and every developer. For open-source companies like Helicone, the efficiency gain from using GitHub Copilot is substantial, making it a worthwhile investment. - ## 🌶️ What's a hot take you have about Copilot? -People often argue that GitHub Copilot prevents developers from learning syntax. I think the future of programming lies in collaborating with AI and focusing more on higher-level logic than the intricacies of programming languages. +People often argue that GitHub Copilot prevents developers from learning syntax. I think the future of programming lies in collaborating with AI and focusing more on higher-level logic than the intricacies of programming languages. -While having a copilot is great, I'd recommend to not rely too heavily on Copilot. Instead: +While having a copilot is great, I'd recommend to not rely too heavily on Copilot. Instead: -- **Think critically.** Don't accept everything it suggests. Complex problems require a deeper understanding of context. +- **Think critically.** Don't accept everything it suggests. Complex problems require a deeper understanding of context. - **Use where it truly increases efficiency.** Like handling boilerplate code. - **Review the code carefully.** Aimlessly accepting recommnedations can decrease productivity and introduce more bugs. - - ## 🌶️ What's the most counterintuitive principle you've learned from working at a startup? -**Embrace bugs as part of the development process.** +**Embrace bugs as part of the development process.** -This is probably not something you're used to in a corporate setting where weeks are spent on extensive testing and QA. +This is probably not something you're used to in a corporate setting where weeks are spent on extensive testing and QA. At Helicone, we launch features in beta and get feedback quickly to make sure it's something that people want. This allows us to see how people are actually using the features and iterate based on real-world use cases, even if it means having potential bugs. Over-engineering, especially in the early stages, can often lead to a “perfect” product that ultimately no one asked for. -However, handling critical parts of the codebase is different. For critical components like our proxy service, we conduct rigorous testing. We barely release new code for it, wrap everything in try-catch blocks and use Cloudflare Workers to ensure reliability. As a result, we have been maintaining an uptime of `99.9999%` over the last year. +However, handling critical parts of the codebase is different. For critical components like our proxy service, we conduct rigorous testing. We barely release new code for it, wrap everything in try-catch blocks and use Cloudflare Workers to ensure reliability. As a result, we have been maintaining an uptime of `99.9999%` over the last year. For non-critical parts of the codebase, take the risk of having bugs. It's okay to be embarrassed about having bugs in beta releases. If people love your product enough, they're generally forgiving, especially if the bugs don't affect their core business operation. 
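To make the "wrap everything in try-catch" point concrete, here is a rough sketch of that defensive pattern in a Cloudflare Worker. This is an illustrative example only, not Helicone's actual proxy code — the upstream host and error payload are placeholders.

```typescript
// Illustrative sketch of a defensive proxy in a Cloudflare Worker.
// Not Helicone's actual proxy code; the upstream host is a placeholder.
export default {
  async fetch(request: Request): Promise<Response> {
    try {
      // Rebuild the incoming request against the upstream host and forward it.
      const upstreamUrl = new URL(request.url);
      upstreamUrl.hostname = "api.example.com"; // placeholder upstream
      return await fetch(new Request(upstreamUrl.toString(), request));
    } catch (err) {
      // Fail closed: log the error and return a controlled response
      // instead of letting an unhandled exception break the critical path.
      console.error("proxy error", err);
      return new Response(
        JSON.stringify({ error: "Upstream request failed" }),
        { status: 502, headers: { "Content-Type": "application/json" } }
      );
    }
  },
};
```

The point is that the critical path fails predictably: every request either reaches the upstream or returns a controlled error, which is what makes an aggressive uptime target realistic.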
+ + diff --git a/bifrost/app/blog/blogs/crewai-vs-autogen/src.mdx b/bifrost/app/blog/blogs/crewai-vs-autogen/src.mdx index a9c3ea7dc2..3c94f4560e 100644 --- a/bifrost/app/blog/blogs/crewai-vs-autogen/src.mdx +++ b/bifrost/app/blog/blogs/crewai-vs-autogen/src.mdx @@ -1,24 +1,19 @@ -Selecting the right framework to power your agents is crucial for efficiency and scalability. Two notable **open-source frameworks** in the AI agent landscape are **CrewAI** and **AutoGen**. CrewAI offers collaborative and team-oriented workflows, while AutoGen offers finely tuned control for more intricate, iterative problem-solving. +Selecting the right framework to power your agents is crucial for efficiency and scalability. Two notable **open-source frameworks** in the AI agent landscape are **CrewAI** and **AutoGen**. CrewAI offers collaborative and team-oriented workflows, while AutoGen offers finely tuned control for more intricate, iterative problem-solving. ![CrewAI vs. AutoGen for building AI Agents](/static/blog/crewai-vs-autogen.webp) Both platforms are powerful and cater to different aspects of AI application development. Depending on your project’s specific needs, you might find one edges out the other. We will cover the key differences, example implementations and share our recommendations if you are starting out in agent-building. - ## Key Differences -| Feature | CrewAI | AutoGen | -| --- | --- | --- | -| Ease of use | More accessible and easier to set up, built on LangChain. | May require more effort to set up initially, but offers more flexibility for specialized tasks. | -| Functionality | Provides more control over the process, suited for automating known workflows. | More capable for open-ended problem-solving and exploring unknown solutions. | -| Code Execution | Leverages LangChain's ecosystem for language understanding. | Has better default code execution capabilities, using Docker for isolation. | -| LLM Support | Has dependencies on OpenAI, limiting for other LLMs. | More reliant on OpenAI's GPT models, which can be limiting. | - ---- +| Feature | CrewAI | AutoGen | +| -------------- | ------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------- | +| Ease of use | More accessible and easier to set up, built on LangChain. | May require more effort to set up initially, but offers more flexibility for specialized tasks. | +| Functionality | Provides more control over the process, suited for automating known workflows. | More capable for open-ended problem-solving and exploring unknown solutions. | +| Code Execution | Leverages LangChain's ecosystem for language understanding. | Has better default code execution capabilities, using Docker for isolation. | +| LLM Support | Has dependencies on OpenAI, limiting for other LLMs. | More reliant on OpenAI's GPT models, which can be limiting. 
| -## Overview - -### [CrewAI](https://www.crewai.com/) — Structured collaboration +## CrewAI — Structured collaboration `free` `open-source` @@ -29,7 +24,6 @@ CrewAI is a Python-based framework that implements a hierarchical, role-based ar - Configurable interaction patterns - Built-in workflow orchestration - ### CrewAI implementation example ```python @@ -68,23 +62,52 @@ crew = Crew( ) ``` -*Note: Always check [CrewAI's official documentation](https://docs.crewai.com/concepts/agents) for the most up-to-date information and best practices.* +_Note: Always check CrewAI's official documentation for the most up-to-date information and best practices._ + +### Use Cases of CrewAI + +- Better for prototyping and quickly testing complex agent interactions. +- Better for automating regular workflows with a defined structure. + +#### Example 1: Multi-stage data processing +```python +data_collector = Agent(role='Collector') +validator = Agent(role='Validator') +transformer = Agent(role='Transformer') +crew = Crew(agents=[data_collector, validator, transformer]) + +result = crew.kickoff() +``` + +#### Example 2: Tiered support system + +```python +first_line = Agent(role='InitialResponse') +specialist = Agent(role='TechnicalSupport') +escalation = Agent(role='EscalationManager') +support_crew = Crew(agents=[first_line, specialist, escalation]) + +result = support_crew.kickoff() +``` + +_You might find these helpful: Python Package_ --- -### [AutoGen](https://microsoft.github.io/autogen/0.2/) — Autonomous problem-solving +## AutoGen — Autonomous problem-solving `free` `open-source` -AutoGen is a framework developed by Microsoft, allowing developers to create AI agents that can interact with each other and with humans to solve complex tasks. These agents can be customized to perform specific roles or have particular expertise: +Microsoft's AutoGen is a framework developed by Microsoft, allowing developers to create AI agents +that can interact with each other and with humans to solve complex tasks. These agents +can be customized to perform specific roles or have particular expertise: -1. code execution for tasks involving programming or data analysis. -2. conversational approach to problem-solving, where agents can discuss, plan, and execute tasks iteratively. -3. manage the flow of multi-agent interactions by determining when a task is complete. - -In AutoGen, you can assign specific roles to agents so they can engage in conversations or interact with each other. A conversation consists of a series of messages exchanged between agents, which can then be used to advance a task. +1. code execution for tasks involving programming or data analysis. +2. conversational approach to problem-solving, where agents can discuss, plan, and execute tasks iteratively. +3. manage the flow of multi-agent interactions by determining when a task is complete. +In AutoGen, you can assign specific roles to agents so they can engage in conversations or interact with each other. A conversation consists of a series of messages exchanged between agents, which can then be used to advance a task. ### AutoGen configuration example @@ -110,61 +133,28 @@ joe = ConversableAgent( ) ``` -*Note: Always check [AutoGen's official documentation](https://microsoft.github.io/autogen/0.2/docs/tutorial/introduction/#agents) for the most up-to-date information and best practices.* - ---- - -## Use Cases - -### CrewAI - -- Better for prototyping and quickly testing complex agent interactions. 
-- Better for automating regular workflows with a defined structure. - - -#### Example 1: Multi-stage data processing - -```python -data_collector = Agent(role='Collector') -validator = Agent(role='Validator') -transformer = Agent(role='Transformer') -crew = Crew(agents=[data_collector, validator, transformer]) - -result = crew.kickoff() -``` - -#### Example 2: Tiered support system +_Note: Always check AutoGen's official documentation for the most up-to-date information and best practices._ -```python -first_line = Agent(role='InitialResponse') -specialist = Agent(role='TechnicalSupport') -escalation = Agent(role='EscalationManager') -support_crew = Crew(agents=[first_line, specialist, escalation]) - -result = support_crew.kickoff() -``` - - -### AutoGen +### Use Cases of AutoGen - Preferred for tasks requiring precise control over information processing and API access. - Better for one-time, complex problem-solving where the solution approach is unclear. #### Example 1: Single agent performing data retrieval -```python +```python news_agent = AssistantAgent(name="news_agent") # retrieve top 10 technology news headlines user_proxy = UserProxyAgent(name="user_proxy") # Initialize the user proxy agent to simulate user interactions -user_proxy.initiate_chat(news_agent) # start the conversation +user_proxy.initiate_chat(news_agent) # start the conversation ``` #### Example 2: Multi-agent collaboration for data analysis -```python +```python data_retriever = AssistantAgent(name="data_retriever") # retrieve stock data data_analyst = AssistantAgent(name="data_analyst") # analyze stock data and provide insights user_proxy = UserProxyAgent(name="user_proxy") # Initialize the user proxy agent to simulate user interactions -user_proxy.initiate_chat(data_retriever) # start the conversation +user_proxy.initiate_chat(data_retriever) # start the conversation user_proxy.initiate_chat(data_analyst) ``` @@ -172,38 +162,31 @@ user_proxy.initiate_chat(data_analyst) ## Which framework is better for beginners, CrewAI or AutoGen? -For beginners, CrewAI is generally considered the more accessible and easier-to-use framework compared to AutoGen. Here’s why: +For beginners, **CrewAI is generally considered the more accessible** and easier-to-use framework compared to AutoGen. Here’s why: 1. faster setup process and more straightforward to getting started 2. documentation contains examples, which is particularly beneficial for beginners 3. higher level of abstraction, helps beginners quickly prototype and explore multi-agent interactions without delving too deeply into complex setups. - -## Developer Opinions +### Developer Opinions Many developers find that the choice between CrewAI and AutoGen depends on the specific project requirements: - Some prefer AutoGen for its ability to run multiple models concurrently. - Others appreciate CrewAI's integration with LangChain and its broader ecosystem support. -- Some recommend using CrewAI if you know how to solve a problem and want to automate the process, and AutoGen if you want the agent to come up with a solution for you. +- Some recommend using CrewAI if you know how to solve a problem and want to automate the process, and AutoGen if you want the agent to come up with a solution for you. - Both frameworks are seen as valuable tools, with the choice often coming down to the particular use case and development needs. 
---- - ## Bottom Line -It's worth noting that the field of AI agent frameworks is competitive and rapidly evolving, with alternatives like LlamaIndex and LangChain. We encourage you to explore multiple options to find the best fit for your use case, and stay updated with the latest advancements in the field. +It's worth noting that there are many awesome AI agent frameworks out there and they are rapidly evolving. We encourage you to explore multiple options to find the best fit for your use case, and stay updated with the latest advancements in the field. Always be sure to check the official documentation for the most up-to-date information and best practices. +### Other Comparisons? -### You might find these helpful: - -- CrewAI Documentation, CrewAI Python Package -- AutoGen Documentation -- 6 Awesome Frameworks for Building AI Agents (Open-Source) - - -### Questions or feedback? +- Comparing CrewAI and Dify AI +- Comparing LlamaIndex and LangChain +- Comparing Open-source AI Agent Builders -Are the information out of date? Do you have additional platforms to add? Please raise an issue and we’d love to share your insights! + diff --git a/bifrost/app/blog/blogs/crewai-vs-dify-ai/src.mdx b/bifrost/app/blog/blogs/crewai-vs-dify-ai/src.mdx index dbfa55f46f..cba2477748 100644 --- a/bifrost/app/blog/blogs/crewai-vs-dify-ai/src.mdx +++ b/bifrost/app/blog/blogs/crewai-vs-dify-ai/src.mdx @@ -12,7 +12,7 @@ In this blog, we will explore the key differences between **CrewAI** and **Dify* --- -# Comparing CrewAI and Dify +## Comparing CrewAI and Dify | **Criteria** | **CrewAI** | **Dify** | | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -29,7 +29,7 @@ In this blog, we will explore the key differences between **CrewAI** and **Dify* --- -# CrewAI +## What is CrewAI? CrewAI is a multi-agent automation tool for builing AI agentic workflows. CrewAI's comprehensive tools **simplify building, managing, and deploying AI agents**. @@ -60,11 +60,11 @@ integrated with external tools to improve functionality. _For up-to-date pricing information, we recommend checking CrewAI's official website._ -## How to Build an AI Agent in CrewAI +### How to Build an AI Agent in CrewAI To build a CrewAI agent, you can either use **YAML configuration** (as CrewAI recommends) or **define them directly** in code. -### Example: YAML Configuration +#### Example: YAML Configuration ```python # src/latest_ai_development/config/agents.yaml @@ -93,7 +93,7 @@ _For the method using direct definition, please refer to Dify's official website._ -## How to Build an AI Agent in Dify +### How to Build an AI Agent in Dify There are two methods. You can find a template by going to the `Explore` section, or create your own custom agents in the `Studio` section. 
-### Method 1: Start with a Dify template +#### Method 1: Start with a Dify template ![Start with a Dify template - No-code AI Agent Builder](/static/blog/crewai-vs-dify-ai/dify-ai-templates.webp) -### Method 2: Build your own custom agent +#### Method 2: Build your own custom agent ![Build your own custom Dify agent - No-code AI Agent Builder](/static/blog/crewai-vs-dify-ai/dify-ai-custom.webp) @@ -136,7 +136,7 @@ _For the most up-to-date instructions, please check [Dify's documentation](https --- -## Which is the Best Platform to Build LLM Agents? +## Dify and CrewAI: Which is better to build AI Agents? - **For building a multi-agent system with error-handling,** **CrewAI's** APIs and LangChain integration make it the better choice. - **For rapid prototyping,** **Dify** is the better option for its no-code Studio with pre-built templates that delivers a faster turnaround. @@ -191,9 +191,7 @@ Send a request to your CrewAI or Dify agent and see the logs in Helicone! ![Helicone AI - The Best LLM Agent Monitoring Platform](/static/blog/crewai-vs-dify-ai/dashboard.webp) ---- - -# Bottom Line +## Bottom Line Both CrewAI and Dify offer strengths that tailors to different needs and technical goals. As the technology evolves, we encourage you to explore [multiple options](https://www.helicone.ai/blog/ai-agent-builders) to find the best fit for your use case, and stay updated with the latest advancements in the field. @@ -223,12 +221,6 @@ Here's a reminder to always check the official documentation for the most up-to- 6 Awesome Open-Source AI Agent Frameworks ---- - -## Questions or feedback? - -Are the information out of date? Do you have additional platforms to add? Please raise an issue and we’d love to share your insights! - + + diff --git a/bifrost/app/blog/blogs/custom-properties/src.mdx b/bifrost/app/blog/blogs/custom-properties/src.mdx index f15cc324dd..2b3176921c 100644 --- a/bifrost/app/blog/blogs/custom-properties/src.mdx +++ b/bifrost/app/blog/blogs/custom-properties/src.mdx @@ -1,4 +1,3 @@ - In today's digital landscape, understanding your users' preferences and behaviors is crucial to delivering tailored services. Every interaction, click, and engagement offers valuable insights into what your user needs and desires, **but harnessing this data effectively requires the right tools and approaches.** **This is where Helicone's Custom Properties come into play.** @@ -133,3 +132,5 @@ Greptile segments requests by repository (**`Helicone-Property-Repository`**) to → **Doc:** [Setting up custom properties](https://docs.helicone.ai/features/advanced-usage/custom-properties). → **Step-by-step guide:** [Using Custom Properties to Segment Data](https://docs.helicone.ai/use-cases/segmentation). + + diff --git a/bifrost/app/blog/blogs/debugging-chatbots-and-ai-agents-with-sessions/src.mdx b/bifrost/app/blog/blogs/debugging-chatbots-and-ai-agents-with-sessions/src.mdx index f8fbeb982b..3900388256 100644 --- a/bifrost/app/blog/blogs/debugging-chatbots-and-ai-agents-with-sessions/src.mdx +++ b/bifrost/app/blog/blogs/debugging-chatbots-and-ai-agents-with-sessions/src.mdx @@ -2,21 +2,21 @@ When does your AI agent start hallucinating in the multi-step process? Have you ![Debugging RAG Chatbots and AI Agents with Sessions](/static/blog/sessions-ui.webp) -**These are common questions we faced when building our own AI agents and Retrieval Augmented Generation (RAG) chatbots**. 
Getting reliable responses and minimizing errors like hallucination was incredibly challenging, without visibility into how our users interacted with our large language models. +These are common questions we faced when building our own RAG-powered chatbots and AI agents. Getting reliable responses and minimizing errors like hallucination was incredibly challenging, without visibility into how our users interacted with our large language models. -In this blog, we will delve into examples of how to maintain context, reduce errors, and improve the overall performance of your LLM apps, and share a list of tools to help you create more robust and reliable AI agents. +In this blog, we will delve into examples of how to maintain context, reduce errors, and improve the overall performance of your LLM apps, and share a list of tools to help you create more robust and reliable AI agents. -### What you will learn: +## What you will learn: -- AI agents vs. traditional software -- Components of an AI agent +- AI agents vs. traditional software +- Components of an AI agent - Challenges we faced while debugging AI agents - Effective debugging tools -- How different industries debug AI agents using **Helicone's Sessions** +- How different industries debug AI agents using Sessions --- -## First, what is different about AI agents? +## How are AI agents different from traditional chatbots? Unlike traditional chatbots or software which follow explicit instructions or rules, AI agents can autonomously perform specific tasks with advanced decision-making abilities. They interact with their environment by collecting data, processing it, and deciding on the best actions to achieve a predefined goal. @@ -28,40 +28,36 @@ Copilots help users by **providing suggestions #### Autonomous Agents -Autonomous agents perform tasks **independently without human intervention**. For example, it can handle customer inquiries by identify issues, access account information, perform necessary actions (like processing refunds or updating account details), and respond to the customer. They can also escalate to a human agent if they encounter problems beyond their current capabilities. +Autonomous agents perform tasks **independently without human intervention**. For example, it can handle customer inquiries by identify issues, access account information, perform necessary actions (like processing refunds or updating account details), and respond to the customer. They can also escalate to a human agent if they encounter problems beyond their current capabilities. #### Multi-Agent Systems Multi-agent systems involve **interactions and collaboration between multiple autonomous agents** to achieve a collective goal. These systems have advantages like dynamic reasoning, the ability to distribute tasks, and better memory for retaining information. +## Using Retrieval-Augmented Generation to Improve Functionality -## Using Retrieval-Augmented Generation to Improve Functionality - -**Retrieval-Augmented Generation (RAG)** is an advanced framework that allowed the agent to **incorporate information from external knowledge bases** (e.g., databases, documents, articles) into the response. +**Retrieval-Augmented Generation (RAG)** is an advanced framework that allowed the agent to **incorporate information from external knowledge bases** (e.g., databases, documents, articles) into the response. 
RAG significantly improved the response outcome as the agent now have access to the most recent data based on keywords, semantic similarity, or other advanced search techniques, and used it to generate more accurate, personalized, and context-specific responses. ---- - ## Components of AI Agents Typically, AI agents consists of four core components: -1. **Planning** -2. **Tool / Vector Database Calls** -3. **Perception** +1. **Planning** +2. **Tool / Vector Database Calls** +3. **Perception** 4. **Memory** ![How AI Agents work](/static/blog/how-agents-work.webp) - ### Planning When you define a goal, AI agents have the ability to plan and sequence actions due to their integration with LLMs that allows them to formulate better strategies. ### Tool / Vector Database Calls -Advanced AI Agents can interact with external tools, APIs, and services through function calls in order to handle more complicated operations such as: +Advanced AI Agents can interact with external tools, APIs, and services through function calls in order to handle more complicated operations such as: - Fetching real-time information from APIs (e.g., weather data, stock prices). - Using translation services to convert text between languages. @@ -76,9 +72,6 @@ AI agents can also perceive and process information from their environment, maki AI agents are able to remember past interactions, including tools previously used and its planning decisions. These experiences are stored to help agents self-reflect and inform future actions. - ---- - ## Challenges We Faced While Debugging AI agents **⚠️ Their decision making process is complicated.** @@ -87,13 +80,11 @@ AI agent's adaptive behavior makes their decision paths **an early error can have cascading effects**, so it's difficult to identify their original source without proper session tracking. - ---- +Agents can often make multiple dependent vector database calls within a single session, adding some complexity in tracing the data flow. They can also operate over a longer sessions, where **an early error can have cascading effects**, so it's difficult to identify their original source without proper session tracking. ## Tools for Debugging AI Agents @@ -101,33 +92,31 @@ One way we try to debug agents is by understanding the internal workings of the ### 1. Helicone `open-source` -**Helicone's Sessions** is ideal for teams looking to intuitively visualize agentic workflows. It's catered to both developers building simple and advanced agents that need to group related LLM calls, trace nested agent workflows, quickly identify issues, and track requests, response and metadata to the Vector Database. +Helicone's Sessions is ideal for teams looking to intuitively visualize agentic workflows. It's catered to both developers building simple and advanced agents that need to group related LLM calls, trace nested agent workflows, quickly identify issues, and track requests, response and metadata to the Vector Database. -### 2. AgentOps `open-source` +### 2. AgentOps -**AgentOps** can be a good choice for teams looking for a comprehensive solution to debug AI agents. Despite a less intuitive interface, AgentOps offers comprehensive features for monitoring and managing AI agents. +AgentOps can be a good choice for teams looking for a comprehensive solution to debug AI agents. Despite a less intuitive interface, AgentOps offers comprehensive features for monitoring and managing AI agents. -### 3. Langfuse `open-source` +### 3. 
Langfuse -**Langfuse** is ideal for developers who prefer self-hosting solutions and have simpler infrastructure needs. It offers features similar to Helicone's and is well-suited for projects with modest scalability requirements or those prioritizing local deployment over cloud-based solutions. +Langfuse is ideal for developers who prefer self-hosting solutions and have simpler infrastructure needs. It offers features similar to Helicone's and is well-suited for projects with modest scalability requirements or those prioritizing local deployment over cloud-based solutions. ### 4. LangSmith -**LangSmith** is ideal for developers working extensively with the LangChain framework as its SDKs and documentation are designed to support developers within this ecosystem best. +LangSmith is ideal for developers working extensively with the LangChain framework as its SDKs and documentation are designed to support developers within this ecosystem best. ### 5. Braintrust -**Braintrust** is a good choice for those focusing on evaluating AI models. It’s an effective solutions for projects where model evaluation is a primary concern and agent tracing is a secondary need. +Braintrust is a good choice for those focusing on evaluating AI models. It’s an effective solutions for projects where model evaluation is a primary concern and agent tracing is a secondary need. ### 6. Portkey -**Portkey** is designed for developers looking for the latest tools to track and debug AI agents. It introduces new features quickly, great for teams needing the newest suite of features and willing to face the occasional reliability and stability issues. +Portkey is designed for developers looking for the latest tools to track and debug AI agents. It introduces new features quickly, great for teams needing the newest suite of features and willing to face the occasional reliability and stability issues. --- - - -## How Different Industries Debug AI Agents Using Sessions +## Debugging AI Agents Using Sessions Across Industries ### Travel: Finding Errors in a Multi-Step Workflow @@ -141,7 +130,6 @@ A travel chatbot assists users through flights, hotels bookings and car rentals. Sessions gives you a complete trace of the booking interaction, where you can pinpoint exactly where users encountered problems. For example, if your users report missing flight confirmations frequently, looking at each session traces can reveal whether the issue came from input parsing errors or glitches with airline APIs. - ### Health & Fitness: Personalize Responses to Match User Intent **Challenge** @@ -154,8 +142,7 @@ A health and fitness chatbot needs to accurately interpret your user's asks in o Traces labelled `LLM` in a Session can show you your user's preferences, so you can adjust the chatbot responses by altering the prompts. If your users ask about strength training over cardio often, you can tweak the prompt to focus on strength training programs. - -### Education: Ensuring Quality and Consistency with Generated Content +### Education: Ensuring Quality and Consistency with Generated Content **Challenge** @@ -167,24 +154,17 @@ An AI agent that creates customized learning materials needs to generate both ac A Session outlines the structure of the generated course. Each trace in a Session shows you how the agent interpreted your requests and the corresponding content. 
Skimming through, wherever the agent misunderstood topics or failed to cover key concepts, you can then fine-tune that specific prompt to generate a more thorough content while making sure it is appropriate for the student’s learning level. ---- - -## Next, Become Production-Ready +## Building Production-Ready AI Agents -We're already seeing AI agents in action across various fields like customer service, travel, health and fitness, as well as education. However, for AI agents to be truly production-ready and widely adopted, we need to continue to improve their reliability and accuracy. +We're already seeing AI agents in action across various fields like customer service, travel, health and fitness, as well as education. However, for AI agents to be truly production-ready and widely adopted, we need to continue to improve their reliability and accuracy. This requires us to actively monitor their decision-making processes and get a deep understanding of how inputs influence outputs. The most effective way is by using monitoring tools that provide you the insights to make sure your AI agents consistently deliver the results you want. +### If you want to give Helicone a try, here are some resources we recommend: -**If you want to give Helicone a try, here are some resources we recommend:** - -- Doc: Setting up Helicone's Sessions -- Resource: 6 Open-Source Frameworks for Building AI Agents -- Doc: How to log Vector DB interactions using Helicone's Javascript SDK -- Guide: How to Optimize AI Agents by Replaying LLM Sessions - ---- - -### Questions or feedback? +- Doc: Setting up Helicone's Sessions +- Resource: 6 Open-Source Frameworks for Building AI Agents +- Doc: How to log Vector DB interactions using Helicone's Javascript SDK +- Guide: How to Optimize AI Agents by Replaying LLM Sessions -Are the information out of date? Do you have additional platforms to add? Please raise an issue and we’d love to share your insights! \ No newline at end of file + diff --git a/bifrost/app/blog/blogs/essential-helicone-features/src.mdx b/bifrost/app/blog/blogs/essential-helicone-features/src.mdx index 06f6aae555..9585aadcfb 100644 --- a/bifrost/app/blog/blogs/essential-helicone-features/src.mdx +++ b/bifrost/app/blog/blogs/essential-helicone-features/src.mdx @@ -2,17 +2,16 @@ Helicone empowers AI engineers and LLM developers to optimize their applications' performance. This guide provides step-by-step instructions for integrating and making the most of Helicone’s features — **available on all Helicone plans**. -![4 essential features in Helicone to optimize your AI app](/static/blog/4-essential-features/cover.webp) +![4 essential features in Helicone to optimize your AI app](/static/blog/4-essential-features/cover.webp) - -**This blog post is for you if:** +**This blog post is for you if:** - You're building or maintaining an AI application - You need to improve response times, reduce costs, or enhance reliability - You want data-driven insights to guide your optimization efforts - You're looking for practical, implementable solutions -**We will focus on the 4 essential Helicone features: custom properties, sessions, prompts, and caching,** how each feature works, why it matters, and how to implement it in your development workflow. +**We will focus on the 4 essential Helicone features: custom properties, sessions, prompts, and caching,** how each feature works, why it matters, and how to implement it in your development workflow. If you're ready to follow the practical steps with zero fluff, read on. 
@@ -20,12 +19,12 @@ If you're ready to follow the practical steps with zero fluff, read on. ## Getting started: integrating with 1-line of code -Whether you're prototyping or maintaining a production app, Helicone's one-line integration lets you focus on building, not configuring. +Whether you're prototyping or maintaining a production app, Helicone's one-line integration lets you focus on building, not configuring. ### Integrating with any provider - Change a single line of code to integrate Helicone with your AI app. -- Compatible with various AI models and APIs. Here's the entire list of integrations. +- Compatible with various AI models and APIs. Here's the entire list of integrations. - Update the base URL to easily switch between models (e.g., GPT-4 to LLaMA). ```python @@ -40,7 +39,7 @@ Whether you're prototyping or maintaining a production app, Helicone's one-line ## #1: Custom properties: segmenting your requests -Custom Properties helps you tailor the LLM analytics to your needs. Custom Properties lets you **segment requests**, allowing you to make more data-driven improvements and targeted optimizations. +Custom Properties helps you tailor the LLM analytics to your needs. Custom Properties lets you **segment requests**, allowing you to make more data-driven improvements and targeted optimizations. ### In a nutshell @@ -49,42 +48,40 @@ Custom Properties helps you tailor the LLM analytics to your needs. Custom Prope ### How it works -1. Add custom headers to your requests using this format: `Helicone-Property-[Name]: [value]` where `Name`is the name of your custom property. For example: - - ```python - headers = { - "Helicone-Property-Session": "121", - "Helicone-Property-App": "mobile", - "Helicone-Property-Conversation": "support_issue_2" - } - ``` - More info about Custom Properties in the docs. - -2. Now you can segment incoming requests on the **Dashboard**, **Requests**, or **Properties** page. For example: +1. Add custom headers to your requests using this format: `Helicone-Property-[Name]: [value]` where `Name`is the name of your custom property. For example: + + ```python + headers = { + "Helicone-Property-Session": "121", + "Helicone-Property-App": "mobile", + "Helicone-Property-Conversation": "support_issue_2" + } + ``` + + More info about Custom Properties in the docs. - - **Use case 1:** Filter for specific prompt chains on Requests page to analyze costs and latency. - - ![Filter by custom properties on Helicone's Request page](/static/blog/4-essential-features/request-page.webp) - - - **Use case 2:** Analyze "unit economics" (e.g., average cost per conversation) on Properties page. +2. Now you can segment incoming requests on the **Dashboard**, **Requests**, or **Properties** page. For example: - ![Analyze unit economics using custom properties in Helicone](/static/blog/4-essential-features/properties-page.webp) +- **Use case 1:** Filter for specific prompt chains on Requests page to analyze costs and latency. + ![Filter by custom properties on Helicone's Request page](/static/blog/4-essential-features/request-page.webp) +- **Use case 2:** Analyze "unit economics" (e.g., average cost per conversation) on Properties page. - - **Use case 3:** Filter for all requests that meet a criteria on Dashboard page. 
+ ![Analyze unit economics using custom properties in Helicone](/static/blog/4-essential-features/properties-page.webp) - ![Segment requests and metrics by custom properties on Helicone's dashboard](/static/blog/4-essential-features/dashboard-page.webp) +- **Use case 3:** Filter for all requests that meet a criteria on Dashboard page. + ![Segment requests and metrics by custom properties on Helicone's dashboard](/static/blog/4-essential-features/dashboard-page.webp) ### Real-life examples -- Segment your requests by app versions to monitor and compare performance. -- Segment your requests by free/paid users to better understand their behaviours and patterns. -- Segment your requests by feature to understand usage pattern and optimize resource allocation. +- Segment your requests by app versions to monitor and compare performance. +- Segment your requests by free/paid users to better understand their behaviours and patterns. +- Segment your requests by feature to understand usage pattern and optimize resource allocation. ### Additional reading -- **Docs:** Custom Properties -- **Advanced:** How to understand your users better and deliver a top-tier experience with Custom Properties +- **Docs:** Custom Properties +- **Advanced:** How to understand your users better and deliver a top-tier experience with Custom Properties --- @@ -94,43 +91,35 @@ Helicone's Sessions feature allows developers to group and visualize multi-step ### In a nutshell -- You can group related requests for a more holistic analysis. -- You can track request flows across multiple traces. -- You can implement tracing with just three headers. - +- You can group related requests for a more holistic analysis. +- You can track request flows across multiple traces. +- You can implement tracing with just three headers. ### How it works -1. Add the following three headers to your requests. Here's the doc on how to enable Sessions. - - ```python - headers = { - "Helicone-Session-Id": session_uuid, # The session id you want to track - "Helicone-Session-Path": "/abstract", # The path of the session - "Helicone-Session-Name": "Course Plan" # The name of the session - } - ``` +1. Add the following three headers to your requests. Here's the doc on how to enable Sessions. -2. Use the Helicone dashboard to visualize and analyze your sessions. For example: + ```python + headers = { + "Helicone-Session-Id": session_uuid, # The session id you want to track + "Helicone-Session-Path": "/abstract", # The path of the session + "Helicone-Session-Name": "Course Plan" # The name of the session + } + ``` - - **Use case 1:** Reconstruct conversation flows or multi-stage tasks in the `Chat` view - - ![Reconstruct conversation flows or multi-stage in chat view](/static/blog/4-essential-features/convo-view.webp) - - - **Use case 2:** Analyze performance across the entire interaction sequence in the `Tree` view - - ![Analyze performance across entire interaction sequences in tree view](/static/blog/4-essential-features/tree-view.webp) - - - **Use case 3:** Identify bottlenecks in your AI workflows in the `Span` view - - ![Identify bottlenecks in your AI workflows in span view](/static/blog/4-essential-features/span-view.webp) - - - **Use case 4:** Gain deeper insights into user behavior with conversation context. +2. Use the Helicone dashboard to visualize and analyze your sessions. 
For example: +- **Use case 1:** Reconstruct conversation flows or multi-stage tasks in the `Chat` view + ![Reconstruct conversation flows or multi-stage in chat view](/static/blog/4-essential-features/convo-view.webp) +- **Use case 2:** Analyze performance across the entire interaction sequence in the `Tree` view + ![Analyze performance across entire interaction sequences in tree view](/static/blog/4-essential-features/tree-view.webp) +- **Use case 3:** Identify bottlenecks in your AI workflows in the `Span` view + ![Identify bottlenecks in your AI workflows in span view](/static/blog/4-essential-features/span-view.webp) +- **Use case 4:** Gain deeper insights into user behavior with conversation context. ### Real-life example -Imagine creating an AI app that creates a course outline. +Imagine creating an AI app that creates a course outline. ```python const session = randomUUID(); @@ -157,11 +146,11 @@ openai.chat.completions.create( This setup allows you to track the entire course creation process, from abstract to detailed lessons, as a single session. -For developers working on applications with complex, multi-step AI interactions, Sessions provides a powerful tool for understanding and optimizing your AI workflows. +For developers working on applications with complex, multi-step AI interactions, Sessions provides a powerful tool for understanding and optimizing your AI workflows. ### Additional reading -- **Docs:** Sessions +- **Docs:** Sessions --- @@ -171,31 +160,30 @@ Helicone's Prompt Management feature offers developers a powerful tool to versio ### In a nutshell -- Helicone will automatically version your prompt whenever it's modified in the codebase. -- You can run experiments using past requests (grouped into a dataset). -- You can test your prompts with Experiments to prevent prompt regressions. +- Helicone will automatically version your prompt whenever it's modified in the codebase. +- You can run experiments using past requests (grouped into a dataset). +- You can test your prompts with Experiments to prevent prompt regressions. ### How it works -1. Set up Helicone in proxy mode. Use one of the methods in the Starter Guide. +1. Set up Helicone in proxy mode. Use one of the methods in the Starter Guide. 2. Use the `hpf` (Helicone Prompt Format) function to identify input variables - - ```python - import { hpf } from "@helicone/prompts"; - - ... - - content: hpf`Write a story about ${{ character }}`, - ``` - -3. Assign a unique ID to your prompt using a header. Here's the doc on Prompt Management & Experiments. For example: - - ```python - headers: { - "Helicone-Prompt-Id": "prompt_story", - }, - ``` - + + ```python + import { hpf } from "@helicone/prompts"; + + ... + + content: hpf`Write a story about ${{ character }}`, + ``` + +3. Assign a unique ID to your prompt using a header. Here's the doc on Prompt Management & Experiments. For example: + + ```python + headers: { + "Helicone-Prompt-Id": "prompt_story", + }, + ``` ### Example implementation @@ -236,13 +224,13 @@ Imagine you are developing a chatbot and want to improve its responses. With pro 3. Analyze performance metrics for each version 4. Deploy the best-performing prompt to production -The Prompt & Experiment feature is not only for developers to iterate and experiment with prompts, but also enables non-technical team members to participate in prompt design without touching the codebase. 
+The Prompt & Experiment feature is not only for developers to iterate and experiment with prompts, but also enables non-technical team members to participate in prompt design without touching the codebase. ### Additional reading -- **Docs:** Prompts & Experiments -- Choosing a prompt management tool | Helicone -- How to run LLM Prompt Experiment | Helicone +- **Docs:** Prompts & Experiments +- **Blog:** Choosing a prompt management tool | Helicone +- **Blog:** How to run LLM Prompt Experiment | Helicone --- @@ -259,31 +247,30 @@ Helicone's LLM Caching feature offers developers a powerful way to reduce latenc ### How it works 1. Enable caching with a simple header: - - ```python - headers = { - "Helicone-Cache-Enabled": "true" - } - ``` - -2. Customize caching behavior. For detailed description on how to configure the headers, visit the doc. - - ```python - headers = { - "Helicone-Cache-Enabled": "true", - "Cache-Control": "max-age=3600", # 1 hour cache - "Helicone-Cache-Bucket-Max-Size": "3", - "Helicone-Cache-Seed": "user-123" - } - ``` - + + ```python + headers = { + "Helicone-Cache-Enabled": "true" + } + ``` + +2. Customize caching behavior. For detailed description on how to configure the headers, visit the doc. + + ```python + headers = { + "Helicone-Cache-Enabled": "true", + "Cache-Control": "max-age=3600", # 1 hour cache + "Helicone-Cache-Bucket-Max-Size": "3", + "Helicone-Cache-Seed": "user-123" + } + ``` + 3. (optional) Extract cache status from response headers: - - ```python - cache_hit = response.headers.get('Helicone-Cache') - cache_bucket_idx = response.headers.get('Helicone-Cache-Bucket-Idx') - ``` - + + ```python + cache_hit = response.headers.get('Helicone-Cache') + cache_bucket_idx = response.headers.get('Helicone-Cache-Bucket-Idx') + ``` ### Benefits @@ -296,10 +283,10 @@ Helicone's LLM Caching feature offers developers a powerful way to reduce latenc Imagine you're building a customer support chatbot. With LLM Caching: - 1. Common questions get instant responses from cache - 2. You save on API costs for repetitive queries - 3. Your app maintains consistent responses for similar inputs - 4. You can analyze cache hits to identify popular topics +1. Common questions get instant responses from cache +2. You save on API costs for repetitive queries +3. Your app maintains consistent responses for similar inputs +4. You can analyze cache hits to identify popular topics ### Example implementation @@ -337,11 +324,6 @@ If you are an building with AI, LLM observability tools can help you: - ✅ Gain deeper insights into your user interactions - ✅ Debug AI development workflows -Helicone aims to provide all the essential tools to help you make the right improvements and deliver better AI experiences. Interested in checking out other features? Here's a list of headers to get you started. Happy optimizing! - - ---- - -## Questions? +Helicone aims to provide all the essential tools to help you make the right improvements and deliver better AI experiences. Interested in checking out other features? Here's a list of headers to get you started. Happy optimizing! -Join our Discord or email us! 
\ No newline at end of file + diff --git a/bifrost/app/blog/blogs/first-ai-app-with-helicone/src.mdx b/bifrost/app/blog/blogs/first-ai-app-with-helicone/src.mdx index be85fe0223..cb28b6a747 100644 --- a/bifrost/app/blog/blogs/first-ai-app-with-helicone/src.mdx +++ b/bifrost/app/blog/blogs/first-ai-app-with-helicone/src.mdx @@ -1,26 +1,22 @@ - ![Our Designer made an AI app with Helicone](/static/blog/first-ai-app/lina-first-ai-app.webp) -Heya! My name is Lina, I'm a product designer at Helicone and have recently joined the team. +Heya! My name is Lina, I'm a product designer at Helicone and have recently joined the team. -I've always been interested in designing for people and creating delight in digital experiences, but I was rarely on the technical implementation side. As the first non-technical member at Helicone, my main goal was to understand the product deeply. +I've always been interested in designing for people and creating delight in digital experiences, but I was rarely on the technical implementation side. As the first non-technical member at Helicone, my main goal was to understand the product deeply. -**So I decided to make my first AI app** - in the spirit of getting a first-hand exposure to our user's pain points. +**So I decided to make my first AI app** - in the spirit of getting a first-hand exposure to our user's pain points. ## So, what's the app? Over the course of a week, I bothered Justin (CEO of Helicone) and begged him to teach me how to create an **Emoji Translator** - an app that uses AI to interpret and translate text messages into emoji expressions. The app will suggest relevant emojis based on the sentiment you want to convey. Eventually, you can send emojis back and forth with your friends - this is just for fun by the way, and probably doesn't solve any actual problems. - ![Emoji translater](/static/blog/first-ai-app/emoji-converter.webp) - - ## The First Impression -I integrated it with Helicone to see what types of texts my friends are sending to the Emoji Translator, just for fun. +I integrated it with Helicone to see what types of texts my friends are sending to the Emoji Translator, just for fun. -As a non-technical user, whenever I encounter a product that requires me to use the command line or touch the codebase, I like it a lot less. *I worried about having to install SDKs and go through lengthy onboarding to get started.* +As a non-technical user, whenever I encounter a product that requires me to use the command line or touch the codebase, I like it a lot less. _I worried about having to install SDKs and go through lengthy onboarding to get started._ **Integration** @@ -30,49 +26,42 @@ As a non-technical user, whenever I encounter a product that requires me to use **Onboarding** -During onboarding, I sent my first request and Helicone made sure the request was received before moving on. I like having the confirmation of a successful integration. - +During onboarding, I sent my first request and Helicone made sure the request was received before moving on. I like having the confirmation of a successful integration. ![Helicone listening to events during onboarding to make sure everything is integrated correctly.](/static/blog/first-ai-app/listening-for-events.gif) +## Feature Integration +I decided to integrate with every Helicone feature, just for fun. I'll walk through my top three, starting with _Prompts_. -## Feature Integration - -I decided to integrate with every Helicone feature, just for fun. 
I'll walk through my top three, starting with *Prompts*. - -### 1. Prompts +### 1. Prompts **Problem: I want to tweak my prompt and test it with some sample inputs without changing my code in production. ** -**Solution:** Helicone automatically tracked my current prompt in production. Under the "Prompts" tab, I was able to experiment with a different version of my prompt, test it with a different dataset and with a different model (doc). - +**Solution:** Helicone automatically tracked my current prompt in production. Under the "Prompts" tab, I was able to experiment with a different version of my prompt, test it with a different dataset and with a different model (doc). -**Impact:** What was cool was being able to see how the new prompt compares to the production prompt, and see the output in almost split seconds without having to touch my code. Once I was happy with the updated prompt, that's when I updated the prompt in my codebase. +**Impact:** What was cool was being able to see how the new prompt compares to the production prompt, and see the output in almost split seconds without having to touch my code. Once I was happy with the updated prompt, that's when I updated the prompt in my codebase. ![Helicone's Experiment feature tracks prompt version automatically and allows you to test prompts without touching production code.](/static/blog/first-ai-app/compare-experiments.webp) -Now, whenever I change my prompt in production, Helicone detects it automatically and keeps track of the version history for me. *The prompts and experiments feature could be really useful to do prompt regression test more easily.* - - +Now, whenever I change my prompt in production, Helicone detects it automatically and keeps track of the version history for me. _The prompts and experiments feature could be really useful to do prompt regression test more easily._ ### 2. Custom Properties **Problem: Every time I change my prompt, I send a ton of requests to test the output. For debugging purpose, I need a way to filter only my requests. ** -**Solution:** I added a `User` custom property and assigned my name to all the requests sent by me. As soon as the requests were received, I'm able to filter all requests by the user property on the "Requests" page. +**Solution:** I added a `User` custom property and assigned my name to all the requests sent by me. As soon as the requests were received, I'm able to filter all requests by the user property on the "Requests" page. ![Helicone's Custom Properties allows you to filter and segment your data easily.](/static/blog/first-ai-app/custom-properties.webp) -As a bonus, I'm able to view my detailed usage when I head to the “Users” tab which includes the `total cost`, `number of requests made` and a log of requests made by me, in this case (doc). +As a bonus, I'm able to view my detailed usage when I head to the “Users” tab which includes the `total cost`, `number of requests made` and a log of requests made by me, in this case (doc). ![Helicone's User Metrics show the detailed metrics by users.](/static/blog/first-ai-app/user-metrics.webp) -**Impact:** This feature lends itself to let you do a ton of fun things like segmenting requests by user types, especially if you have user tiers like *free* v.s. 
*premium*, or if your app has multiple features that use the LLM, then you can use custom properties to specify which feature the requests were coming from, or even where in the world the requests are coming from - if you specific the geographical location as a custom property. +**Impact:** This feature lends itself to let you do a ton of fun things like segmenting requests by user types, especially if you have user tiers like _free_ v.s. _premium_, or if your app has multiple features that use the LLM, then you can use custom properties to specify which feature the requests were coming from, or even where in the world the requests are coming from - if you specific the geographical location as a custom property. **→ This is a useful article on [how to use custom properties](https://www.helicone.ai/blog/custom-properties).** - ### 3. Caching **Problem: This is just for fun, but imagine McDonald's customers have to place their orders using emojis only, then having cached responses would be super helpful for these reasons:** @@ -81,64 +70,50 @@ As a bonus, I'm able to view my detailed usage when I head to the “Users” ta - If two people were to order Junior Chicken, I want the output to be the same. - I want the response to be fast, if not INSTANT ⚡. -**Solution:** I can enable cache in Helicone by simply setting it to `true` in the header (doc). There are other parameters I played with, such as: -- `cache-control` where I can limit my cache to just 2 hours. -- `cache-bucket-max-size` that allowed me to limit my bucket size just 10 items (20 items is the max on Helicone's free plan). -- `cache-seed`, which allowed me to restart my cache whenever I have an erroneous output (i.e. when the input "Chicken Nuggets" produces `🍔`). +**Solution:** I can enable cache in Helicone by simply setting it to `true` in the header (doc). There are other parameters I played with, such as: +- `cache-control` where I can limit my cache to just 2 hours. +- `cache-bucket-max-size` that allowed me to limit my bucket size just 10 items (20 items is the max on Helicone's free plan). +- `cache-seed`, which allowed me to restart my cache whenever I have an erroneous output (i.e. when the input "Chicken Nuggets" produces `🍔`). **Impact:** By caching, developers can save a ton of money in the long term. A good use case is a conversational AI that answers frequently asked questions. In addition, caching also makes the user experience sooo much better without having to make the user wait or produce an inaccurate output. - - ### Rating Helicone ![Rating Helicone's Dashboard](/static/blog/first-ai-app/helicone-dashboard.webp) - **Integration** -Helicone was very easy to use, and the fact that you can access all features using Helicone header makes it appealing to commit as a long-term user. In the future, you can likely access new features in this manner. - +Helicone was very easy to use, and the fact that you can access all features using Helicone header makes it appealing to commit as a long-term user. In the future, you can likely access new features in this manner. **User Interface** -The UI is very clean, which I love. It's also surprisingly simple and intuitive to use for a dev tool, making it accessible to non-technical users. - +The UI is very clean, which I love. It's also surprisingly simple and intuitive to use for a dev tool, making it accessible to non-technical users. **Docs** -As a visual learner, I appreciate the step-by-step breakdown and images in the docs, which makes it easy to follow. 
- - -**Customer Service** +As a visual learner, I appreciate the step-by-step breakdown and images in the docs, which makes it easy to follow. -The Helicone team is fast to respond to inquiries, and work hard to get to everyone's questions. The best way to reach out is by joining the Discord. +**Customer Service** +The Helicone team is fast to respond to inquiries, and work hard to get to everyone's questions. The best way to reach out is by joining the Discord. **Product Experience** Helicone is lightweight and capable of handling billions of requests. Using any feature feels like plug-and-play. I can access almost any feature just by adding a header. I love not having to read lengthy docs to figure out how things work. - ## So, who's Helicone for? -As someone who's not technical, I don't like having to spend a lot of time figuring out technical problems. Having the ability to plug and play was super useful. Helicone is built to be accessible for people with all sorts of coding experience, from beginners / non-techncial users, to indie hackers and enterprises. - +As someone who's not technical, I don't like having to spend a lot of time figuring out technical problems. Having the ability to plug and play was super useful. Helicone is built to be accessible for people with all sorts of coding experience, from beginners / non-techncial users, to indie hackers and enterprises. ## Recommendations -As a member of Helicone's team, I would recommend new Helicone users to: +As a member of Helicone's team, I would recommend new Helicone users to: - **Ask the Helicone team questions!** We're all very friendly. - **Use the docs, and join Discord!** But if the docs are unclear, please let us know. -- **Continue to share feedback and your use cases!** +- **Continue to share feedback and your use cases!** In order to become developer's favourite tools, we need your help. We will continue to listen, and dream up of a way to make monitoring your LLM a breeze. 🍃 - -### Resources - -Contact us about your use case: https://www.helicone.ai/contact - -Find out more about Helicone's use cases: https://docs.helicone.ai/use-cases/experiments + diff --git a/bifrost/app/blog/blogs/google-gemini-exp-1206/src.mdx b/bifrost/app/blog/blogs/google-gemini-exp-1206/src.mdx index df8ba4f25b..9f85cf243b 100644 --- a/bifrost/app/blog/blogs/google-gemini-exp-1206/src.mdx +++ b/bifrost/app/blog/blogs/google-gemini-exp-1206/src.mdx @@ -4,9 +4,7 @@ Google's Gemini-Exp-1206 is quickly making waves in the world of generative AI. In this blog, we will cover key features, performance benchmarks, real world applications of Google Gemini-Exp-1206, and what the hype is all about. ---- - -# Understanding Gemini-Exp-1206 +## Understanding Gemini-Exp-1206 **Gemini-Exp-1206** is the newest large language model (LLM) in Google’s experimental Gemini series, designed to be **multilingual**, **handle multi-modal inputs like text, voice and images, and achieve top-tier performance** across diverse AI tasks. As part of Google's larger strategy to integrate advanced machine learning models into real-world applications, Gemini-Exp-1206 has quickly captured attention for its capabilities across creative, technical, and conversational domains. @@ -16,9 +14,7 @@ Despite being a prototype, Gemini-Exp-1206 has distinguished itself by excelling Gemini-exp-1206 can be accessed in Google AI Studio, and the Gemini API. Developers use Helicone to monitor, debug and improve their LLM apps. 
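To make the access path concrete, here is a minimal sketch of calling the experimental model through the Gemini API with the `google-generativeai` Python SDK. The SDK choice and the model identifier string are assumptions for illustration, so confirm the exact name exposed to your account in Google AI Studio.

```python
# Minimal sketch: calling the experimental Gemini model via the Gemini API.
# The model id "gemini-exp-1206" is an assumption; confirm it in Google AI Studio.
import google.generativeai as genai

genai.configure(api_key="YOUR_GEMINI_API_KEY")

model = genai.GenerativeModel("gemini-exp-1206")
response = model.generate_content("Summarize the key trade-offs of fine-tuning an LLM.")
print(response.text)
```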
---- - -# Key Performance Metrics +## Key Performance Metrics Gemini-exp-1206 has achieved top rankings on several AI leaderboards, including #1 overall on the Chatbot Arena leaderboard, #2 on coding average and #1 on Mathematical average on the Live Bench leaderboard. @@ -70,9 +66,7 @@ Google has made Gemini-exp-1206 freely available through Google AI Studio and th secondaryButtonLink="https://www.helicone.ai/status/provider/Google" /> ---- - -# How Gemini-Exp-1206 Compares With Other Models +## How Gemini-Exp-1206 Compares With Other Models ### 1. Gemini vs OpenAI’s GPT-4 and o1 @@ -86,9 +80,7 @@ Gemini-Exp-1206 outshines it in advanced reasoning and task generalization. Whil -> **Want to know how Gemini-Exp-1206 compares with other models?** Check out our free model comparison tool. ---- - -# Applications Across Industries +## Applications Across Industries In **software development**, Gemini-Exp-1206’s advanced coding capabilities empower developers to streamline workflows, from automating repetitive tasks like generating boilerplate code to solving intricate debugging challenges. @@ -137,8 +129,4 @@ It's important to note that all AI models comes with strengths and limitations. Llama 3.3 just dropped — is it better than GPT-4 or Claude-Sonnet-3.5? ---- - -## Questions or feedback? - -Are the information out of date? Please raise an issue and we'd love to hear your insights! + diff --git a/bifrost/app/blog/blogs/helicone-recap/src.mdx b/bifrost/app/blog/blogs/helicone-recap/src.mdx index e39db57b25..b627bdbb4b 100644 --- a/bifrost/app/blog/blogs/helicone-recap/src.mdx +++ b/bifrost/app/blog/blogs/helicone-recap/src.mdx @@ -1,10 +1,10 @@ Over the past 6 months, we've been hard at work making Helicone even better for you. Your support and feedback have been invaluable, and we're excited to share our journey and future plans with you. -## What We've Built in the Past Six Months +## A Quick Feature Recap ### 🚀 Sessions -We've made it easier to group and visualize **multi-step LLM interactions**. By adding just **two simple headers**, you can track request flows across multiple traces and gain valuable insights into complex AI workflows. +We've made it easier to group and visualize multi-step LLM interactions. By adding two simple headers, you can track request flows across multiple traces and gain valuable insights into complex AI workflows. ** and ensures your prompts keep getting better. +Run experiments using historical datasets to test, evaluate, and improve your prompts over time. This helps prevent regressions in your production systems and ensures your prompts keep getting better.
**, helping you keep a close eye on your application's performance without missing a beat. +Stay updated with real-time notifications right in Slack! You can now get alerted on errors or costs, helping you keep a close eye on your application's performance without missing a beat.
** requests in your dataset, and export them when you're ready. Create a **golden dataset** to fine-tune your models and easily export it to your favorite fine-tuning platform like OpenPipe. +You can now build datasets from your requests. Add, duplicate, and edit requests in your dataset, and export them when you're ready. Create a golden dataset to fine-tune your models and easily export it to your favorite fine-tuning platform like OpenPipe.
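As a rough illustration of what such an export can look like, the sketch below writes a tiny golden dataset to chat-style JSONL, the format most fine-tuning platforms accept. The records and file layout are assumptions for illustration, not Helicone's actual export schema.

```python
# Sketch: write a small golden dataset as chat-style JSONL for fine-tuning.
# The example rows and the output format are illustrative assumptions.
import json

golden_dataset = [
    {
        "prompt": "Translate to emojis: I'm running late, order me fries",
        "completion": "🏃💨🍟",
    },
]

with open("golden_dataset.jsonl", "w") as f:
    for row in golden_dataset:
        record = {
            "messages": [
                {"role": "user", "content": row["prompt"]},
                {"role": "assistant", "content": row["completion"]},
            ]
        }
        f.write(json.dumps(record) + "\n")
```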
qualitative and quantitative scores** such as tone alignment and Semantic Similarity. They make it easy to compare different prompt variations and help engineers justify prompt changes with data before pushing to production ---- - -# How to properly test your prompts +## How to properly test your prompts Properly testing your prompts involves setting up a systematic workflow that iteratively improves performance. This process ensures you’re equipped to handle dynamic scenarios, minimize errors, and optimize user experience. ![Prompt Evaluation Lifecycle in Helicone](/static/blog/test-your-llm-prompts/prompt-evaluation-lifecycle.webp) _Steps to test your prompt: log > evaluate > experiment > deploy_ -## Preparation +### Preparation -### **Step 1: Log your LLM requests** +### Step 1: Log your LLM requests -Use an observability tool to log your LLM requests and track key metrics like usage, latency, cost, time-to-first-token (TTFT). These tools provide dashboards to help you monitor irregularity, such as: +Use an observability tool like Helicone to log your LLM requests and track key metrics like usage, latency, cost, time-to-first-token (TTFT). These tools provide dashboards to help you monitor irregularity, such as: - Rising error rates - Sudden spikes in API costs. @@ -90,43 +84,41 @@ Use an observability tool to log your LLM requests and track key metrics like us This data helps you identify when it’s time to improve your prompt. -## Testing Process +### Testing Process ### Step 2: Create prompt variations -Experiment with prompt versions using techniques like chain-of-thought reasoning and multi-shot prompting. Testing environments like _Helicone_ makes it easier to track prompt versions, inputs, and outputs, while also providing rollback capabilities if changes lead to underperformance/regression. +Experiment with prompt variations using prompt engineering techniques like chain-of-thought reasoning and multi-shot prompting. Testing environments like _Helicone_ makes it easier to track prompt versions, inputs, and outputs, while also providing rollback capabilities if changes lead to underperformance/regression. ### Step 3: Use real data to generate outputs Run your prompts on real-world datasets to ensure they can handle variability and edge cases. There are 2 options: -- **Golden datasets** with curated inputs with known expected outputs. -- **Randomly sampled production data** which is more representative of real-world scenarios (Here's why this approach is better). +- **Golden datasets** with curated inputs with known expected outputs. +- **Randomly sampled production data** which is more representative of real-world scenarios (Here's why this approach is better). ### Step 4: Compare outputs Evaluate the performance of your prompts using the best methods that suits your goals and capacity, such as: -- **Real user feedback** for subjective insights. -- **Human evaluators** for nuanced assessments. -- **LLM-as-a-judge** for scalable and efficient comparisons. +- Real user feedback for subjective insights. +- Human evaluators (scores)for nuanced assessments. +- LLM-as-a-judge for scalable and efficient comparisons. ### Step 5: Push the best prompt to production Once you've identified the best-performing prompt, deploy it to production. Remember to continue to monitor your application using observability tools to track metrics and identify opportunities for further refinement. 
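To tie the logging and variation steps together, here is a sketch of routing a request through the Helicone proxy and tagging it with a prompt-version custom property so variants can be filtered and compared later. The property name "Prompt-Version" and the model are illustrative assumptions.

```python
# Sketch: route an OpenAI call through the Helicone proxy and tag the prompt version.
# The "Prompt-Version" property name and the model are assumptions for illustration.
from openai import OpenAI

client = OpenAI(
    base_url="https://oai.helicone.ai/v1",
    default_headers={"Helicone-Auth": "Bearer <HELICONE_API_KEY>"},
)

response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "Summarize this support ticket: ..."}],
    extra_headers={"Helicone-Property-Prompt-Version": "v2"},
)
print(response.choices[0].message.content)
```

Filtering by that property on the Requests page then lets you compare cost, latency, and output quality across prompt versions.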
---- - -# How to evaluate prompts before and after deployment +## How to evaluate prompts before and after deployment Evaluating prompts is about assessing how effectively your inputs—such as prompts and context—generate the desired outputs. Unlike generic model benchmarks, evaluations are tailored to your specific use case, providing targeted insights into performance. -## Key evaluation methods +### Key evaluation methods ![Add preset or custom evaluators to score your LLM outputs](/static/blog/test-your-llm-prompts/evaluator.webp) @@ -134,11 +126,11 @@ _Add preset or custom evaluators to score your LLM outputs._ ### 1. Real user feedback -Collect feedback directly from users to gauge how well your LLM performs in real-world scenarios. This can be done through feature implementation to **solicit explicit feedback**, such as thumbs up/down rating or scoring outputs, or by **analyzing implicit user behaviors**, like time spent engaging with responses or completion rates. +Collect feedback directly from users to gauge how well your LLM performs in real-world scenarios. This can be done through feature implementation to solicit explicit feedback or by analyzing implicit user behaviors, like time spent engaging with responses or completion rates. **Bottom Line** -Getting user feedback is very useful for understanding practical challenges, but it can be time-consuming and subjective. However, only when your users use your product over time, will you start receiving feedback to improve upon. +Getting user feedback is very useful for understanding practical challenges, but can be time-consuming and subjective. However, only when your users use your product over time, will you start receiving feedback to improve upon. How can we evaluate prompts before deployment? Let’s look at some alternative evaluation methods. @@ -146,7 +138,7 @@ How can we evaluate prompts before deployment? Let’s look at some alternative Use human reviewers to assess output quality based on specific criteria like relevance, tone, or correctness. This method usually begins with building a test dataset that human evaluators will compare the output against. -Based on the output, evaluators will score the response with `yes/no`, `0-10` (**direct scoring**) or is given a set of LLM responses where the evaluator will pick the better response (**A/B testing or pairwise comparisons**). +Based on the output, evaluators will score the response with `yes/no`, `0-10` (direct scoring) or is given a set of LLM responses where the evaluator will pick the better response (A/B testing or pairwise comparisons). **Bottom Line** @@ -161,7 +153,7 @@ An alternative to human evaluation is using an LLM with an ** -#### Deterministic testing +### Deterministic testing LLM-as-a-judge is useful for scenarios where outputs are predictable and well-defined, such as: @@ -169,7 +161,7 @@ LLM-as-a-judge is useful for scenarios where outputs are predictable and well-de - Structured outputs (e.g., JSON validation). - Constraint checks (e.g., ensuring no profanity or meet specific formatting rules). -#### Tips for success +### Tips for success - Test the evaluation prompt itself: make sure the prompt is clear and accurate. - Use few-shot learning: Include good and bad examples to help guide the evaluation. @@ -195,7 +187,7 @@ An LLM handles **creative tasks** (i.e. stories or marketing copy) will need met 5. Whether the response is aligned with brand guidelines or follow the desired tone. 6. 
Whether the response adhere to structural requirements (e.g., word count, formatting). -### Choosing the right metrics +## Choosing the right metrics At the end of the day, the choice of evaluation metrics depends on your application’s goals. For example, **faithfulness** is ideal for Retrieval-Augmented Generation (RAG) applications to measure how well the AI response adheres to the provide context, while metrics like **BLEU** or **ROUGE** are better for translation-specific tasks and text-summarization tasks respectively. @@ -211,11 +203,9 @@ To evaluate LLM outputs effectively, start by identifying the specific aspect of - **Creativity:** Is the response imaginative or engaging? - **Consistency:** Does it match prior outputs or user inputs? -### **a) Measuring accuracy** +### a) Measuring accuracy -**Example:** - -A chatbot answering a factual query “What is the population of France?” +**Example:** A chatbot answering a factual query “What is the population of France?” **Recommendations:** @@ -223,22 +213,18 @@ A chatbot answering a factual query “What is the population of France?” - Human evaluation: Use for nuanced questions or when ground truth is unavailable. - Word-level metrics (e.g., BLEU, ROUGE): Measure token overlap between generated and reference outputs for precise tasks. -### **b) Comparing two prompt versions** - -**Example:** +### b) Comparing two prompt versions -Testing whether an updated summarization prompt performs better than the previous version. +**Example:** Testing whether an updated summarization prompt performs better than the previous version. **Recommendations:** - A/B testing: Put outputs side-by-side for human evaluators or users to pick the better one. - LLM-as-a-judge: Automate A/B testing by using a carefully-designed evaluation prompt. -### **c) Evaluating edge cases or critical outputs** +### c) Evaluating edge cases or critical outputs -**Example:** - -Assessing a medical assistant recommending treatment options. +**Example:** Assessing a medical assistant recommending treatment options. **Recommendations:** @@ -246,22 +232,18 @@ Assessing a medical assistant recommending treatment options. - Reference scoring: Use authoritative sources to build a dataset as benchmarks. - A/B testing: Experiment with a modified prompt or tweak model parameters to improve accuracy. -### **d) Measuring usability** - -**Example:** +### d) Measuring usability -A virtual assistant handling user queries in a help desk scenario. +**Example:** A virtual assistant handling user queries in a help desk scenario. **Recommendations:** - Real user feedback: Analyze explicit ratings (e.g., thumbs up/down) and implicit behaviors (e.g., engagement time or rejection rates). - Human evaluation: Pre-deployment tests for adherence to tone, accuracy, and helpfulness. -### **e) Assessing creative outputs** +### e) Assessing creative outputs -**Example:** - -Generating a poetry or brainstorming a story idea. +**Example:** Generating a poetry or brainstorming a story idea. **Recommendations:** @@ -269,8 +251,6 @@ Generating a poetry or brainstorming a story idea. - Human evaluation: Crucial for subjective tasks where creativity and engagement matter most. - LLM-as-a-judge: Automate creative evaluations using fine-tuned models if cost or scalability is a concern. ---- - ## Bottom Line Ultimately, you want to make sure your LLM outputs align with user expectations and deliver an enjoyable experience. A tight feedback loop with your users is key. 
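For the LLM-as-a-judge approach recommended above, a minimal pairwise comparison can be as small as the sketch below. The judge model, rubric wording, and single-letter output format are assumptions you would tune for your own use case.

```python
# Minimal LLM-as-a-judge sketch for pairwise (A/B) comparison of two outputs.
# The judge model, rubric, and output parsing are illustrative assumptions.
from openai import OpenAI

client = OpenAI()

def judge(question: str, answer_a: str, answer_b: str) -> str:
    """Ask a judge model which answer better satisfies the question; returns 'A' or 'B'."""
    rubric = (
        "You are comparing two answers to the same question. "
        "Consider accuracy, relevance, and clarity. "
        "Reply with exactly one letter: A or B."
    )
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        messages=[
            {"role": "system", "content": rubric},
            {
                "role": "user",
                "content": f"Question: {question}\n\nAnswer A: {answer_a}\n\nAnswer B: {answer_b}",
            },
        ],
    )
    return response.choices[0].message.content.strip()

print(judge("What is the population of France?", "About 68 million.", "Roughly 50 million."))
```

Swapping in your own rubric, few-shot examples of good and bad answers, or a stronger judge model is usually the first thing to tune.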
@@ -284,6 +264,4 @@ There’s no hard limit on how many variations of a prompt you should test as pr primaryButtonLink="/signin" /> -## Questions or feedback? - -Are the information out of date? Please [raise an issue](https://github.com/Helicone/helicone/pulls) and we’d love to hear your insights! + diff --git a/bifrost/app/blog/blogs/vault-introduction/src.mdx b/bifrost/app/blog/blogs/vault-introduction/src.mdx index ae758baef3..8d11f50069 100644 --- a/bifrost/app/blog/blogs/vault-introduction/src.mdx +++ b/bifrost/app/blog/blogs/vault-introduction/src.mdx @@ -1,4 +1,3 @@ - ![Vault Feature](/static/blog/vault_banner.webp) In an evolving business landscape, managing multiple API keys can become a cumbersome process. Vault, Helicone's solution to this challenge, seamlessly integrates provider API keys like OpenAI and provides an efficient way to create and manage Helicone proxy keys. @@ -41,7 +40,7 @@ curl --request POST With Vault, the process is streamlined: -``` +``` curl --request POST \\ --url https://oai.helicone.ai/v1/chat/completions \\ --header 'Authorization: Bearer {HELICONE_PROXY_KEY}' \\ @@ -61,14 +60,15 @@ curl --request POST \\ Helicone takes security seriously. For provider keys, we use advanced AEAD encryption with transparent column-level encryption, ensuring these keys are safe even from database dumps. As for proxy keys, we employ a one-way hash, meaning once generated, they cannot be reverse-engineered. - ## Future Roadmap As we continue to refine and expand Vault, here's a glimpse into the future additions we're excited about: + - **Cost Rate Limits:** Set expenditure caps based on proxy key IDs to manage and control costs. - **Request Rate Limits:** Control the frequency of requests on a per proxy key ID basis, preventing any misuse or overuse. - ## Moving Forward with Vault As businesses grow and adapt, so do their needs. Helicone is committed to innovating and providing solutions like Vault to meet these demands. We believe in simplifying processes while maximizing security and efficiency. With Vault, we're one step closer to that vision. We're excited to see how Vault empowers your business and are always here to support your journey. + + diff --git a/bifrost/app/blog/blogs/weights-and-biases/src.mdx b/bifrost/app/blog/blogs/weights-and-biases/src.mdx index cf1e9e6605..5db890712a 100644 --- a/bifrost/app/blog/blogs/weights-and-biases/src.mdx +++ b/bifrost/app/blog/blogs/weights-and-biases/src.mdx @@ -1,27 +1,22 @@ - ![Helicone vs. Weights and Biases](/static/blog/weights-and-biases.webp) - ## The Observability Platform Designed for Modern LLMs -Weights and Biases (WandB) is an established machine learning platform offering a wide array of features for training models, such as high-level experiment tracking and visualization. +Weights and Biases (WandB) is an established machine learning platform offering a wide array of features for training models, such as high-level experiment tracking and visualization. -However, training modern LLMs is generally less complex than traditional ML models, so its extensive features can often make the platform feel overloaded and add unnecessary confusion. +However, training modern LLMs is generally less complex than traditional ML models, so its extensive features can often make the platform feel overloaded and add unnecessary confusion. **Helicone offers something better:** all the essential tools specifically designed for language model observability — **without the clutter**. 
- ## Compare -**Helicone is more cost-effective, user-friendly, and easy to integrate.** It empowers both your technical and non-technical teams to efficiently track and manage production metrics such as *latency, costs,* and *time to first tokens*. - +**Helicone is more cost-effective, user-friendly, and easy to integrate.** It empowers both your technical and non-technical teams to efficiently track and manage production metrics such as _latency, costs,_ and _time to first tokens_. Weights and Biases' primary focus is to offer an infrastructure for managing and scaling machine learning workflows. WandB concentrates on managing the end-to-end machine learning lifecycle, from data preparation and model development to deployment and monitoring. - ### Cost and Pricing Model -- **Helicone:** Offers *unlimited seats* and charges based on *usage*, making it a more cost-effective solution ideal for startups, growing teams, and organizations with fluctuating usage patterns. +- **Helicone:** Offers _unlimited seats_ and charges based on _usage_, making it a more cost-effective solution ideal for startups, growing teams, and organizations with fluctuating usage patterns. - **WandB:** Charges per seat, which can quickly add up for larger teams. ### Ease of Integration @@ -39,8 +34,6 @@ Weights and Biases' primary focus is to offer an infrastructure for managing and - **Helicone:** Excels in tracking production metrics such as cost and provides prompt tracking, making it particularly suited for monitoring modern LLMs. - **WandB:** Focuses more on classic ML tasks and offers better integrations for running evaluations, model versioning and giving users more control over their experiments. - - ## **Why Helicone Wins** ### The Developer Experience @@ -53,24 +46,37 @@ Weights and Biases provides detailed logging and tracing, which can be resource- ### Cost-effectiveness -Helicone operates on a volumetric pricing model, making it a more cost-effective option if you have high-volume usage. In addition, the first 100,000 requests per month are free. +Helicone operates on a volumetric pricing model, making it a more cost-effective option if you have high-volume usage. In addition, the first 100,000 requests per month are free. ### Reliability and Scalability Both Helicone and WandB are **open-source** and can **handle massive scale**. With Helicone, we integrated Kafka into our core data pipeline to ensure 100% log coverage, and use Cloudflare Workers to ensure sub-millisecond latency impact. - - - ## Helicone vs. Weights and Biases, which is worth the money? **Helicone is ideal if you want a simple way to track and manage production metrics for your LLM.** With the lightweight design, seamless integration with your preferred model provider, and cost-effective pricing, it's the perfect choice for developers working with LLMs. -Weights and Biases is ideal for developers who need deep insights into model performance, detailed experiment tracking, and fine-tuning capabilities. - - - -→ [Sign up](https://www.helicone.ai/pricing) for Helicone for free. - -→ We want to [hear about your use case](https://www.helicone.ai/contact)! - +Weights and Biases is ideal for developers who need deep insights into model performance, detailed experiment tracking, and fine-tuning capabilities. 
+ + + +### You might be interested in + +- + Comparing Langsmith vs Helicone + +- + Comparing Braintrust vs Helicone + +- + Comparing Arize AI vs Helicone + + + diff --git a/bifrost/app/blog/blogs/when-to-finetune/src.mdx b/bifrost/app/blog/blogs/when-to-finetune/src.mdx index 465144df37..4b78770a96 100644 --- a/bifrost/app/blog/blogs/when-to-finetune/src.mdx +++ b/bifrost/app/blog/blogs/when-to-finetune/src.mdx @@ -1,10 +1,12 @@ As developers at the forefront of AI innovation, we're constantly exploring ways to optimize our applications. With the rise of large language models (LLMs) like GPT-4 and LLaMA, a question that often surfaces is: **"Should we fine-tune our models?"** -## **Rule of Thumb: DON'T FINE-TUNE** +![When to fine-tune](/static/blog/when-to-finetune/finetune-cover.webp) + +## Rule of Thumb: DON'T FINE-TUNE Fine-tuning might seem like the go-to solution for enhancing model performance, but it's not always the silver bullet it's made out to be. In fact, fine-tuning is only beneficial in a narrow set of scenarios, and diving into it without careful consideration can lead to more problems than solutions. -### **The Limitations of Fine-Tuning** +### The Limitations of Fine-Tuning 1. **Narrow Applicability**: Fine-tuning shines in well-defined, repeatable problems where the desired output is consistent and predictable. Outside of these cases, it can introduce unnecessary complexity. @@ -18,11 +20,11 @@ Fine-tuning might seem like the go-to solution for enhancing model performance, --- -## **When Fine-Tuning Makes Sense** +## When Fine-Tuning Makes Sense So, when _should_ you consider fine-tuning? Only in high-cost, high-accuracy use cases where the benefits clearly outweigh the drawbacks. -### **Ideal Scenarios for Fine-Tuning** +### Ideal Scenarios for Fine-Tuning - **Highly Specialized Tasks**: When dealing with extremely specific domains like legal contract analysis or medical diagnosis, where precision is paramount. @@ -34,7 +36,7 @@ _Think of fine-tuning as customizing a race car for a specific track. It perform --- -## **The Downsides of Fine-Tuning** +## The Downsides of Fine-Tuning ### 1. **Reduced Generalization** @@ -52,11 +54,11 @@ Fine-tuning requires significant computational power and storage, leading to hig --- -## **The Rising Power of Base Models** +## The Rising Power of Base Models One of the most compelling reasons to reconsider fine-tuning is the rapid advancement of base models. They're becoming faster, cheaper, and more powerful at an unprecedented rate. -### **Benefits of Sticking with Base Models** +### Benefits of Sticking with Base Models Sticking with base models offers several advantages. They maintain versatility, possessing a broad understanding that makes them adaptable to a wide range of tasks without additional training. They are cost-effective, reducing the need for expensive fine-tuning processes and infrastructure. Moreover, they provide future-proofing; as new models are released, you can immediately leverage their improved capabilities without the lag of retraining. @@ -64,47 +66,47 @@ _Using the latest smartphone right out of the box instead of customizing an olde --- -## **Alternatives to Fine-Tuning** +## Alternatives to Fine-Tuning Before jumping into fine-tuning, consider other strategies that can enhance your application's performance without the associated downsides. -### **Prompt Engineering** +### Prompt Engineering -Crafting better prompts can guide the model to produce more accurate and relevant outputs. 
This approach is cost-effective and doesn't require altering the model itself. +Crafting better prompts can guide the model to produce more accurate and relevant outputs. This approach is cost-effective and doesn't require altering the model itself. - **Example**: Instead of fine-tuning for customer service responses, develop prompts that guide the model to respond empathetically and professionally. -### **Few-Shot Learning** +### Few-Shot Learning -Providing the model with a few examples within the prompt can help it understand the desired output format or style. +Providing the model with a few examples within the prompt can help it understand the desired output format or style. - **Example**: Include sample inputs and desired outputs in your prompt to help the model generate code snippets in a specific programming language. -### **Utilizing Specialized APIs** +### Utilizing Specialized APIs Many providers offer specialized endpoints optimized for certain tasks. Leveraging these can save you the hassle of fine-tuning while still achieving high performance. - **Example**: Use OpenAI's GPT-4 Turbo with Vision API for image analysis and text generation tasks, or Anthropic's Claude 3 Opus for complex reasoning and analysis, instead of fine-tuning a general language model for these specific capabilities. -### **Retrieval-Augmented Generation (RAG)** +### Retrieval-Augmented Generation (RAG) RAG combines the power of large language models with external knowledge retrieval, allowing the model to access and utilize specific information without fine-tuning. -- **Example**: Instead of fine-tuning a model on your company's documentation, implement a RAG system that retrieves relevant information from your knowledge base and incorporates it into the model's responses. +- **Example**: Instead of fine-tuning a model on your company's documentation, build a RAG system that retrieves relevant information from your knowledge base and incorporates it into the model's responses. -### **Chain-of-Thought Prompting** +### Chain-of-Thought Prompting -This technique involves breaking down complex tasks into smaller, logical steps within the prompt, guiding the model through a reasoning process. +The chain-of-thought prompting technique involves breaking down complex tasks into smaller, logical steps within the prompt, guiding the model through a reasoning process. - **Example**: For solving math problems, provide a step-by-step breakdown in the prompt to guide the model's thought process, rather than fine-tuning it on mathematical reasoning. -### **Constrained Decoding** +### Constrained Decoding Use techniques like guided or controlled text generation to restrict the model's outputs without fine-tuning. This approach can be particularly effective for generating secure code. - **Example**: Implement custom decoding strategies to ensure the model generates code that adheres to specific security patterns or avoids known vulnerabilities. -[Recent research](https://arxiv.org/abs/2405.00218) has shown that constrained decoding can be more effective than techniques like prefix tuning for improving the security of code generation, without sacrificing functional correctness. Their work demonstrates that constrained decoding: +Recent research has shown that constrained decoding can be more effective than techniques like prefix tuning for improving the security of code generation, without sacrificing functional correctness. 
Their work demonstrates that constrained decoding: - Does not require a specialized training dataset - Can significantly improve the security of code generated by large language models @@ -112,13 +114,13 @@ Use techniques like guided or controlled text generation to restrict the model's This approach offers a promising direction for enhancing code security without the need for fine-tuning, making it a valuable alternative to consider in your AI development pipeline. -### **Ensemble Methods** +### Ensemble Methods Combine outputs from multiple models or API calls to improve accuracy and robustness without fine-tuning individual models. - **Example**: Use different models for various subtasks of a complex problem, then aggregate their outputs for a final result. For instance, use one model for sentiment analysis and another for entity recognition in a text analysis pipeline. -### **Mixture of Agents** +### Mixture of Agents Utilize multiple AI agents with different specializations or prompts to collaborate on complex tasks, simulating a team of experts. @@ -128,21 +130,21 @@ This approach differs from traditional ensemble methods by focusing on task divi --- -## **Making the Right Choice** +## Making the Right Choice Deciding whether to fine-tune should be a strategic decision based on a clear cost-benefit analysis. -### **Questions to Consider** +### Questions to Consider - **Is the task highly specialized and unmanageable with the base model?** - - **Are the performance gains worth the increased costs and maintenance?** - - **Will fine-tuning significantly impact the user experience or outcomes?** If the answer to these questions is a resounding yes, then fine-tuning might be the right path. Otherwise, exploring alternative methods is likely more beneficial. -## **Stay Ahead of the Curve** +--- + +## Stay Ahead of the Curve As base models continue to evolve, staying updated with the latest releases can offer substantial benefits without the overhead of fine-tuning. @@ -154,11 +156,11 @@ As base models continue to evolve, staying updated with the latest releases can By adopting a flexible and forward-thinking approach, you can ensure your AI applications remain competitive and effective in a rapidly changing landscape. ---- +### Further Resources -## **Further Resources** +- Helicone - Curate Datasets and Fine-tune Models +- Hugging Face - Fine-Tuning with Hugging Face +- OpenPipe - Fine-Tuning Best Practices: Training Data +- OpenPipe - Fine-Tuning Best Practices: Models -- **Helicone Fine-Tuning Guide**: [Fine-Tuning Models in Helicone](https://docs.helicone.ai/features/fine-tuning) -- **Hugging Face**: [Fine-Tuning with Hugging Face](https://huggingface.co/docs/transformers/main/en/training) -- **OpenPipe**: [Fine-Tuning Best Practices: Training Data](https://openpipe.ai/blog/fine-tuning-best-practices-series-introduction-and-chapter-1-training-data) -- **OpenPipe**: [Fine-Tuning Best Practices: Models](https://openpipe.ai/blog/fine-tuning-best-practices-chapter-2-models) + diff --git a/bifrost/components/blog/Questions.tsx b/bifrost/components/blog/Questions.tsx new file mode 100644 index 0000000000..6d904db392 --- /dev/null +++ b/bifrost/components/blog/Questions.tsx @@ -0,0 +1,35 @@ +import React from "react"; + +interface QuestionsProps {} + +export function Questions({}: QuestionsProps) { + return ( + <> +
+
+

+ Questions or feedback? +

+

+ Is the information out of date? Please{" "} + + raise an issue + {" "} + or{" "} + + contact us + + , we'd love to hear from you! +

+
+ + ); +} diff --git a/bifrost/mdx-components.tsx b/bifrost/mdx-components.tsx index a18d8af9ac..12f1efa89d 100644 --- a/bifrost/mdx-components.tsx +++ b/bifrost/mdx-components.tsx @@ -1,11 +1,13 @@ import type { MDXComponents } from "mdx/types"; -import { CallToAction } from "@/components/blog/CallToAction" -import { BottomLine } from "@/components/blog/BottomLine" +import { CallToAction } from "@/components/blog/CallToAction"; +import { BottomLine } from "@/components/blog/BottomLine"; +import { Questions } from "@/components/blog/Questions"; export function useMDXComponents(components: MDXComponents): MDXComponents { return { ...components, CallToAction: CallToAction, BottomLine: BottomLine, + Questions: Questions, }; } diff --git a/bifrost/public/static/blog/ai-agent-builders/pydantic-ai.webp b/bifrost/public/static/blog/ai-agent-builders/pydantic-ai.webp new file mode 100644 index 0000000000..5dce591781 Binary files /dev/null and b/bifrost/public/static/blog/ai-agent-builders/pydantic-ai.webp differ