ParabolInc · mattkrick · Mar 29, 2024 · Mar 7, 2024 · Mar 8, 2024 · Mar 8, 2024
diff --git a/.env.example b/.env.example
@@ -14,7 +14,7 @@ SOCKET_PORT='3001'
 # AI MODELS
 AI_EMBEDDING_MODELS='[{"model": "text-embeddings-inference:llmrails/ember-v1", "url": "http://localhost:3040/"}]'
 AI_GENERATION_MODELS='[{"model": "text-generation-inference:TheBloke/zephyr-7b-beta", "url": "http://localhost:3050/"}]'
-AI_EMBEDDER_ENABLED='true'
+AI_EMBEDDER_WORKERS='1'
 
 # APPLICATION
 # AMPLITUDE_WRITE_KEY='key_AMPLITUDE_WRITE_KEY'

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -22,7 +22,7 @@ jobs:
       id-token: "write"
     services:
       postgres:
-        image: pgvector/pgvector:pg15
+        image: pgvector/pgvector:0.6.2-pg15
         # This env variables must be the same in the file PARABOL_BUILD_ENV_PATH
         env:
           POSTGRES_PASSWORD: "temppassword"
@@ -143,6 +143,6 @@ jobs:
         uses: ravsamhq/notify-slack-action@v2
         with:
           status: ${{ job.status }}
-          notify_when: 'failure'
+          notify_when: "failure"
         env:
           SLACK_WEBHOOK_URL: ${{ secrets.SLACK_GH_ACTIONS_NOTIFICATIONS }}
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -17,7 +17,7 @@ jobs:
       id-token: "write"
     services:
       postgres:
-        image: postgres:15.4
+        image: pgvector/pgvector:0.6.2-pg15
         # This env variables must be the same in the file PARABOL_BUILD_ENV_PATH
         env:
           POSTGRES_PASSWORD: "temppassword"
@@ -78,17 +78,13 @@ jobs:
           yarn db:migrate
           yarn pg:migrate up
           yarn pg:build
-          yarn pg:generate
 
       - name: Build for testing
         run: yarn build
 
       - name: Verify source is clean
         run: git diff --quiet HEAD || (echo "Changes in generated files detected"; git diff; exit 1)
 
-      - name: Check Code Quality
-        run: yarn codecheck
-
       - name: Run Predeploy for Testing
         run: yarn predeploy
 
@@ -100,6 +96,12 @@ jobs:
           wait-on: |
             http://localhost:3000/graphql
 
+      - name: Kysely Codegen
+        run: yarn pg:generate
+
+      - name: Check Code Quality
+        run: yarn codecheck
+
       - name: Run server tests
         run: yarn test:server -- --reporters=default --reporters=jest-junit
         env:
@@ -139,6 +141,6 @@ jobs:
         uses: ravsamhq/notify-slack-action@v2
         with:
           status: ${{ job.status }}
-          notify_when: 'failure'
+          notify_when: "failure"
         env:
           SLACK_WEBHOOK_URL: ${{ secrets.SLACK_GH_ACTIONS_NOTIFICATIONS }}
diff --git a/docker/images/parabol-ubi/README.md b/docker/images/parabol-ubi/README.md
@@ -16,21 +16,21 @@ Recommended:
 
 ## Variables
 
-| Name                 | Description                                                                                                             | Possible values                                       | Recommended value                                                   |
-| -------------------- | ----------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------- | ------------------------------------------------------------------- |
-| `postgresql_tag`     | PostgreSQL version from the [Docker image](https://hub.docker.com/_/postgres)                                           | `Any tag`                                             | `15.4`                                                              |
-| `rethinkdb_tag`      | RethinkDB version from the [Docker image](https://hub.docker.com/_/rethinkdb)                                           | `Any tag`                                             | `2.4.2`                                                             |
-| `redis_tag`          | Redis version from the [Docker image](https://hub.docker.com/_/redis)                                                   | `Any tag`                                             | `7.0-alpine`                                                        |
-| `_BUILD_ENV_PATH`    | File `.env` used by the application during the build process                                                            | `Relative path from the root level of the repository` | `docker/parabol-ubi/environments/basic-env`             |
-| `_NODE_VERSION`      | Node version, used by Docker to use the Docker image node:\_NODE_VERSION as base image to build                         | `Same as in root package.json`                        |                                                                     |
-| `_DOCKERFILE`        | Dockerfile used to build the image                                                                                      | `Relative path from the root level of the repository` | `./docker/parabol-ubi/dockerfiles/basic.dockerfile` |
-| `_DOCKER_REPOSITORY` | The destination repository                                                                                              | `String`                                              | `parabol`                                                           |
-| `_DOCKER_TAG`        | Tag for the produced image                                                                                              | `String`                                              |                                                                     |
+| Name                 | Description                                                                                     | Possible values                                       | Recommended value                                   |
+| -------------------- | ----------------------------------------------------------------------------------------------- | ----------------------------------------------------- | --------------------------------------------------- |
+| `postgresql_tag`     | PostgreSQL version from the [Docker image](https://hub.docker.com/r/pgvector/pgvector)          | `Any tag`                                             | `0.6.2-pg15`                                        |
+| `rethinkdb_tag`      | RethinkDB version from the [Docker image](https://hub.docker.com/_/rethinkdb)                   | `Any tag`                                             | `2.4.2`                                             |
+| `redis_tag`          | Redis version from the [Docker image](https://hub.docker.com/_/redis)                           | `Any tag`                                             | `7.0-alpine`                                        |
+| `_BUILD_ENV_PATH`    | File `.env` used by the application during the build process                                    | `Relative path from the root level of the repository` | `docker/parabol-ubi/environments/basic-env`         |
+| `_NODE_VERSION`      | Node version, used by Docker to use the Docker image node:\_NODE_VERSION as base image to build | `Same as in root package.json`                        |                                                     |
+| `_DOCKERFILE`        | Dockerfile used to build the image                                                              | `Relative path from the root level of the repository` | `./docker/parabol-ubi/dockerfiles/basic.dockerfile` |
+| `_DOCKER_REPOSITORY` | The destination repository                                                                      | `String`                                              | `parabol`                                           |
+| `_DOCKER_TAG`        | Tag for the produced image                                                                      | `String`                                              |                                                     |
 
 Example of variables:
 
 ```commandLine
-export postgresql_tag=15.4; \
+export postgresql_tag=0.6.2-pg15; \
 export rethinkdb_tag=2.4.2; \
 export redis_tag=7.0-alpine; \
 export _BUILD_ENV_PATH=docker/parabol-ubi/environments/basic-env; \
@@ -61,7 +61,7 @@ cp $_BUILD_ENV_PATH ./.env
 > :warning: Stop all database containers you might have running before executing the following command. If other database containers are running, some ports might be already taken.
 
 ```commandLine
-docker run --name temp-postgres -e POSTGRES_PASSWORD=temppassword -e POSTGRES_USER=tempuser -e POSTGRES_DB=tempdb -d -p 5432:5432 postgres:$postgresql_tag && \
+docker run --name temp-postgres -e POSTGRES_PASSWORD=temppassword -e POSTGRES_USER=tempuser -e POSTGRES_DB=tempdb -d -p 5432:5432 pgvector/pgvector:$postgresql_tag && \
 docker run --name temp-rethinkdb -d -p 28015:28015 -p 29015:29015 -p 8080:8080 rethinkdb:$rethinkdb_tag && \
 docker run --name temp-redis -d -p 6379:6379 redis:$redis_tag
 ```

diff --git a/docker/images/parabol-ubi/environments/pipeline b/docker/images/parabol-ubi/environments/pipeline
@@ -54,3 +54,7 @@ STRIPE_PUBLISHABLE_KEY='pk_test_MNoKbCzQX0lhktuxxI7M14wd'
 STRIPE_SECRET_KEY=''
 STRIPE_WEBHOOK_SECRET=''
 HUBSPOT_API_KEY=''
+AI_EMBEDDING_MODELS='[{"model": "text-embeddings-inference:llmrails/ember-v1", "url": "http://localhost:3040/"}]'
+AI_GENERATION_MODELS='[{"model": "text-generation-inference:TheBloke/zephyr-7b-beta", "url": "http://localhost:3050/"}]'
+AI_EMBEDDER_WORKERS='1'
+POSTGRES_USE_PGVECTOR='true'
diff --git a/docker/stacks/development/docker-compose.yml b/docker/stacks/development/docker-compose.yml
@@ -70,7 +70,7 @@ services:
     networks:
       parabol-network:
   text-embeddings-inference:
-    image: ghcr.io/huggingface/text-embeddings-inference:cpu-0.6
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2
     command:
       - "--model-id=llmrails/ember-v1"
     platform: linux/x86_64

diff --git a/...ks/single-tenant-host/docker-compose.yaml → ...cks/single-tenant-host/docker-compose.yml b/...ks/single-tenant-host/docker-compose.yaml → ...cks/single-tenant-host/docker-compose.yml
@@ -17,7 +17,7 @@ services:
   postgres:
     container_name: postgres
     profiles: ["databases"]
-    image: postgres:15.4
+    image: pgvector/pgvector:0.6.2-pg15
     restart: always
     env_file: .env
     environment:

diff --git a/package.json b/package.json
@@ -103,7 +103,7 @@
     "html-webpack-plugin": "^5.5.0",
     "husky": "^7.0.4",
     "jscodeshift": "^0.14.0",
-    "kysely": "^0.27.2",
+    "kysely": "^0.27.3",
     "kysely-codegen": "^0.11.0",
     "lerna": "^6.4.1",
     "mini-css-extract-plugin": "^2.7.2",

diff --git a/packages/client/shared/gqlIds/EmbedderChannelId.ts b/packages/client/shared/gqlIds/EmbedderChannelId.ts
@@ -0,0 +1,9 @@
+/**
+ * Builds and parses the channel id used to address a single embedder server.
+ * The id has the form `embedder:<serverId>`.
+ */
+export const EmbedderChannelId = {
+  /** Returns the channel id for the given server id. */
+  join: (serverId: string) => `embedder:${serverId}`,
+  /**
+   * Returns the server id portion of a channel id.
+   * Everything after the first `:` is treated as the server id, so a server id
+   * that itself contains `:` survives a join/split round trip.
+   */
+  split: (id: string) => {
+    const [, ...serverIdParts] = id.split(':')
+    return serverIdParts.join(':')
+  }
+}
+
+export default EmbedderChannelId
diff --git a/packages/client/types/generics.ts b/packages/client/types/generics.ts
@@ -100,6 +100,9 @@ export type WithFieldsAsType<TObj, NType, F> = {
     : TObj[K]
 }
 
+/**
+ * A tuple type holding exactly N elements of type T.
+ * R is the accumulator: T is appended to it recursively until its length matches N.
+ */
+export type Tuple<T, N, R extends T[] = []> = R['length'] extends N
+  ? R
+  : Tuple<T, N, [...R, T]>
+/**
+ * Converts a numeric string literal type into the matching number literal type.
+ * Resolves to `never` when the string does not spell a number.
+ */
+export type ParseInt<T extends string> = T extends `${infer Parsed extends number}`
+  ? Parsed
+  : never
+
 declare global {
   interface Array<T> {
     findLastIndex(predicate: (value: T, index: number, obj: T[]) => unknown, thisArg?: any): number

diff --git a/packages/embedder/EMBEDDER_JOB_PRIORITY.ts b/packages/embedder/EMBEDDER_JOB_PRIORITY.ts
@@ -0,0 +1,6 @@
+/**
+ * Priority values assigned to embedder jobs.
+ * The job queue stream selects jobs ordered by `priority` ascending,
+ * so a job with a lower number is picked before one with a higher number.
+ */
+export const EMBEDDER_JOB_PRIORITY = {
+  MEETING: 40,
+  DEFAULT: 50,
+  TOPIC_HISTORY: 80,
+  NEW_MODEL: 90
+} as const
diff --git a/packages/embedder/EmbeddingsJobQueueStream.ts b/packages/embedder/EmbeddingsJobQueueStream.ts
@@ -0,0 +1,72 @@
+import {Selectable, sql} from 'kysely'
+import ms from 'ms'
+import sleep from 'parabol-client/utils/sleep'
+import 'parabol-server/initSentry'
+import getKysely from 'parabol-server/postgres/getKysely'
+import {DB} from 'parabol-server/postgres/pg'
+import RootDataLoader from '../server/dataloader/RootDataLoader'
+import {processJob} from './processJob'
+import {Logger} from '../server/utils/Logger'
+
+/** A row of the `EmbeddingsJobQueue` table as returned by a select. */
+export type DBJob = Selectable<DB['EmbeddingsJobQueue']>
+/** A queue row whose `jobType` is 'embed'; `jobData` names the metadata row and the model to use. */
+export type EmbedJob = DBJob & {
+  jobType: 'embed'
+  jobData: {
+    embeddingsMetadataId: number
+    model: string
+  }
+}
+/** A queue row whose `jobType` is 'rerank'; `jobData` carries the discussion ids involved. */
+export type RerankJob = DBJob & {jobType: 'rerank'; jobData: {discussionIds: string[]}}
+/** Any job the queue stream can yield. */
+export type Job = EmbedJob | RerankJob
+
+/**
+ * An endless async iterator over the `EmbeddingsJobQueue` table.
+ * Each call to `next()` claims one job, processes it, and yields it.
+ * The stream never reports `done` by itself; it only ends via `return()` or `throw()`.
+ */
+export class EmbeddingsJobQueueStream implements AsyncIterableIterator<Job> {
+  [Symbol.asyncIterator]() {
+    return this
+  }
+  // a single dataloader instance is shared by every job processed by this stream
+  dataLoader = new RootDataLoader({maxBatchSize: 1000})
+
+  /**
+   * Claims the next job, processes it and returns it.
+   * Queued jobs are tried first; if none exist, a failed job whose `retryAfter`
+   * has passed is tried. When neither exists, waits one minute and polls again.
+   * The row is deleted only when `processJob` reports success.
+   */
+  async next(): Promise<IteratorResult<Job>> {
+    // NOTE(review): assumes getKysely() returns a shared instance, so resolving it
+    // once per next() call is equivalent to resolving it on every poll; confirm
+    const pg = getKysely()
+    // Selects one candidate row with FOR UPDATE SKIP LOCKED and marks it `running`
+    // in the same statement, so concurrent workers do not claim the same row
+    const getJob = (isFailed: boolean) => {
+      return pg
+        .with(
+          (cte) => cte('ids').materialized(),
+          (db) =>
+            db
+              .selectFrom('EmbeddingsJobQueue')
+              .select('id')
+              .orderBy(['priority'])
+              .$if(!isFailed, (db) => db.where('state', '=', 'queued'))
+              .$if(isFailed, (db) =>
+                db.where('state', '=', 'failed').where('retryAfter', '<', new Date())
+              )
+              .limit(1)
+              .forUpdate()
+              .skipLocked()
+        )
+        .updateTable('EmbeddingsJobQueue')
+        .set({state: 'running', startAt: new Date()})
+        .where('id', '=', sql<number>`ANY(SELECT id FROM ids)`)
+        .returningAll()
+        .executeTakeFirst()
+    }
+    const claimJob = async () => (await getJob(false)) || (await getJob(true))
+
+    // Poll in a loop instead of recursing: a recursive `return this.next()` keeps one
+    // more pending async frame alive for every minute the queue stays empty
+    let job = await claimJob()
+    while (!job) {
+      Logger.log('JobQueueStream: no jobs found')
+      // queue is empty, so sleep for a while
+      await sleep(ms('1m'))
+      job = await claimJob()
+    }
+
+    const isSuccessful = await processJob(job as Job, this.dataLoader)
+    if (isSuccessful) {
+      await pg.deleteFrom('EmbeddingsJobQueue').where('id', '=', job.id).executeTakeFirstOrThrow()
+    }
+    // on failure the row is left in place; presumably processJob records the failure (not visible here)
+    return {done: false, value: job as Job}
+  }
+  /** Ends the iteration without a value. */
+  return() {
+    return Promise.resolve({done: true as const, value: undefined})
+  }
+  /** Ends the iteration, handing the error back as the final value. */
+  throw(error: any) {
+    return Promise.resolve({done: true, value: error})
+  }
+}
diff --git a/packages/embedder/README.md b/packages/embedder/README.md
@@ -3,27 +3,14 @@
 This service builds embedding vectors for semantic search and for other AI/ML
 use cases. It does so by:
 
-1.  Updating a list of all possible items to create embedding vectors for and
-    storing that list in the `EmbeddingsMetadata` table
-2.  Adding these items in batches to the `EmbeddingsJobQueue` table and a redis
-    priority queue called `embedder:queue`
-3.  Allowing one or more parallel embedding services to calculate embedding
-    vectors (EmbeddingJobQueue states transistion from `queued` -> `embedding`,
-    then `embedding` -> [deleting the `EmbeddingJobQueue` row]
-
-    In addition to deleteing the `EmbeddingJobQueue` row, when a job completes
-    successfully:
-
-    - A row is added to the model table with the embedding vector; the
-      `EmbeddingMetadataId` field on this row points the appropriate
-      metadata row on `EmbeddingsMetadata`
-    - The `EmbeddingsMetadata.models` array is updated with the name of the
-      table that the embedding has been generated for
-
-4.  This process repeats forever using a silly polling loop
-
-In the future, it would be wonderful to enhance this service such that it were
-event driven.
+1. Homogenizes different types of data into a single `EmbeddingsMetadata` table
+2. Each new row in `EmbeddingsMetadata` creates a new row in `EmbeddingsJobQueue` for each model
+3. Uses PG to pick a job from the queue and sets the job from `queued` -> `running`,
+   then `running` -> [deleting the `EmbeddingsJobQueue` row]
+4. Embedding involves creating a `fullText` from the work item and then a vector from that `fullText`
+5. New jobs to add metadata are sent via redis streams from the GQL Executor
+6. If embedding fails, the application increments the `retryCount` and increases the `retryAfter` if a retry is desired
+7. If a job gets stalled, a process that runs every 5 minutes will look for jobs older than 5 minutes and reset them to `queued`
 
 ## Prerequisites
 
@@ -37,10 +24,9 @@ The predeploy script checks for an environment variable
 The Embedder service takes no arguments and is controlled by the following
 environment variables, here given with example configuration:
 
-- `AI_EMBEDDER_ENABLE`: enable/disable the embedder service from
-  performing work, or sleeping indefinitely
+- `AI_EMBEDDER_WORKERS`: How many workers should simultaneously pick jobs from the queue. If less than 1, disabled.
 
-`AI_EMBEDDER_ENABLED='true'`
+`AI_EMBEDDER_WORKERS='1'`
 
 - `AI_EMBEDDING_MODELS`: JSON configuration for which embedding models
   are enabled. Each model in the array will be instantiated by
@@ -69,3 +55,10 @@ environment variables, here given with example configuration:
 The Embedder service is stateless and takes no arguments. Multiple instances
 of the service may be started in order to match embedding load, or to
 catch up on history more quickly.
+
+## Resources
+
+### PG as a Job Queue
+
+- https://leontrolski.github.io/postgres-as-queue.html
+- https://www.2ndquadrant.com/en/blog/what-is-select-skip-locked-for-in-postgresql-9-5/
diff --git a/packages/embedder/addEmbeddingsMetadata.ts b/packages/embedder/addEmbeddingsMetadata.ts
@@ -0,0 +1,15 @@
+import {addEmbeddingsMetadataForRetrospectiveDiscussionTopic} from './addEmbeddingsMetadataForRetrospectiveDiscussionTopic'
+import {MessageToEmbedder} from './custom'
+
+/**
+ * Adds embeddings metadata for every object type named in the message.
+ * The remaining message fields are passed through unchanged to the per-type handler.
+ * Resolves with one result per object type; rejects if any handler rejects or
+ * if an object type is not recognized.
+ */
+export const addEmbeddingsMetadata = async ({objectTypes, ...options}: MessageToEmbedder) => {
+  return Promise.all(
+    // The mapper is async so an unknown type becomes a rejected promise inside Promise.all.
+    // A synchronous throw here would abort the map and leave the promises already
+    // started for earlier types unobserved.
+    objectTypes.map(async (type) => {
+      switch (type) {
+        case 'retrospectiveDiscussionTopic':
+          return addEmbeddingsMetadataForRetrospectiveDiscussionTopic(options)
+        default:
+          throw new Error(`Invalid object type: ${type}`)
+      }
+    })
+  )
+}