# Patch overview (commentary placed ahead of the first "diff --git" header; no hunk content is changed).
#
# - .gitlab/ci/global.gitlab-ci.yml: every `.use-pg*` services template replaces the stock
#   `postgres:<12|13|14>` image with
#   `${REGISTRY_HOST}/${REGISTRY_GROUP}/gitlab-build-images:postgres-<ver>-pgvector-0.4.1`
#   and adds `alias: postgres` to that service entry (presumably so jobs keep reaching the
#   service under the hostname `postgres` -- confirm against the CI services documentation).
# - Gemfile, Gemfile.lock, Gemfile.checksum: add the `neighbor` gem pinned to `~> 0.2.3`;
#   the lockfile records its dependency on `activerecord (>= 5.2)`.
# - config/database.yml.postgresql and config/database.yml.decomposed-postgresql: append an
#   `embedding` entry (database `gitlabhq_embedding_test`) in the hunk whose context is `test: &test`.
# - ee/app/models/embedding/tanuki_bot_mvc.rb: new `Embedding::TanukiBotMvc` model on table
#   `tanuki_bot_mvc`; its `neighbor_for` scope delegates to
#   `nearest_neighbors(:embedding, embedding, distance: 'inner_product')`.
# - ee/db/embedding/: migration creating `tanuki_bot_mvc` (vector(1536) `embedding`, `url` <= 2048,
#   `content` <= 32768, `metadata` jsonb, nullable `chroma_id` <= 512 with a unique index), the matching
#   schema_migrations entry, structure.sql additions, and the docs/tanuki_bot_mvc.yml dictionary entry.
# - ee/spec/: a `tanuki_bot_mvc` factory and a model spec covering `.neighbor_for`.
# - scripts/prepare_build.sh: deletes the `embedding:` section from config/database.yml unless
#   `CI_JOB_NAME` matches `rspec-ee`.
#
# NOTE(review): ee/db/embedding/docs/tanuki_bot_mvc.yml leaves `description:` and `introduced_by_url:` blank.
# NOTE(review): the factory file is named `tanuki_bot.rb` while the factory it defines is `:tanuki_bot_mvc`.
# NOTE(review): this copy of the patch has its line breaks collapsed; hunk line counts refer to the
#   original, un-collapsed files.
diff --git a/.gitlab/ci/global.gitlab-ci.yml b/.gitlab/ci/global.gitlab-ci.yml index 02e13be9a7eb8181be38173c214ad323538c9ece..23f4aef866c99923ad6e7b1f26bccbba6df878a6 100644 --- a/.gitlab/ci/global.gitlab-ci.yml +++ b/.gitlab/ci/global.gitlab-ci.yml @@ -219,8 +219,9 @@ .use-pg12: services: - - name: postgres:12 + - name: ${REGISTRY_HOST}/${REGISTRY_GROUP}/gitlab-build-images:postgres-12-pgvector-0.4.1 command: ["postgres", "-c", "fsync=off", "-c", "synchronous_commit=off", "-c", "full_page_writes=off"] + alias: postgres - name: redis:6.0-alpine variables: POSTGRES_HOST_AUTH_METHOD: trust @@ -228,8 +229,9 @@ .use-pg13: services: - - name: postgres:13 + - name: ${REGISTRY_HOST}/${REGISTRY_GROUP}/gitlab-build-images:postgres-13-pgvector-0.4.1 command: ["postgres", "-c", "fsync=off", "-c", "synchronous_commit=off", "-c", "full_page_writes=off"] + alias: postgres - name: redis:6.2-alpine variables: POSTGRES_HOST_AUTH_METHOD: trust @@ -237,8 +239,9 @@ .use-pg14: services: - - name: postgres:14 + - name: ${REGISTRY_HOST}/${REGISTRY_GROUP}/gitlab-build-images:postgres-14-pgvector-0.4.1 command: ["postgres", "-c", "fsync=off", "-c", "synchronous_commit=off", "-c", "full_page_writes=off"] + alias: postgres - name: redis:6.2-alpine variables: POSTGRES_HOST_AUTH_METHOD: trust @@ -246,8 +249,9 @@ .use-pg12-es7-ee: services: - - name: postgres:12 + - name: ${REGISTRY_HOST}/${REGISTRY_GROUP}/gitlab-build-images:postgres-12-pgvector-0.4.1 command: ["postgres", "-c", "fsync=off", "-c", "synchronous_commit=off", "-c", "full_page_writes=off"] + alias: postgres - name: redis:6.0-alpine - name: elasticsearch:7.17.6 command: ["elasticsearch", "-E", "discovery.type=single-node", "-E", "xpack.security.enabled=false"] @@ -261,8 +265,9 @@ .use-pg13-es7-ee: services: - - name: postgres:13 + - name: ${REGISTRY_HOST}/${REGISTRY_GROUP}/gitlab-build-images:postgres-13-pgvector-0.4.1 command: ["postgres", "-c", "fsync=off", "-c", "synchronous_commit=off", "-c", "full_page_writes=off"] + alias: 
postgres - name: redis:6.2-alpine - name: elasticsearch:7.17.6 command: ["elasticsearch", "-E", "discovery.type=single-node", "-E", "xpack.security.enabled=false"] @@ -276,8 +281,9 @@ .use-pg14-es7-ee: services: - - name: postgres:14 + - name: ${REGISTRY_HOST}/${REGISTRY_GROUP}/gitlab-build-images:postgres-14-pgvector-0.4.1 command: ["postgres", "-c", "fsync=off", "-c", "synchronous_commit=off", "-c", "full_page_writes=off"] + alias: postgres - name: redis:6.2-alpine - name: elasticsearch:7.17.6 command: ["elasticsearch", "-E", "discovery.type=single-node", "-E", "xpack.security.enabled=false"] @@ -291,8 +297,9 @@ .use-pg13-es8-ee: services: - - name: postgres:13 + - name: ${REGISTRY_HOST}/${REGISTRY_GROUP}/gitlab-build-images:postgres-13-pgvector-0.4.1 command: ["postgres", "-c", "fsync=off", "-c", "synchronous_commit=off", "-c", "full_page_writes=off"] + alias: postgres - name: redis:6.0-alpine - name: elasticsearch:8.6.2 - name: ${REGISTRY_HOST}/${REGISTRY_GROUP}/gitlab-build-images:zoekt-ci-image-1.0 @@ -307,8 +314,9 @@ .use-pg14-es8-ee: services: - - name: postgres:14 + - name: ${REGISTRY_HOST}/${REGISTRY_GROUP}/gitlab-build-images:postgres-14-pgvector-0.4.1 command: ["postgres", "-c", "fsync=off", "-c", "synchronous_commit=off", "-c", "full_page_writes=off"] + alias: postgres - name: redis:6.0-alpine - name: elasticsearch:8.6.2 - name: ${REGISTRY_HOST}/${REGISTRY_GROUP}/gitlab-build-images:zoekt-ci-image-1.0 @@ -323,8 +331,9 @@ .use-pg13-opensearch1-ee: services: - - name: postgres:13 + - name: ${REGISTRY_HOST}/${REGISTRY_GROUP}/gitlab-build-images:postgres-13-pgvector-0.4.1 command: ["postgres", "-c", "fsync=off", "-c", "synchronous_commit=off", "-c", "full_page_writes=off"] + alias: postgres - name: redis:6.0-alpine - name: opensearchproject/opensearch:1.3.5 alias: elasticsearch @@ -339,8 +348,9 @@ .use-pg13-opensearch2-ee: services: - - name: postgres:13 + - name: ${REGISTRY_HOST}/${REGISTRY_GROUP}/gitlab-build-images:postgres-13-pgvector-0.4.1 command: 
["postgres", "-c", "fsync=off", "-c", "synchronous_commit=off", "-c", "full_page_writes=off"] + alias: postgres - name: redis:6.0-alpine - name: opensearchproject/opensearch:2.2.1 alias: elasticsearch @@ -355,8 +365,9 @@ .use-pg14-opensearch1-ee: services: - - name: postgres:14 + - name: ${REGISTRY_HOST}/${REGISTRY_GROUP}/gitlab-build-images:postgres-14-pgvector-0.4.1 command: ["postgres", "-c", "fsync=off", "-c", "synchronous_commit=off", "-c", "full_page_writes=off"] + alias: postgres - name: redis:6.0-alpine - name: opensearchproject/opensearch:1.3.5 alias: elasticsearch @@ -371,8 +382,9 @@ .use-pg14-opensearch2-ee: services: - - name: postgres:14 + - name: ${REGISTRY_HOST}/${REGISTRY_GROUP}/gitlab-build-images:postgres-14-pgvector-0.4.1 command: ["postgres", "-c", "fsync=off", "-c", "synchronous_commit=off", "-c", "full_page_writes=off"] + alias: postgres - name: redis:6.0-alpine - name: opensearchproject/opensearch:2.2.1 alias: elasticsearch diff --git a/Gemfile b/Gemfile index df86744eee15e3f73c397823d924f3c94fef4449..920b0f03f950c0159ed8f411d252280fa0e8aad1 100644 --- a/Gemfile +++ b/Gemfile @@ -30,6 +30,8 @@ gem 'view_component', '~> 2.74.1' # Supported DBs gem 'pg', '~> 1.4.6' +gem 'neighbor', '~> 0.2.3' + gem 'rugged', '~> 1.5' gem 'grape-path-helpers', '~> 1.7.1' diff --git a/Gemfile.checksum b/Gemfile.checksum index 1458f2f442e2b29883f8c3f1b6a6dfbccefbce48..4cb6689ab4c9748c0143c3f0bea97616fb0d6374 100644 --- a/Gemfile.checksum +++ b/Gemfile.checksum @@ -373,6 +373,7 @@ {"name":"mustermann","version":"1.1.1","platform":"ruby","checksum":"0a21cfe505869cce9ce17998db5260344e78df81ae857c07a62143fd30299531"}, {"name":"mustermann-grape","version":"1.0.1","platform":"ruby","checksum":"00ce12b3df66be33ec4304aa9108fb9e1a0689f2a136c96b51c104684f5c5436"}, {"name":"nap","version":"1.1.0","platform":"ruby","checksum":"949691660f9d041d75be611bb2a8d2fd559c467537deac241f4097d9b5eea576"}, 
+{"name":"neighbor","version":"0.2.3","platform":"ruby","checksum":"70887ac2110d0c7ab243ee988f64359b8bb94a63a0c78542bbeef4f33b1933e5"}, {"name":"nenv","version":"0.3.0","platform":"ruby","checksum":"d9de6d8fb7072228463bf61843159419c969edb34b3cef51832b516ae7972765"}, {"name":"net-http-persistent","version":"4.0.1","platform":"ruby","checksum":"2752f4cce05fd1c45e0537c6f3a98fa5a4899efd5f88e63c104ed5f05cbddef9"}, {"name":"net-imap","version":"0.3.4","platform":"ruby","checksum":"a82a59e2a429433dc54cae5a8b2979ffe49da8c66085740811bfa337dc3729b5"}, diff --git a/Gemfile.lock b/Gemfile.lock index 2eeadd20d999e2dd2291862f565fc6f552b136d0..b0e74a31b0508af66ded1c47fc01289a47520a5d 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -985,6 +985,8 @@ GEM mustermann-grape (1.0.1) mustermann (>= 1.0.0) nap (1.1.0) + neighbor (0.2.3) + activerecord (>= 5.2) nenv (0.3.0) net-http-persistent (4.0.1) connection_pool (~> 2.2) @@ -1828,6 +1830,7 @@ DEPENDENCIES mini_magick (~> 4.10.1) minitest (~> 5.11.0) multi_json (~> 1.14.1) + neighbor (~> 0.2.3) net-ldap (~> 0.17.1) net-ntp net-protocol (~> 0.1.3) diff --git a/config/database.yml.decomposed-postgresql b/config/database.yml.decomposed-postgresql index 3348c8cb27785be581f744d79e76fbe5c0e4fb2f..3b5d1ff2ed5623211a489c04b5a5a9da1627e23e 100644 --- a/config/database.yml.decomposed-postgresql +++ b/config/database.yml.decomposed-postgresql @@ -103,3 +103,10 @@ test: &test username: postgres password: host: localhost + embedding: + adapter: postgresql + encoding: unicode + database: gitlabhq_embedding_test + username: postgres + password: + host: localhost diff --git a/config/database.yml.postgresql b/config/database.yml.postgresql index c1b1247b5b086684962afaac515cb19b4c96af47..6d39418485505b20efa864c270323dd7a720bcec 100644 --- a/config/database.yml.postgresql +++ b/config/database.yml.postgresql @@ -124,3 +124,10 @@ test: &test username: postgres password: host: localhost + embedding: + adapter: postgresql + encoding: unicode + database: 
gitlabhq_embedding_test + username: postgres + password: + host: localhost diff --git a/ee/app/models/embedding/tanuki_bot_mvc.rb b/ee/app/models/embedding/tanuki_bot_mvc.rb new file mode 100644 index 0000000000000000000000000000000000000000..a5f0ebc3e25a33aefcfa38f601753f63d21824a2 --- /dev/null +++ b/ee/app/models/embedding/tanuki_bot_mvc.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true + +module Embedding + # This model should only store public content and embeddings + class TanukiBotMvc < Embedding::ApplicationRecord + self.table_name = 'tanuki_bot_mvc' + + has_neighbors :embedding + + scope :neighbor_for, ->(embedding) { nearest_neighbors(:embedding, embedding, distance: 'inner_product') } + end +end diff --git a/ee/db/embedding/docs/tanuki_bot_mvc.yml b/ee/db/embedding/docs/tanuki_bot_mvc.yml new file mode 100644 index 0000000000000000000000000000000000000000..8519d27e3325cdbcb081a999b5fe08006b763dbc --- /dev/null +++ b/ee/db/embedding/docs/tanuki_bot_mvc.yml @@ -0,0 +1,10 @@ +--- +table_name: tanuki_bot_mvc +classes: +- Embedding::TanukiBotMvc +feature_categories: +- global_search +description: +introduced_by_url: +milestone: '16.0' +gitlab_schema: gitlab_embedding diff --git a/ee/db/embedding/migrate/20230420103900_create_tanuki_bot_mvc.rb b/ee/db/embedding/migrate/20230420103900_create_tanuki_bot_mvc.rb new file mode 100644 index 0000000000000000000000000000000000000000..04bde97a5cf65130144d6cd91c02262d6029992b --- /dev/null +++ b/ee/db/embedding/migrate/20230420103900_create_tanuki_bot_mvc.rb @@ -0,0 +1,20 @@ +# frozen_string_literal: true + +class CreateTanukiBotMvc < Gitlab::Database::Migration[2.1] + enable_lock_retries! 
+ + def up + create_table :tanuki_bot_mvc do |t| + t.timestamps_with_timezone null: false + t.vector :embedding, limit: 1536, null: false + t.text :url, null: false, limit: 2048 + t.text :content, null: false, limit: 32768 + t.jsonb :metadata, null: false + t.text :chroma_id, index: { unique: true }, limit: 512 + end + end + + def down + drop_table :tanuki_bot_mvc + end +end diff --git a/ee/db/embedding/schema_migrations/20230420103900 b/ee/db/embedding/schema_migrations/20230420103900 new file mode 100644 index 0000000000000000000000000000000000000000..e32dfff70e405d6522e6f521106b4a1065536417 --- /dev/null +++ b/ee/db/embedding/schema_migrations/20230420103900 @@ -0,0 +1 @@ +295782269f4738b6eb308f53144d7d4358affa39e7246a538d774200088a41d8 \ No newline at end of file diff --git a/ee/db/embedding/structure.sql b/ee/db/embedding/structure.sql index a115fbaf269a918ad16c268e022f4018d9f2c9ef..926f02505b9381a3f7c061f69d843da97b03af10 100644 --- a/ee/db/embedding/structure.sql +++ b/ee/db/embedding/structure.sql @@ -11,8 +11,38 @@ CREATE TABLE schema_migrations ( version character varying NOT NULL ); +CREATE TABLE tanuki_bot_mvc ( + id bigint NOT NULL, + created_at timestamp with time zone NOT NULL, + updated_at timestamp with time zone NOT NULL, + embedding vector(1536) NOT NULL, + url text NOT NULL, + content text NOT NULL, + metadata jsonb NOT NULL, + chroma_id text, + CONSTRAINT check_5df597f0fb CHECK ((char_length(url) <= 2048)), + CONSTRAINT check_67053ce605 CHECK ((char_length(content) <= 32768)), + CONSTRAINT check_e130e042d4 CHECK ((char_length(chroma_id) <= 512)) +); + +CREATE SEQUENCE tanuki_bot_mvc_id_seq + START WITH 1 + INCREMENT BY 1 + NO MINVALUE + NO MAXVALUE + CACHE 1; + +ALTER SEQUENCE tanuki_bot_mvc_id_seq OWNED BY tanuki_bot_mvc.id; + +ALTER TABLE ONLY tanuki_bot_mvc ALTER COLUMN id SET DEFAULT nextval('tanuki_bot_mvc_id_seq'::regclass); + ALTER TABLE ONLY ar_internal_metadata ADD CONSTRAINT ar_internal_metadata_pkey PRIMARY KEY (key); ALTER TABLE 
ONLY schema_migrations ADD CONSTRAINT schema_migrations_pkey PRIMARY KEY (version); + +ALTER TABLE ONLY tanuki_bot_mvc + ADD CONSTRAINT tanuki_bot_mvc_pkey PRIMARY KEY (id); + +CREATE UNIQUE INDEX index_tanuki_bot_mvc_on_chroma_id ON tanuki_bot_mvc USING btree (chroma_id); diff --git a/ee/spec/factories/embedding/tanuki_bot.rb b/ee/spec/factories/embedding/tanuki_bot.rb new file mode 100644 index 0000000000000000000000000000000000000000..0f3fe6a3d83e2061d6bf5f959e54183183aaf360 --- /dev/null +++ b/ee/spec/factories/embedding/tanuki_bot.rb @@ -0,0 +1,18 @@ +# frozen_string_literal: true + +FactoryBot.define do + factory :tanuki_bot_mvc, class: 'Embedding::TanukiBotMvc' do + url { 'http://example.com/path/to/a/doc' } + + metadata do + { + info: 'A description', + source: 'path/to/a/doc.md', + source_type: 'doc' + } + end + + content { 'Some text' } + embedding { Array.new(1536, 0.3) } + end +end diff --git a/ee/spec/models/embedding/tanuki_bot_mvc_spec.rb b/ee/spec/models/embedding/tanuki_bot_mvc_spec.rb new file mode 100644 index 0000000000000000000000000000000000000000..4c98913f45740e7c0b15ab554ed291f410791691 --- /dev/null +++ b/ee/spec/models/embedding/tanuki_bot_mvc_spec.rb @@ -0,0 +1,29 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Embedding::TanukiBotMvc, type: :model, feature_category: :global_search do + describe 'scopes' do + describe '.neighbor_for' do + let_it_be(:question) { build(:tanuki_bot_mvc) } + + it 'calls nearest_neighbors for question' do + create_list(:tanuki_bot_mvc, 2) + + expect(described_class).to receive(:nearest_neighbors) + .with(:embedding, question.embedding, distance: 'inner_product').once + + described_class.neighbor_for(question.embedding) + end + + context 'with a far away embedding' do + let_it_be(:far_embedding) { create(:tanuki_bot_mvc, embedding: Array.new(1536, -0.999)) } + let_it_be(:close_embedding) { create(:tanuki_bot_mvc, embedding: Array.new(1536, 0.333)) } + + it 'does not return the far 
neighbor' do + expect(described_class.neighbor_for(question.embedding).limit(1)).to match_array(close_embedding) + end + end + end + end +end diff --git a/scripts/prepare_build.sh b/scripts/prepare_build.sh index ca3dd0eec5724f7fa4518b3efb047d7fb0b0b9f9..924c430d054d3af1502031e523ff5d6c13f14fd9 100644 --- a/scripts/prepare_build.sh +++ b/scripts/prepare_build.sh @@ -32,6 +32,15 @@ else sed -i '/geo:/,/^$/d' config/database.yml fi +# Set up Embedding database if the job name matches `rspec-ee` +# Since Embedding is an EE feature, we shouldn't set it up for non-EE tests. +if [[ "${CI_JOB_NAME}" =~ "rspec-ee" ]]; then + echoinfo "Embedding DB will be set up." +else + echoinfo "Embedding DB won't be set up." + sed -i '/embedding:/,/^$/d' config/database.yml +fi + # Set user to a non-superuser to ensure we test permissions sed -i 's/username: root/username: gitlab/g' config/database.yml