From 8d6ba9b2113780b8f55f41c03a05c82543c3f389 Mon Sep 17 00:00:00 2001 From: Arturo Herrero Date: Tue, 23 Sep 2025 16:57:28 +0200 Subject: [PATCH 1/4] Active Context: Remove Chunking preprocessor --- doc/development/ai_features/glossary.md | 2 +- gems/gitlab-active-context/doc/usage.md | 35 ++++---- .../active_context/preprocessors/chunking.rb | 33 -------- .../lib/active_context/reference.rb | 1 - .../preprocessors/chunking_spec.rb | 81 ------------------- 5 files changed, 18 insertions(+), 134 deletions(-) delete mode 100644 gems/gitlab-active-context/lib/active_context/preprocessors/chunking.rb delete mode 100644 gems/gitlab-active-context/spec/lib/active_context/preprocessors/chunking_spec.rb diff --git a/doc/development/ai_features/glossary.md b/doc/development/ai_features/glossary.md index 9b61c9e9040a89..0fa186c3e8113e 100644 --- a/doc/development/ai_features/glossary.md +++ b/doc/development/ai_features/glossary.md @@ -263,7 +263,7 @@ context-aware code suggestions and generation. A [Ruby gem](https://gitlab.com/gitlab-org/gitlab/-/tree/master/gems/gitlab-active-context) that provides a unified interface for Retrieval Augmented Generation (RAG) across multiple vector databases within GitLab. The system abstracts away the differences between Elasticsearch, OpenSearch, and PostgreSQL with pgvector, enabling AI features to work regardless of the underlying storage solution. -Key components include collections that define data schemas and reference classes that handle serialization, migrations for schema management, and preprocessors for chunking and embedding generation. The layer supports automatic model migration between different LLMs without downtime, asynchronous processing through Redis-backed queues, and permission-aware search with automatic redaction. +Key components include collections that define data schemas and reference classes that handle serialization, migrations for schema management, and preprocessors for embedding generation. 
The layer supports automatic model migration between different LLMs without downtime, asynchronous processing through Redis-backed queues, and permission-aware search with automatic redaction. This [architecture](https://handbook.gitlab.com/handbook/engineering/architecture/design-documents/ai_context_abstraction_layer/) prevents vendor lock-in and enables GitLab customers without Elasticsearch to access RAG-powered features through pgvector. diff --git a/gems/gitlab-active-context/doc/usage.md b/gems/gitlab-active-context/doc/usage.md index b418dbba89381d..03248280e248b5 100644 --- a/gems/gitlab-active-context/doc/usage.md +++ b/gems/gitlab-active-context/doc/usage.md @@ -85,16 +85,21 @@ Instance methods required: - `identifier`: unique identifier Optional methods: + - `unique_identifiers`: array of identifiers to build a unique identifier for every document. For example, `[identifier, branch_name]`. Defaults to `[identifier]` ### Preprocessors -Existing preprocessors are +Existing preprocessors are: 1. `Preload`: preloads from the database to prevent N+1 queries -1. `Chunking`: splits content into chunks and assigns them to `ref.documents` +1. `ContentFetcher`: fetches content from existing documents in the vector store 1. `Embeddings`: generates embeddings for every document in bulk +> **Note:** Content chunking for code embeddings is now handled by the Go +Indexer during the indexing process, so chunking preprocessors are no longer +needed in Ruby reference classes. + #### Preload Requires `model_klass` and `model_klass` to define `preload_indexing_data`. @@ -105,22 +110,17 @@ add_preprocessor :preload do |refs| end ``` -#### Chunking +#### ContentFetcher -Requires passing `chunker` instance, `chunk_on` method to define the content to chunk on and the `field` to assign the content to. +Fetches content from existing documents in the vector store using a query. 
```ruby -add_preprocessor :chunking do |refs| - chunker = Chunkers::BySize.new(chunk_size: 1000, overlap: 20) - chunk(refs: refs, chunker: chunker, chunk_on: :title_and_description, field: :content) -end +add_preprocessor :get_content do |refs| + identifiers = refs.map(&:identifier) + query = ActiveContext::Query.filter(id: identifiers).limit(identifiers.count) - -def title_and_description - "Title: #{database_record.title}\n\nDescription: #{database_record.description}" + fetch_content(refs: refs, query: query, collection: Collections::Code) end ``` - -Chunkers use the `::ActiveContext::Concerns::Chunker` concern and should define a `chunks` method. The only existing chunker is `BySize`. #### Embeddings @@ -154,13 +154,13 @@ See [how to set initial embedding model](how_to.md#set-embedding-model) and [how Creates or updates documents, handling cases where a single reference has less documents than before by performing a delete cleanup operation. -The document content can be full or partial json. +The document content can be full or partial JSON. #### `update` Updates documents that already exist. -The document content can be full or partial json. +The document content can be full or partial JSON. 
#### `delete` @@ -242,9 +242,8 @@ module Ai module Context module References class CodeEmbeddings < ::ActiveContext::Reference - add_preprocessor :chunk_full_file_by_size do |refs| - chunker = Chunkers::BySize.new - chunk(refs: refs, chunker: chunker, chunk_on: :blob_content) + add_preprocessor :embeddings do |refs| + apply_embeddings(refs: refs, content_method: :blob_content) end attr_accessor :project_id, :identifier, :repository, :blob diff --git a/gems/gitlab-active-context/lib/active_context/preprocessors/chunking.rb b/gems/gitlab-active-context/lib/active_context/preprocessors/chunking.rb deleted file mode 100644 index f146421946237a..00000000000000 --- a/gems/gitlab-active-context/lib/active_context/preprocessors/chunking.rb +++ /dev/null @@ -1,33 +0,0 @@ -# frozen_string_literal: true - -module ActiveContext - module Preprocessors - module Chunking - extend ActiveSupport::Concern - - ChunkingError = Class.new(StandardError) - - class_methods do - def chunk(refs:, chunker:, chunk_on:, field:) - return { successful: [], failed: [] } if refs.empty? - - result = with_batch_handling(refs) do - raise ChunkingError, "Chunker must respond to :chunks method" unless chunker.respond_to?(:chunks) - - refs - end - - return result if result[:failed].any? 
- - with_per_ref_handling(refs) do |ref| - chunker.content = ref.send(chunk_on) # rubocop: disable GitlabSecurity/PublicSend -- method is defined elsewhere - - chunks = chunker.chunks - - ref.documents = chunks.map { |chunk| { "#{field}": chunk } } - end - end - end - end - end -end diff --git a/gems/gitlab-active-context/lib/active_context/reference.rb b/gems/gitlab-active-context/lib/active_context/reference.rb index 5920ba6ea96cc3..ee76787e9e333e 100644 --- a/gems/gitlab-active-context/lib/active_context/reference.rb +++ b/gems/gitlab-active-context/lib/active_context/reference.rb @@ -4,7 +4,6 @@ module ActiveContext class Reference extend Concerns::ReferenceUtils extend Concerns::Preprocessor - include Preprocessors::Chunking include Preprocessors::ContentFetcher include Preprocessors::Embeddings include Preprocessors::Preload diff --git a/gems/gitlab-active-context/spec/lib/active_context/preprocessors/chunking_spec.rb b/gems/gitlab-active-context/spec/lib/active_context/preprocessors/chunking_spec.rb deleted file mode 100644 index 52a35b637b837f..00000000000000 --- a/gems/gitlab-active-context/spec/lib/active_context/preprocessors/chunking_spec.rb +++ /dev/null @@ -1,81 +0,0 @@ -# frozen_string_literal: true - -RSpec.describe ActiveContext::Preprocessors::Chunking do - let(:reference_class) do - Class.new(Test::References::Mock) do - include ::ActiveContext::Preprocessors::Chunking - - add_preprocessor :chunk do |refs| - chunk(refs: refs, chunker: chunker, chunk_on: :foo, field: :some_content_field) - end - - def foo - 'Some content' - end - end - end - - let(:content_1) { "Test content for reference 1" } - let(:content_2) { "Test content for reference 2" } - let(:reference_1) { reference_class.new(collection_id: 1, routing: 1, args: 1) } - let(:reference_2) { reference_class.new(collection_id: 1, routing: 1, args: 1) } - let(:references) { [reference_1, reference_2] } - - let(:mock_collection) { double(name: collection_name, partition_for: partition, 
include_ref_fields: true) } - let(:mock_chunker) { double } - - let(:partition) { 2 } - let(:collection_id) { 1 } - let(:object_id) { 5 } - let(:collection_name) { 'mock_collection' } - - let(:chunks_1) { ['Chunk 1.1', 'Chunk 1.2'] } - let(:chunks_2) { ['Chunk 2.1'] } - - subject(:preprocess_refs) { ActiveContext::Reference.preprocess_references(references) } - - before do - allow(reference_class).to receive(:chunker).and_return(mock_chunker) - allow(mock_chunker).to receive(:content=) - allow(mock_chunker).to receive(:instance_variable_set) - allow(mock_chunker).to receive(:chunks).and_return(chunks_1, chunks_2) - allow(ActiveContext::CollectionCache).to receive(:fetch).and_return(mock_collection) - end - - it 'returns the references with documents populated' do - expect(reference_1).to receive(:foo).once - expect(reference_2).to receive(:foo).once - - result = preprocess_refs - - expect(result).to be_a(Hash) - expect(result).to have_key(:successful) - expect(result).to have_key(:failed) - expect(result[:successful]).to eq(references) - expect(result[:failed]).to be_empty - expect(result[:successful][0].documents).to eq([{ some_content_field: 'Chunk 1.1' }, - { some_content_field: 'Chunk 1.2' }]) - expect(result[:successful][1].documents).to eq([{ some_content_field: 'Chunk 2.1' }]) - end - - context 'when the chunker raises an error' do - let(:error) { StandardError.new('Chunking error') } - - before do - allow(mock_chunker).to receive(:chunks).and_raise(error) - allow(ActiveContext::Logger).to receive(:retryable_exception) - end - - it 'logs the error and returns failed references' do - expect(ActiveContext::Logger).to receive(:retryable_exception).with( - error, class: 'Class', reference: anything, reference_id: anything - ).twice - - result = preprocess_refs - - expect(result).to be_a(Hash) - expect(result[:successful]).to be_empty - expect(result[:failed]).to eq(references) - end - end -end -- GitLab From 744592f1f8f00b240de174d387e6e34bff4870b9 Mon Sep 17 
00:00:00 2001 From: Arturo Herrero Date: Tue, 23 Sep 2025 17:12:07 +0200 Subject: [PATCH 2/4] Fix numbered list lint error --- gems/gitlab-active-context/doc/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gems/gitlab-active-context/doc/usage.md b/gems/gitlab-active-context/doc/usage.md index 03248280e248b5..fe7ec524c1e5bd 100644 --- a/gems/gitlab-active-context/doc/usage.md +++ b/gems/gitlab-active-context/doc/usage.md @@ -46,7 +46,7 @@ To create a new queue: end ``` -2. Make sure the queue is registered by adding it to the `queue_classes` configuration. +1. Make sure the queue is registered by adding it to the `queue_classes` configuration. ```ruby ActiveContext.configure do |config| -- GitLab From fa4312fb713f7376927f6133f94b0723ff627eae Mon Sep 17 00:00:00 2001 From: Arturo Herrero Date: Tue, 23 Sep 2025 17:44:37 +0200 Subject: [PATCH 3/4] Exclude gems documentation from Vale GitLab style rules --- .vale.ini | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.vale.ini b/.vale.ini index 0eb17152be78b2..5bbce807fcd044 100644 --- a/.vale.ini +++ b/.vale.ini @@ -12,3 +12,7 @@ BasedOnStyles = gitlab_base, gitlab_docs # Ignore SVG markup TokenIgnores = (\*\*\{\w*\}\*\*) + +# Exclude gem documentation from GitLab docs rules +[gems/**/*.md] +BasedOnStyles = -- GitLab From 9dee268ac590e9c53c15081d400fc30fdae723fd Mon Sep 17 00:00:00 2001 From: Arturo Herrero Date: Wed, 24 Sep 2025 08:10:41 +0200 Subject: [PATCH 4/4] Improve preprocessors note --- gems/gitlab-active-context/doc/usage.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/gems/gitlab-active-context/doc/usage.md b/gems/gitlab-active-context/doc/usage.md index fe7ec524c1e5bd..2b6eb5b406c218 100644 --- a/gems/gitlab-active-context/doc/usage.md +++ b/gems/gitlab-active-context/doc/usage.md @@ -96,9 +96,7 @@ Existing preprocessors are: 1. `ContentFetcher`: fetches content from existing documents in the vector store 1. 
`Embeddings`: generates embeddings for every document in bulk -> **Note:** Content chunking for code embeddings is now handled by the Go -Indexer during the indexing process, so chunking preprocessors are no longer -needed in Ruby reference classes. +These preprocessors rely on documents whose content is already stored in the vector store. If you need ActiveContext to handle the initial storage of documents in the vector store, you need to add a new preprocessor for that. #### Preload -- GitLab