From b4a2aa827554621e231c96a0086298402c2a907c Mon Sep 17 00:00:00 2001 From: Igor Drozdov Date: Tue, 15 Oct 2024 14:20:23 +0200 Subject: [PATCH] feat(evals): evaluate code suggestions --- .gitlab-ci.yml | 103 --------------------- .gitlab/ci/eval.gitlab-ci.yml | 21 +++-- eli5/cli/codesuggestions/evaluate.py | 7 ++ eli5/codesuggestions/clients/ai_gateway.py | 7 +- eli5/codesuggestions/evaluate.py | 4 +- 5 files changed, 27 insertions(+), 115 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index dc4895ad..b6af9d3b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -7,112 +7,9 @@ stages: - renovate_bot - datasets -include: - - template: Workflows/MergeRequest-Pipelines.gitlab-ci.yml - - # Upgrades dependencies on a schedule - # see https://gitlab.com/gitlab-com/gl-infra/common-ci-tasks/-/blob/main/renovate-bot.md - - project: "gitlab-com/gl-infra/common-ci-tasks" - ref: v2.41.1 # renovate:managed - file: renovate-bot.yml - -variables: - PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip" - POETRY_CACHE_DIR: "$CI_PROJECT_DIR/.cache/poetry" - GIT_LFS_SKIP_SMUDGE: 1 # Prevent Git LFS from automatically downloading large files - -cache: - key: - files: - - poetry.lock - - .gitlab-ci.yml - paths: - - $PIP_CACHE_DIR - - $POETRY_CACHE_DIR - - requirements.txt - -.poetry: - before_script: - - pip install poetry==1.8.3 - - poetry config virtualenvs.in-project true - - poetry config cache-dir ${POETRY_CACHE_DIR} - - poetry export -f requirements.txt --output requirements.txt --without-hashes - - poetry config --list - -############## -# Conditions # -############## -.if-merge-request: &if-merge-request - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' - -.if-default-branch: &if-default-branch - if: "$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH" - -.if-renovate-schedule: &if-renovate-schedule - if: '$CI_PIPELINE_SOURCE == "schedule" && $RENOVATE_SCHEDULED' - -.code-changes: &code-changes - - "**/*.{py}" - - ".gitlab-ci.yml" - - ".gitlab/ci/**/*" - - "Makefile" - - "poetry.lock" 
- -######### -# Rules # -######### -.rules:code-changes: - rules: - - <<: *if-renovate-schedule - when: never - - <<: *if-merge-request - changes: *code-changes - - <<: *if-default-branch - changes: *code-changes - -.rules:run-evaluations: - rules: - - <<: *if-default-branch - run_evaluations: - extends: - - .rules:run-evaluations stage: eval needs: [] trigger: include: - local: .gitlab/ci/eval.gitlab-ci.yml - -lint: - extends: - - .poetry - - .rules:code-changes - stage: lint - script: - - make lint - - poetry lock --no-update - - git diff --exit-code - after_script: - - | - # Hint for fixing issues - MAGENTA=$(printf '\e[35m') - BOLD=$(printf '\e[1m') - RESET=$(printf '\e[0m') - echo "${MAGENTA}Run ${BOLD}make format${RESET}${MAGENTA} to fix formatting issues.${RESET}" - -langsmith:pull: - extends: - - .poetry - stage: datasets - variables: - GITLAB_BASE_URL: https://gitlab.com - GITLAB_PRIVATE_TOKEN: $GITLAB_PRIVATE_TOKEN - GITLAB_TOKEN_NAME: langsmith-dataset-sync - needs: [] - when: manual - script: - - apt update && apt install git-lfs - - git lfs pull --include "datasets/synced/*" - - | - git remote set-url --push origin https://$GITLAB_TOKEN_NAME:$GITLAB_PRIVATE_TOKEN@gitlab.com/gitlab-org/ai-powered/eli5.git - - poetry install && poetry run eli5 datasets sync datasets/synced --create-mr diff --git a/.gitlab/ci/eval.gitlab-ci.yml b/.gitlab/ci/eval.gitlab-ci.yml index 9171440a..51cb516e 100644 --- a/.gitlab/ci/eval.gitlab-ci.yml +++ b/.gitlab/ci/eval.gitlab-ci.yml @@ -2,6 +2,7 @@ variables: GOOGLE_APPLICATION_CREDENTIALS: "/root/gcloud-service-key.json" GOOGLE_CREDENTIALS: $GOOGLE_CREDENTIALS GOOGLE_PROJECT: $GOOGLE_PROJECT + CI_DEBUG_SERVICES: "true" .setup-gcloud: &setup-gcloud - printf "%s" "$GOOGLE_CREDENTIALS" > "$GOOGLE_APPLICATION_CREDENTIALS" @@ -19,9 +20,7 @@ variables: services: - name: registry.gitlab.com/gitlab-org/modelops/applied-ml/code-suggestions/ai-assist/model-gateway:${AI_GATEWAY_TAG} alias: ai-gateway - - name: 
registry.gitlab.com/gitlab-org/ai-powered/custom-models/evaluations/gcloud-ollama - alias: gcloud-ollama - image: google/cloud-sdk:alpine + image: google/cloud-sdk:485.0.0-alpine needs: [] when: manual before_script: @@ -35,16 +34,20 @@ variables: code-suggestions: extends: - .setup + parallel: + matrix: + - MODEL_NAME: [mistral, codegemma, codellama, codestral] script: - echo "Running code ${CODE_SUGGESTIONS_INTENT}s evaluations" - poetry run eli5 code-suggestions evaluate - --dataset=$DATASET - --source=$SOURCE - --limit=$LIMIT - --experiment-prefix="code-suggestions-${CODE_SUGGESTIONS_INTENT}-${MODEL_NAME}" + --dataset=code_generation_mbpp_all_sanitized + --source=ai_gateway + --experiment-prefix="code-suggestions-${MODEL_NAME}" --model-name=$MODEL_NAME - --model-endpoint=$MODEL_ENDPOINT - --intent=$CODE_SUGGESTIONS_INTENT + --model-provider="ollama_chat/${MODEL_NAME}" + --model-endpoint=http://34.42.191.31 + --model-api-key=$MODEL_API_KEY + --intent=generation prompts: extends: diff --git a/eli5/cli/codesuggestions/evaluate.py b/eli5/cli/codesuggestions/evaluate.py index 9d8c78d2..e542dd06 100644 --- a/eli5/cli/codesuggestions/evaluate.py +++ b/eli5/cli/codesuggestions/evaluate.py @@ -95,6 +95,12 @@ def evaluate( help="Send model requests to a specific endpoint; this only applies if source=ai_gateway" ), ] = None, + model_api_key: Annotated[ Optional[str], typer.Option( help="Use a specific API key for model requests; this only applies if source=ai_gateway" ), ] = None, ): client: Client = ctx.obj.langsmith_client @@ -114,5 +120,6 @@ def evaluate( model_name=model_name, model_provider=model_provider, model_endpoint=model_endpoint, + model_api_key=model_api_key, ) print(results) diff --git a/eli5/codesuggestions/clients/ai_gateway.py b/eli5/codesuggestions/clients/ai_gateway.py index 9d06599d..e17b466d 100644 --- a/eli5/codesuggestions/clients/ai_gateway.py +++ b/eli5/codesuggestions/clients/ai_gateway.py @@ -22,11 +22,14 @@ def
ai_gateway_code_suggestion(file_name, content_above_cursor, content_below_cu "content_above_cursor": content_above_cursor, "content_below_cursor": content_below_cursor, }, + "prompt": "", "prompt_version": 2, - "prompt": None, - "model_provider": parameters.get("model_provider", "litellm"), + "prompt_id": "code_suggestions/generations", + "model_identifier": parameters.get("model_identifier", "litellm"), + "model_provider": "litellm", "model_endpoint": parameters.get("model_endpoint"), "model_name": parameters.get("model_name"), + "model_api_key": parameters.get("model_api_key"), } if intent == "generations": diff --git a/eli5/codesuggestions/evaluate.py b/eli5/codesuggestions/evaluate.py index f34b7420..fd5abf5a 100644 --- a/eli5/codesuggestions/evaluate.py +++ b/eli5/codesuggestions/evaluate.py @@ -76,6 +76,7 @@ def evaluate( model_name: Optional[str] = None, model_provider: Optional[str] = None, model_endpoint: Optional[str] = None, + model_api_key: Optional[str] = None, ): get_code_suggestion = CODE_SUGGESTION_PROVIDERS.get(code_suggestions_source) @@ -92,8 +93,9 @@ def evaluate( { "intent": intent, "model_name": model_name, - "model_provider": model_provider, + "model_identifier": model_provider, "model_endpoint": model_endpoint, + "model_api_key": model_api_key, } ) elif code_suggestions_source == "fireworks": -- GitLab