From dbcec49a9e58d36498229114106f3f778d91d3dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kamil=20Trzci=C5=84ski?=
Date: Tue, 28 Jan 2020 22:07:24 +0100
Subject: [PATCH 1/2] Implement `ndjson` support for `import/export`

This implements `ndjson` and streaming `json` support to handle two cases:

- a big `project.json` (the legacy format)
- the new `.ndjson` format, where each relation receives a separate file
  and each item is stored on its own line

The reader properly detects old and new file contents without any
changes to the files themselves, maintaining backward compatibility.

This also implements a trick that allows the streaming JSON writer to
append data to an already written document additively.

Overall, this lets exporting (legacy or ndjson) and importing (ndjson)
run in constant memory, and it significantly reduces the latency of
data processing, since items are streamed one at a time instead of
being materialised as one huge in-memory document.
---
 db/schema.rb                                  | 110 +++++++++---------
 .../import_export/json/dedup_legacy_reader.rb |  79 +++++++++++
 .../import_export/json/legacy_reader.rb       |  48 ++++++++
 .../import_export/json/legacy_writer.rb       |  71 +++++++++++
 .../import_export/json/ndjson_reader.rb       |  47 ++++++++
 .../import_export/json/ndjson_writer.rb       |  48 ++++++++
 .../json/streaming_serializer.rb              | 106 +++++++++++++++++
 .../import_export/project_tree_loader.rb      |  72 ------------
 .../import_export/project_tree_restorer.rb    |  40 +++----
 .../import_export/project_tree_saver.rb       |  38 +++---
 .../import_export/relation_tree_restorer.rb   |  23 +---
 11 files changed, 487 insertions(+), 195 deletions(-)
 create mode 100644 lib/gitlab/import_export/json/dedup_legacy_reader.rb
 create mode 100644 lib/gitlab/import_export/json/legacy_reader.rb
 create mode 100644 lib/gitlab/import_export/json/legacy_writer.rb
 create mode 100644 lib/gitlab/import_export/json/ndjson_reader.rb
 create mode 100644 lib/gitlab/import_export/json/ndjson_writer.rb
 create mode 100644 lib/gitlab/import_export/json/streaming_serializer.rb
 delete mode 100644 lib/gitlab/import_export/project_tree_loader.rb

diff --git a/db/schema.rb b/db/schema.rb
index 80e7af66fb9934..6a48dd69467e89 100644
--- a/db/schema.rb
+++ b/db/schema.rb
@@ -103,15 +103,6 @@
     t.index ["project_id", "committed_date", "analytics_repository_file_id"], name: "index_file_commits_on_committed_date_file_id_and_project_id", unique: true
   end
 
-  create_table "analytics_repository_file_edits", force: :cascade do |t|
-    t.bigint "project_id", null: false
-    t.bigint "analytics_repository_file_id", null: false
-    t.date "committed_date", null: false
-    t.integer "num_edits", default: 0, null: false
-    t.index ["analytics_repository_file_id", "committed_date", "project_id"], name: "index_file_edits_on_committed_date_file_id_and_project_id", unique: true
-    t.index ["project_id"], name: "index_analytics_repository_file_edits_on_project_id"
-  end
-
   create_table "analytics_repository_files", force: :cascade do |t|
     t.bigint "project_id", null: false
     t.string "file_path", limit: 4096, null: false
@@ -281,7 +272,6 @@
     t.boolean "pseudonymizer_enabled", default: false, null: false
     t.boolean "hide_third_party_offers", default: false, null: false
     t.boolean "snowplow_enabled", default: false, null: false
-    t.string "snowplow_collector_hostname"
     t.string "snowplow_cookie_domain"
     t.boolean "instance_statistics_visibility_private", default: false, null: false
     t.boolean "web_ide_clientside_preview_enabled", default: false, null: false
@@ -298,34 +288,34 @@
     t.integer "first_day_of_week", default: 0, null: false
     t.boolean "elasticsearch_limit_indexing", default: false, null: false
     t.integer 
"default_project_creation", default: 2, null: false + t.string "geo_node_allowed_ips", default: "0.0.0.0/0, ::/0" t.string "lets_encrypt_notification_email" t.boolean "lets_encrypt_terms_of_service_accepted", default: false, null: false - t.string "geo_node_allowed_ips", default: "0.0.0.0/0, ::/0" t.integer "elasticsearch_shards", default: 5, null: false t.integer "elasticsearch_replicas", default: 1, null: false t.text "encrypted_lets_encrypt_private_key" t.text "encrypted_lets_encrypt_private_key_iv" - t.string "required_instance_ci_template" t.boolean "dns_rebinding_protection_enabled", default: true, null: false - t.boolean "default_project_deletion_protection", default: false, null: false - t.boolean "grafana_enabled", default: false, null: false t.boolean "lock_memberships_to_ldap", default: false, null: false + t.boolean "default_project_deletion_protection", default: false, null: false + t.string "required_instance_ci_template" t.boolean "time_tracking_limit_to_hours", default: false, null: false + t.boolean "grafana_enabled", default: false, null: false t.string "grafana_url", default: "/-/grafana", null: false - t.boolean "login_recaptcha_protection_enabled", default: false, null: false t.string "outbound_local_requests_whitelist", limit: 255, default: [], null: false, array: true t.integer "raw_blob_request_limit", default: 300, null: false + t.bigint "instance_administration_project_id" t.boolean "allow_local_requests_from_web_hooks_and_services", default: false, null: false t.boolean "allow_local_requests_from_system_hooks", default: true, null: false - t.bigint "instance_administration_project_id" + t.string "snowplow_collector_hostname" t.boolean "asset_proxy_enabled", default: false, null: false t.string "asset_proxy_url" t.text "asset_proxy_whitelist" t.text "encrypted_asset_proxy_secret_key" t.string "encrypted_asset_proxy_secret_key_iv" + t.boolean "login_recaptcha_protection_enabled", default: false, null: false t.string "static_objects_external_storage_url", limit: 255 t.string "static_objects_external_storage_auth_token", limit: 255 - t.integer "max_personal_access_token_lifetime" t.boolean "throttle_protected_paths_enabled", default: false, null: false t.integer "throttle_protected_paths_requests_per_period", default: 10, null: false t.integer "throttle_protected_paths_period_in_seconds", default: 60, null: false @@ -333,25 +323,23 @@ t.boolean "throttle_incident_management_notification_enabled", default: false, null: false t.integer "throttle_incident_management_notification_period_in_seconds", default: 3600 t.integer "throttle_incident_management_notification_per_period", default: 3600 - t.string "snowplow_iglu_registry_url", limit: 255 t.integer "push_event_hooks_limit", default: 3, null: false t.integer "push_event_activities_limit", default: 3, null: false t.string "custom_http_clone_url_root", limit: 511 t.integer "deletion_adjourned_period", default: 7, null: false - t.date "license_trial_ends_on" + t.string "snowplow_iglu_registry_url", limit: 255 + t.datetime_with_timezone "productivity_analytics_start_date" + t.string "snowplow_app_id" t.boolean "eks_integration_enabled", default: false, null: false t.string "eks_account_id", limit: 128 t.string "eks_access_key_id", limit: 128 t.string "encrypted_eks_secret_access_key_iv", limit: 255 t.text "encrypted_eks_secret_access_key" - t.string "snowplow_app_id" - t.datetime_with_timezone "productivity_analytics_start_date" + t.date "license_trial_ends_on" t.string "default_ci_config_path", limit: 255 t.boolean 
"sourcegraph_enabled", default: false, null: false t.string "sourcegraph_url", limit: 255 t.boolean "sourcegraph_public_only", default: true, null: false - t.bigint "snippet_size_limit", default: 52428800, null: false - t.integer "minimum_password_length", default: 8, null: false t.text "encrypted_akismet_api_key" t.string "encrypted_akismet_api_key_iv", limit: 255 t.text "encrypted_elasticsearch_aws_secret_access_key" @@ -364,9 +352,12 @@ t.string "encrypted_slack_app_secret_iv", limit: 255 t.text "encrypted_slack_app_verification_token" t.string "encrypted_slack_app_verification_token_iv", limit: 255 - t.boolean "force_pages_access_control", default: false, null: false + t.bigint "snippet_size_limit", default: 52428800, null: false + t.integer "max_personal_access_token_lifetime" + t.integer "minimum_password_length", default: 8, null: false t.boolean "updating_name_disabled_for_users", default: false, null: false t.integer "instance_administrators_group_id" + t.boolean "force_pages_access_control", default: false, null: false t.index ["custom_project_templates_group_id"], name: "index_application_settings_on_custom_project_templates_group_id" t.index ["file_template_project_id"], name: "index_application_settings_on_file_template_project_id" t.index ["instance_administration_project_id"], name: "index_applicationsettings_on_instance_administration_project_id" @@ -392,9 +383,9 @@ t.integer "report_type", limit: 2 t.index ["merge_request_id", "code_owner", "name"], name: "approval_rule_name_index_for_code_owners", unique: true, where: "(code_owner = true)" t.index ["merge_request_id", "code_owner"], name: "index_approval_merge_request_rules_1" - t.index ["merge_request_id", "name"], name: "index_approval_rule_name_for_code_owners_rule_type", unique: true, where: "(rule_type = 2)" + t.index ["merge_request_id", "rule_type", "name"], name: "index_approval_rule_name_for_code_owners_rule_type", unique: true, where: "(rule_type = 2)" t.index ["merge_request_id", "rule_type"], name: "any_approver_merge_request_rule_type_unique_index", unique: true, where: "(rule_type = 4)" - t.index ["merge_request_id"], name: "index_approval_rules_code_owners_rule_type", where: "(rule_type = 2)" + t.index ["merge_request_id", "rule_type"], name: "index_approval_rules_code_owners_rule_type", where: "(rule_type = 2)" end create_table "approval_merge_request_rules_approved_approvers", force: :cascade do |t| @@ -518,9 +509,9 @@ t.integer "project_id" t.integer "group_id" t.string "type", null: false - t.string "name", limit: 255 t.datetime_with_timezone "created_at", null: false t.datetime_with_timezone "updated_at", null: false + t.string "name", limit: 255 t.index ["group_id"], name: "index_badges_on_group_id" t.index ["project_id"], name: "index_badges_on_project_id" end @@ -614,9 +605,9 @@ t.index ["namespace_id"], name: "index_chat_teams_on_namespace_id", unique: true end - create_table "ci_build_needs", id: :serial, force: :cascade do |t| + create_table "ci_build_needs", force: :cascade do |t| t.integer "build_id", null: false - t.text "name", null: false + t.string "name", null: false t.boolean "artifacts", default: true, null: false t.index ["build_id", "name"], name: "index_ci_build_needs_on_build_id_and_name", unique: true end @@ -695,9 +686,10 @@ t.datetime_with_timezone "scheduled_at" t.string "token_encrypted" t.integer "upstream_pipeline_id" + t.integer "scheduler_priority", limit: 2 + t.boolean "processed" t.bigint "resource_group_id" t.datetime_with_timezone "waiting_for_resource_at" - t.boolean 
"processed" t.index ["artifacts_expire_at"], name: "index_ci_builds_on_artifacts_expire_at", where: "(artifacts_file <> ''::text)" t.index ["auto_canceled_by_id"], name: "index_ci_builds_on_auto_canceled_by_id" t.index ["commit_id", "artifacts_expire_at", "id"], name: "index_ci_builds_on_commit_id_and_artifacts_expireatandidpartial", where: "(((type)::text = 'Ci::Build'::text) AND ((retried = false) OR (retried IS NULL)) AND ((name)::text = ANY (ARRAY[('sast'::character varying)::text, ('dependency_scanning'::character varying)::text, ('sast:container'::character varying)::text, ('container_scanning'::character varying)::text, ('dast'::character varying)::text])))" @@ -731,9 +723,9 @@ t.integer "project_id", null: false t.integer "timeout" t.integer "timeout_source", default: 1, null: false - t.boolean "interruptible" t.jsonb "config_options" t.jsonb "config_variables" + t.boolean "interruptible" t.boolean "has_exposed_artifacts" t.string "environment_auto_stop_in", limit: 255 t.index ["build_id"], name: "index_ci_builds_metadata_on_build_id", unique: true @@ -1133,7 +1125,7 @@ t.index ["cluster_id"], name: "index_clusters_applications_cert_managers_on_cluster_id", unique: true end - create_table "clusters_applications_crossplane", id: :serial, force: :cascade do |t| + create_table "clusters_applications_crossplane", force: :cascade do |t| t.datetime_with_timezone "created_at", null: false t.datetime_with_timezone "updated_at", null: false t.bigint "cluster_id", null: false @@ -2417,7 +2409,7 @@ t.index ["user_id"], name: "index_members_on_user_id" end - create_table "merge_request_assignees", force: :cascade do |t| + create_table "merge_request_assignees", id: :serial, force: :cascade do |t| t.integer "user_id", null: false t.integer "merge_request_id", null: false t.index ["merge_request_id", "user_id"], name: "index_merge_request_assignees_on_merge_request_id_and_user_id", unique: true @@ -2748,6 +2740,8 @@ t.integer "max_pages_size" t.integer "max_artifacts_size" t.boolean "mentions_disabled" + t.index "lower((name)::text)", name: "index_on_namespaces_lower_name" + t.index "lower((path)::text)", name: "index_on_namespaces_lower_path" t.index ["created_at"], name: "index_namespaces_on_created_at" t.index ["custom_project_templates_group_id", "type"], name: "index_namespaces_on_custom_project_templates_group_id_and_type", where: "(custom_project_templates_group_id IS NOT NULL)" t.index ["file_template_project_id"], name: "index_namespaces_on_file_template_project_id" @@ -2936,10 +2930,8 @@ t.bigint "package_file_id", null: false t.datetime_with_timezone "created_at", null: false t.datetime_with_timezone "updated_at", null: false - t.string "recipe_revision", limit: 255, default: "0", null: false - t.string "package_revision", limit: 255 - t.string "conan_package_reference", limit: 255 - t.integer "conan_file_type", limit: 2, null: false + t.string "path", limit: 255, null: false + t.string "revision", limit: 255, default: "0", null: false t.index ["package_file_id"], name: "index_packages_conan_file_metadata_on_package_file_id", unique: true end @@ -3231,7 +3223,7 @@ t.index ["project_id"], name: "index_project_import_data_on_project_id" end - create_table "project_incident_management_settings", primary_key: "project_id", id: :serial, force: :cascade do |t| + create_table "project_incident_management_settings", primary_key: "project_id", id: :integer, default: nil, force: :cascade do |t| t.boolean "create_issue", default: true, null: false t.boolean "send_email", default: false, null: 
false t.text "issue_template_key" @@ -3402,6 +3394,7 @@ t.boolean "autoclose_referenced_issues" t.string "suggestion_commit_message", limit: 255 t.index "lower((name)::text)", name: "index_projects_on_lower_name" + t.index "lower((path)::text)", name: "index_on_projects_lower_path" t.index ["created_at", "id"], name: "index_projects_api_created_at_id_desc", order: { id: :desc } t.index ["created_at", "id"], name: "index_projects_api_vis20_created_at", where: "(visibility_level = 20)" t.index ["created_at", "id"], name: "index_projects_api_vis20_created_at_id_desc", order: { id: :desc }, where: "(visibility_level = 20)" @@ -3618,7 +3611,9 @@ t.datetime "created_at", null: false t.datetime "updated_at", null: false t.index "lower((path)::text) varchar_pattern_ops", name: "index_redirect_routes_on_path_unique_text_pattern_ops", unique: true + t.index "lower((path)::text)", name: "index_on_redirect_routes_lower_path" t.index ["path"], name: "index_redirect_routes_on_path", unique: true + t.index ["path"], name: "index_redirect_routes_on_path_text_pattern_ops", opclass: :varchar_pattern_ops t.index ["source_type", "source_id"], name: "index_redirect_routes_on_source_type_and_source_id" end @@ -3720,6 +3715,7 @@ t.datetime "created_at" t.datetime "updated_at" t.string "name" + t.index "lower((path)::text)", name: "index_on_routes_lower_path" t.index ["path"], name: "index_routes_on_path", unique: true t.index ["path"], name: "index_routes_on_path_text_pattern_ops", opclass: :varchar_pattern_ops t.index ["source_type", "source_id"], name: "index_routes_on_source_type_and_source_id", unique: true @@ -4114,8 +4110,8 @@ t.boolean "time_format_in_24h" t.string "projects_sort", limit: 64 t.boolean "show_whitespace_in_diffs", default: true, null: false - t.boolean "sourcegraph_enabled" t.boolean "setup_for_company" + t.boolean "sourcegraph_enabled" t.boolean "render_whitespace_in_code" t.index ["user_id"], name: "index_user_preferences_on_user_id", unique: true end @@ -4221,7 +4217,9 @@ t.string "last_name", limit: 255 t.string "static_object_token", limit: 255 t.integer "role", limit: 2 + t.index "lower((email)::text)", name: "index_on_users_lower_email" t.index "lower((name)::text)", name: "index_on_users_name_lower" + t.index "lower((username)::text)", name: "index_on_users_lower_username" t.index ["accepted_term_id"], name: "index_users_on_accepted_term_id" t.index ["admin"], name: "index_users_on_admin" t.index ["bot_type"], name: "index_users_on_bot_type" @@ -4279,28 +4277,28 @@ t.bigint "author_id", null: false t.bigint "updated_by_id" t.bigint "last_edited_by_id" - t.date "start_date" - t.date "due_date" - t.datetime_with_timezone "last_edited_at" - t.datetime_with_timezone "created_at", null: false - t.datetime_with_timezone "updated_at", null: false - t.string "title", limit: 255, null: false - t.text "title_html" - t.text "description" - t.text "description_html" t.bigint "start_date_sourcing_milestone_id" t.bigint "due_date_sourcing_milestone_id" t.bigint "closed_by_id" + t.datetime_with_timezone "last_edited_at" + t.datetime_with_timezone "created_at", null: false + t.datetime_with_timezone "updated_at", null: false t.datetime_with_timezone "closed_at" + t.date "start_date" + t.date "due_date" t.integer "state", limit: 2, default: 1, null: false t.integer "severity", limit: 2, null: false - t.boolean "severity_overridden", default: false t.integer "confidence", limit: 2, null: false + t.boolean "severity_overridden", default: false t.boolean "confidence_overridden", default: false - 
t.bigint "resolved_by_id" - t.datetime_with_timezone "resolved_at" + t.string "title", limit: 255, null: false + t.text "title_html" + t.text "description" + t.text "description_html" t.integer "report_type", limit: 2, null: false t.integer "cached_markdown_version" + t.bigint "resolved_by_id" + t.datetime_with_timezone "resolved_at" t.index ["author_id"], name: "index_vulnerabilities_on_author_id" t.index ["closed_by_id"], name: "index_vulnerabilities_on_closed_by_id" t.index ["due_date_sourcing_milestone_id"], name: "index_vulnerabilities_on_due_date_sourcing_milestone_id" @@ -4478,8 +4476,6 @@ add_foreign_key "analytics_language_trend_repository_languages", "projects", on_delete: :cascade add_foreign_key "analytics_repository_file_commits", "analytics_repository_files", on_delete: :cascade add_foreign_key "analytics_repository_file_commits", "projects", on_delete: :cascade - add_foreign_key "analytics_repository_file_edits", "analytics_repository_files", on_delete: :cascade - add_foreign_key "analytics_repository_file_edits", "projects", on_delete: :cascade add_foreign_key "analytics_repository_files", "projects", on_delete: :cascade add_foreign_key "application_settings", "namespaces", column: "custom_project_templates_group_id", on_delete: :nullify add_foreign_key "application_settings", "namespaces", column: "instance_administrators_group_id", name: "fk_e8a145f3a7", on_delete: :nullify @@ -4610,8 +4606,8 @@ add_foreign_key "description_versions", "merge_requests", on_delete: :cascade add_foreign_key "design_management_designs", "issues", on_delete: :cascade add_foreign_key "design_management_designs", "projects", on_delete: :cascade - add_foreign_key "design_management_designs_versions", "design_management_designs", column: "design_id", name: "fk_03c671965c", on_delete: :cascade - add_foreign_key "design_management_designs_versions", "design_management_versions", column: "version_id", name: "fk_f4d25ba00c", on_delete: :cascade + add_foreign_key "design_management_designs_versions", "design_management_designs", column: "design_id", on_delete: :cascade + add_foreign_key "design_management_designs_versions", "design_management_versions", column: "version_id", on_delete: :cascade add_foreign_key "design_management_versions", "issues", on_delete: :cascade add_foreign_key "design_management_versions", "users", column: "author_id", name: "fk_c1440b4896", on_delete: :nullify add_foreign_key "design_user_mentions", "design_management_designs", column: "design_id", on_delete: :cascade @@ -4635,7 +4631,7 @@ add_foreign_key "epics", "users", column: "assignee_id", name: "fk_dccd3f98fc", on_delete: :nullify add_foreign_key "epics", "users", column: "author_id", name: "fk_3654b61b03", on_delete: :cascade add_foreign_key "epics", "users", column: "closed_by_id", name: "fk_aa5798e761", on_delete: :nullify - add_foreign_key "events", "namespaces", column: "group_id", name: "fk_61fbf6ca48", on_delete: :cascade + add_foreign_key "events", "namespaces", column: "group_id", on_delete: :cascade add_foreign_key "events", "projects", on_delete: :cascade add_foreign_key "events", "users", column: "author_id", name: "fk_edfd187b6f", on_delete: :cascade add_foreign_key "evidences", "releases", on_delete: :cascade @@ -4823,7 +4819,7 @@ add_foreign_key "project_statistics", "projects", on_delete: :cascade add_foreign_key "project_tracing_settings", "projects", on_delete: :cascade add_foreign_key "projects", "pool_repositories", name: "fk_6e5c14658a", on_delete: :nullify - add_foreign_key "projects", "users", 
column: "marked_for_deletion_by_user_id", name: "fk_25d8780d11", on_delete: :nullify + add_foreign_key "projects", "users", column: "marked_for_deletion_by_user_id", name: "fk_0a31cca0b8", on_delete: :nullify add_foreign_key "prometheus_alert_events", "projects", on_delete: :cascade add_foreign_key "prometheus_alert_events", "prometheus_alerts", on_delete: :cascade add_foreign_key "prometheus_alerts", "environments", on_delete: :cascade diff --git a/lib/gitlab/import_export/json/dedup_legacy_reader.rb b/lib/gitlab/import_export/json/dedup_legacy_reader.rb new file mode 100644 index 00000000000000..989e045bb2e1aa --- /dev/null +++ b/lib/gitlab/import_export/json/dedup_legacy_reader.rb @@ -0,0 +1,79 @@ +module Gitlab + module ImportExport + module JSON + class DedupLegacyReader < LegacyReader + LARGE_PROJECT_FILE_SIZE_BYTES = 500.megabyte + + def initialize(path, scope) + @path = path + @scope = scope + end + + def valid? + File.exist?(@path) && + File.size(@path) > LARGE_PROJECT_FILE_SIZE_BYTES && + Feature.enabled?(:dedup_project_import_metadata, @scope) + end + + protected + + def read_hash + dedup_tree(super) + end + + # This function removes duplicate entries from the given tree recursively + # by caching nodes it encounters repeatedly. We only consider nodes for + # which there can actually be multiple equivalent instances (e.g. strings, + # hashes and arrays, but not `nil`s, numbers or booleans.) + # + # The algorithm uses a recursive depth-first descent with 3 cases, starting + # with a root node (the tree/hash itself): + # - a node has already been cached; in this case we return it from the cache + # - a node has not been cached yet but should be; descend into its children + # - a node is neither cached nor qualifies for caching; this is a no-op + def dedup_tree(node, nodes_seen = {}) + if nodes_seen.key?(node) && distinguishable?(node) + yield nodes_seen[node] + elsif should_dedup?(node) + nodes_seen[node] = node + + case node + when Array + node.each_index do |idx| + dedup_tree(node[idx], nodes_seen) do |cached_node| + node[idx] = cached_node + end + end + when Hash + node.each do |k, v| + dedup_tree(v, nodes_seen) do |cached_node| + node[k] = cached_node + end + end + end + else + node + end + end + + # We do not need to consider nodes for which there cannot be multiple instances + def should_dedup?(node) + node && !(node.is_a?(Numeric) || node.is_a?(TrueClass) || node.is_a?(FalseClass)) + end + + # We can only safely de-dup values that are distinguishable. True value objects + # are always distinguishable by nature. Hashes however can represent entities, + # which are identified by ID, not value. We therefore disallow de-duping hashes + # that do not have an `id` field, since we might risk dropping entities that + # have equal attributes yet different identities. + def distinguishable?(node) + if node.is_a?(Hash) + node.key?('id') + else + true + end + end + end + end + end +end diff --git a/lib/gitlab/import_export/json/legacy_reader.rb b/lib/gitlab/import_export/json/legacy_reader.rb new file mode 100644 index 00000000000000..78d6d4f5977ae2 --- /dev/null +++ b/lib/gitlab/import_export/json/legacy_reader.rb @@ -0,0 +1,48 @@ +module Gitlab + module ImportExport + module JSON + class LegacyReader + include Gitlab::ImportExport::CommandLineUtil + + attr_reader :path + + def initialize(path) + @path = path + end + + def valid? 
+ File.exist?(@path) + end + + def root_attributes(excluded_attributes = []) + tree_hash.reject do |key, _| + excluded_attributes.include?(key) + end + end + + def each_relation(key) + value = tree_hash[key] + return if value.nil? + + if value.is_a?(Array) + value.each.with_index do |item, idx| + yield(item, idx) + end + else + yield(value, 0) + end + end + + protected + + def tree_hash + @tree_hash ||= read_hash + end + + def read_hash + ActiveSupport::JSON.decode(IO.read(@path)) + end + end + end + end +end diff --git a/lib/gitlab/import_export/json/legacy_writer.rb b/lib/gitlab/import_export/json/legacy_writer.rb new file mode 100644 index 00000000000000..c4d3c98ce9485f --- /dev/null +++ b/lib/gitlab/import_export/json/legacy_writer.rb @@ -0,0 +1,71 @@ +module Gitlab + module ImportExport + module JSON + class LegacyWriter + include Gitlab::ImportExport::CommandLineUtil + + attr_reader :path + + def initialize(path) + @path = path + @last_array = nil + @keys = Set.new + + mkdir_p(File.dirname(@path)) + file.write('{}') + end + + def close + @file&.close + @file = nil + end + + def set(hash) + hash.each do |key, value| + write(key, value) + end + end + + def write(key, value) + raise RuntimeError, "key '#{key}' already written" if @keys.include?(key) + + # rewind by one byte, to overwrite '}' + file.pos = file.size-1 + + file.write(',') if @keys.any? + file.write(key.to_json) + file.write(':') + file.write(value.to_json) + file.write('}') + + @keys.add(key) + @last_array = nil + @last_array_count = nil + end + + def append(key, value) + unless @last_array == key + write(key, []) + + @last_array = key + @last_array_count = 0 + end + + # rewind by two bytes, to overwrite ']}' + file.pos = file.size-2 + + file.write(',') if @last_array_count > 0 + file.write(value.to_json) + file.write(']}') + @last_array_count += 1 + end + + private + + def file + @file ||= File.open(@path, "wb") + end + end + end + end +end diff --git a/lib/gitlab/import_export/json/ndjson_reader.rb b/lib/gitlab/import_export/json/ndjson_reader.rb new file mode 100644 index 00000000000000..a3640c84b4ed02 --- /dev/null +++ b/lib/gitlab/import_export/json/ndjson_reader.rb @@ -0,0 +1,47 @@ +module Gitlab + module ImportExport + module JSON + class NdjsonReader + include Gitlab::ImportExport::CommandLineUtil + + attr_reader :dir_path, :root + + def initialize(dir_path, root) + @dir_path = dir_path + @root = root + end + + def valid? 
+ puts "Testing: #{file_path(root)}" + File.exist?(file_path(root)) + end + + def root_attributes(excluded_attributes = []) + each_relation(root) do |hash| + return hash.reject do |key, _| + excluded_attributes.include?(key) + end + end + + nil + end + + def each_relation(key) + path = File.join(@dir_path, "#{key}.ndjson") + return unless File.exist?(file_path(key)) + + File.foreach(file_path(key)).with_index do |line, line_num| + json = ActiveSupport::JSON.decode(line) + yield(json, line_num) + end + end + + private + + def file_path(key) + File.join(@dir_path, "#{key}.ndjson") + end + end + end + end +end diff --git a/lib/gitlab/import_export/json/ndjson_writer.rb b/lib/gitlab/import_export/json/ndjson_writer.rb new file mode 100644 index 00000000000000..06b19ef1d094c3 --- /dev/null +++ b/lib/gitlab/import_export/json/ndjson_writer.rb @@ -0,0 +1,48 @@ +module Gitlab + module ImportExport + module JSON + class NdjsonWriter + include Gitlab::ImportExport::CommandLineUtil + + attr_reader :dir_path + + def initialize(dir_path, root) + @dir_path = dir_path + @root = root + @files = {} + + mkdir_p(dir_path) + end + + def close + @files.values.each(&:close) + @files.clear + end + + def set(hash) + append(@root, hash) + end + + def write(key, value) + append(key, value) + end + + def append(key, value) + h = file(key) + h.write(value.to_json) + h.write("\n") + end + + private + + def file(key) + @files[key] ||= File.open(file_path(key), "wb") + end + + def file_path(key) + File.join(@dir_path, "#{key}.ndjson") + end + end + end + end +end diff --git a/lib/gitlab/import_export/json/streaming_serializer.rb b/lib/gitlab/import_export/json/streaming_serializer.rb new file mode 100644 index 00000000000000..a75ec40d60fb21 --- /dev/null +++ b/lib/gitlab/import_export/json/streaming_serializer.rb @@ -0,0 +1,106 @@ +# frozen_string_literal: true + +module Gitlab + module ImportExport + module JSON + class StreamingSerializer + include Gitlab::ImportExport::CommandLineUtil + + attr_reader :overrides + attr_reader :additional_relations + + BATCH_SIZE = 100 + + class Raw < String + def to_json + to_s + end + end + + def initialize(exportable, relations_tree) + @exportable = exportable + @relations_tree = relations_tree + @overrides = {} + @additional_relations = {} + end + + def execute(json_writers) + serialize_root(json_writers) + + includes.each do |relation_definition| + serialize_relation(json_writers, relation_definition) + end + end + + private + + def serialize_root(json_writers) + attributes = @exportable.as_json( + @relations_tree.merge(include: nil, preloads: nil)) + + data = attributes.merge(overrides) + + json_writers.each do |saver| + saver.set(data) + end + end + + def serialize_relation(json_writers, definition) + raise ArgumentError, 'definition needs to be Hash' unless definition.is_a?(Hash) + raise ArgumentError, 'definition needs to have exactly one Hash element' unless definition.one? 
+ + key = definition.first.first + options = definition.first.second + + record = @exportable.public_send(key) # rubocop: disable GitlabSecurity/PublicSend + + if record.is_a?(ActiveRecord::Relation) + serialize_many_relations(json_writers, key, record, options) + else + serialize_single_relation(json_writers, key, record, options) + end + end + + def serialize_many_relations(json_writers, key, record, options) + key_preloads = preloads&.dig(key) + + record.in_batches(of: BATCH_SIZE) do |batch| # rubocop:disable Cop/InBatches + batch = batch.preload(key_preloads) if key_preloads + + batch.each do |item| + item = Raw.new(item.to_json(options)) + + json_writers.each do |saver| + saver.append(key, item) + end + end + end + + @additional_relations[key].to_a.each do |item| + item = Raw.new(item.to_json(options)) + + json_writers.each do |saver| + saver.append(key, item) + end + end + end + + def serialize_single_relation(json_writers, key, record, options) + json = Raw.new(record.to_json(options)) + + json_writers.each do |saver| + saver.write(key, json) + end + end + + def includes + @relations_tree[:include] + end + + def preloads + @relations_tree[:preload] + end + end + end + end +end diff --git a/lib/gitlab/import_export/project_tree_loader.rb b/lib/gitlab/import_export/project_tree_loader.rb deleted file mode 100644 index fc21858043d895..00000000000000 --- a/lib/gitlab/import_export/project_tree_loader.rb +++ /dev/null @@ -1,72 +0,0 @@ -# frozen_string_literal: true - -module Gitlab - module ImportExport - class ProjectTreeLoader - def load(path, dedup_entries: false) - tree_hash = ActiveSupport::JSON.decode(IO.read(path)) - - if dedup_entries - dedup_tree(tree_hash) - else - tree_hash - end - end - - private - - # This function removes duplicate entries from the given tree recursively - # by caching nodes it encounters repeatedly. We only consider nodes for - # which there can actually be multiple equivalent instances (e.g. strings, - # hashes and arrays, but not `nil`s, numbers or booleans.) - # - # The algorithm uses a recursive depth-first descent with 3 cases, starting - # with a root node (the tree/hash itself): - # - a node has already been cached; in this case we return it from the cache - # - a node has not been cached yet but should be; descend into its children - # - a node is neither cached nor qualifies for caching; this is a no-op - def dedup_tree(node, nodes_seen = {}) - if nodes_seen.key?(node) && distinguishable?(node) - yield nodes_seen[node] - elsif should_dedup?(node) - nodes_seen[node] = node - - case node - when Array - node.each_index do |idx| - dedup_tree(node[idx], nodes_seen) do |cached_node| - node[idx] = cached_node - end - end - when Hash - node.each do |k, v| - dedup_tree(v, nodes_seen) do |cached_node| - node[k] = cached_node - end - end - end - else - node - end - end - - # We do not need to consider nodes for which there cannot be multiple instances - def should_dedup?(node) - node && !(node.is_a?(Numeric) || node.is_a?(TrueClass) || node.is_a?(FalseClass)) - end - - # We can only safely de-dup values that are distinguishable. True value objects - # are always distinguishable by nature. Hashes however can represent entities, - # which are identified by ID, not value. We therefore disallow de-duping hashes - # that do not have an `id` field, since we might risk dropping entities that - # have equal attributes yet different identities. 
-      def distinguishable?(node)
-        if node.is_a?(Hash)
-          node.key?('id')
-        else
-          true
-        end
-      end
-    end
-  end
-end
diff --git a/lib/gitlab/import_export/project_tree_restorer.rb b/lib/gitlab/import_export/project_tree_restorer.rb
index aae07657ea0b88..e409216044ec07 100644
--- a/lib/gitlab/import_export/project_tree_restorer.rb
+++ b/lib/gitlab/import_export/project_tree_restorer.rb
@@ -13,14 +13,22 @@ def initialize(user:, shared:, project:)
         @user = user
         @shared = shared
         @project = project
-        @tree_loader = ProjectTreeLoader.new
       end
 
       def restore
-        @tree_hash = read_tree_hash
-        @project_members = @tree_hash.delete('project_members')
+        @relation_readers = []
+        @relation_readers << ImportExport::JSON::NdjsonReader.new(File.join(shared.export_path, 'tree'), :project)
+        @relation_readers << ImportExport::JSON::DedupLegacyReader.new(File.join(shared.export_path, 'project.json'), project.group)
+        @relation_readers << ImportExport::JSON::LegacyReader.new(File.join(shared.export_path, 'project.json'))
 
-        RelationRenameService.rename(@tree_hash)
+        @relation_reader = @relation_readers.find(&:valid?)
+        raise "missing relation reader for #{shared.export_path}" unless @relation_reader
+
+        @project_members = []
+
+        @relation_reader.each_relation('project_members') do |project_member|
+          @project_members << project_member
+        end
 
         if relation_tree_restorer.restore
           import_failure_service.with_retry(action: 'set_latest_merge_request_diff_ids!') do
@@ -31,34 +40,19 @@ def restore
         else
           false
         end
       rescue => e
         @shared.error(e)
         false
       end
 
       private
 
-      def large_project?(path)
-        File.size(path) >= LARGE_PROJECT_FILE_SIZE_BYTES
-      end
-
-      def read_tree_hash
-        path = File.join(@shared.export_path, 'project.json')
-        dedup_entries = large_project?(path) &&
-          Feature.enabled?(:dedup_project_import_metadata, project.group)
-
-        @tree_loader.load(path, dedup_entries: dedup_entries)
-      rescue => e
-        Rails.logger.error("Import/Export error: #{e.message}") # rubocop:disable Gitlab/RailsLogger
-        raise Gitlab::ImportExport::Error.new('Incorrect JSON format')
-      end
-
       def relation_tree_restorer
         @relation_tree_restorer ||= RelationTreeRestorer.new(
           user: @user,
           shared: @shared,
           importable: @project,
-          tree_hash: @tree_hash,
+          relation_reader: @relation_reader,
           object_builder: object_builder,
           members_mapper: members_mapper,
           relation_factory: relation_factory,
diff --git a/lib/gitlab/import_export/project_tree_saver.rb b/lib/gitlab/import_export/project_tree_saver.rb
index 386a4cfdfc6d59..65c718daf0f5b3 100644
--- a/lib/gitlab/import_export/project_tree_saver.rb
+++ b/lib/gitlab/import_export/project_tree_saver.rb
@@ -3,41 +3,33 @@
 module Gitlab
   module ImportExport
     class ProjectTreeSaver
-      attr_reader :full_path
-
       def initialize(project:, current_user:, shared:, params: {})
         @params = params
         @project = project
         @current_user = current_user
         @shared = shared
-        @full_path = File.join(@shared.export_path, ImportExport.project_filename)
       end
 
       def save
-        project_tree = tree_saver.serialize(@project, reader.project_tree)
-        fix_project_tree(project_tree)
-        tree_saver.save(project_tree, @shared.export_path, ImportExport.project_filename)
+        savers = []
+        savers << ImportExport::JSON::NdjsonWriter.new(File.join(@shared.export_path, "tree"), :project)
+        savers << ImportExport::JSON::LegacyWriter.new(File.join(@shared.export_path, "project.json"))
+
+        serializer = ImportExport::JSON::StreamingSerializer.new(@project, reader.project_tree)
+        serializer.overrides['description'] = @params[:description] if @params[:description].present?
+        serializer.additional_relations['project_members'] = group_members_array
+        serializer.execute(savers)
 
         true
       rescue => e
         @shared.error(e)
         false
+      ensure
+        savers.each(&:close)
       end
 
       private
 
-      # Aware that the resulting hash needs to be pure-hash and
-      # does not include any AR objects anymore, only objects that run `.to_json`
-      def fix_project_tree(project_tree)
-        if @params[:description].present?
-          project_tree['description'] = @params[:description]
-        end
-
-        project_tree['project_members'] += group_members_array
-
-        RelationRenameService.add_new_associations(project_tree)
-      end
-
       def reader
         @reader ||= Gitlab::ImportExport::Reader.new(shared: @shared)
       end
@@ -59,10 +51,6 @@ def group_members
         GroupMembersFinder.new(@project.group).execute.where.not(user_id: non_null_user_ids)
       end
-
-      def tree_saver
-        @tree_saver ||= RelationTreeSaver.new
-      end
     end
   end
 end
diff --git a/lib/gitlab/import_export/relation_tree_restorer.rb b/lib/gitlab/import_export/relation_tree_restorer.rb
index cc01d70db16f1b..12d8952c1b4d3b 100644
--- a/lib/gitlab/import_export/relation_tree_restorer.rb
+++ b/lib/gitlab/import_export/relation_tree_restorer.rb
@@ -9,13 +9,13 @@ class RelationTreeRestorer
       attr_reader :user
       attr_reader :shared
       attr_reader :importable
-      attr_reader :tree_hash
+      attr_reader :relation_reader
 
-      def initialize(user:, shared:, importable:, tree_hash:, members_mapper:, object_builder:, relation_factory:, reader:)
+      def initialize(user:, shared:, importable:, relation_reader:, members_mapper:, object_builder:, relation_factory:, reader:)
         @user = user
         @shared = shared
         @importable = importable
-        @tree_hash = tree_hash
+        @relation_reader = relation_reader
         @members_mapper = members_mapper
         @object_builder = object_builder
         @relation_factory = relation_factory
@@ -51,18 +51,8 @@ def create_relations!
       end
 
       def process_relation!(relation_key, relation_definition)
-        data_hashes = @tree_hash.delete(relation_key)
-        return unless data_hashes
-
-        # we do not care if we process array or hash
-        data_hashes = [data_hashes] unless data_hashes.is_a?(Array)
-
-        relation_index = 0
-
-        # consume and remove objects from memory
-        while data_hash = data_hashes.shift
+        @relation_reader.each_relation(relation_key) do |data_hash, relation_index|
           process_relation_item!(relation_key, relation_definition, relation_index, data_hash)
-          relation_index += 1
         end
       end
 
@@ -110,10 +100,7 @@ def relations
       end
 
      def update_params!
- params = @tree_hash.reject do |key, _| - relations.include?(key) - end - + params = @relation_reader.root_attributes(relations.keys) params = params.merge(present_override_params) # Cleaning all imported and overridden params -- GitLab From 770999b567d70f9c8d37c8697d5ec09425af44b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kamil=20Trzci=C5=84ski?= Date: Tue, 4 Feb 2020 18:24:57 +0100 Subject: [PATCH 2/2] Add tasks --- config/initializers/7_prometheus_metrics.rb | 2 - lib/tasks/gitlab/import_export/export.rake | 190 ++++++++++++++++++++ lib/tasks/gitlab/import_export/import.rake | 47 ++++- 3 files changed, 231 insertions(+), 8 deletions(-) create mode 100644 lib/tasks/gitlab/import_export/export.rake diff --git a/config/initializers/7_prometheus_metrics.rb b/config/initializers/7_prometheus_metrics.rb index aa2601ea65037c..114ac605ccf445 100644 --- a/config/initializers/7_prometheus_metrics.rb +++ b/config/initializers/7_prometheus_metrics.rb @@ -10,8 +10,6 @@ def prometheus_default_multiproc_dir Rails.root.join('tmp/prometheus_multiproc_dir/unicorn') elsif Gitlab::Runtime.puma? Rails.root.join('tmp/prometheus_multiproc_dir/puma') - else - Rails.root.join('tmp/prometheus_multiproc_dir') end end diff --git a/lib/tasks/gitlab/import_export/export.rake b/lib/tasks/gitlab/import_export/export.rake new file mode 100644 index 00000000000000..bb952be8834073 --- /dev/null +++ b/lib/tasks/gitlab/import_export/export.rake @@ -0,0 +1,190 @@ +# frozen_string_literal: true + +namespace :gitlab do + namespace :import_export do + desc 'GitLab | Import/Export | EXPERIMENTAL | Export large project archives' + task :export, [:username, :namespace_path, :project_path, :archive_path] => :gitlab_environment do |_t, args| + # Load it here to avoid polluting Rake tasks with Sidekiq test warnings + require 'sidekiq/testing' + + sleep(15) + + class CopyFileStrategy < Gitlab::ImportExport::AfterExportStrategies::BaseAfterExportStrategy + def initialize(archive_path:) + @archive_path = archive_path + end + + private + + def strategy_execute + FileUtils.mv(project.export_file.path, @archive_path) + end + end + + warn_user_is_not_gitlab + + if ENV['IMPORT_DEBUG'].present? + ActiveRecord::Base.logger = Logger.new(STDOUT) + Gitlab::Metrics::Exporter::SidekiqExporter.instance.start + end + + def with_count_queries(&block) + count = 0 + + counter_f = ->(name, started, finished, unique_id, payload) { + unless payload[:name].in? %w[ CACHE SCHEMA ] + count += 1 + end + } + + ActiveSupport::Notifications.subscribed(counter_f, "sql.active_record", &block) + + puts "Number of SQL calls: #{count}" + end + + def with_measuretime + timing = Benchmark.realtime do + yield + end + puts "Time to finish: #{timing}" + end + + current_user = User.find_by_username(args.username) + namespace = Namespace.find_by_full_path(args.namespace_path) + project = namespace.projects.find_by_path(args.project_path) + + with_count_queries do + with_measuretime do + ::Projects::ImportExport::ExportService.new(project, current_user) + .execute(CopyFileStrategy.new(archive_path: args.archive_path)) + end + end + + puts "Memory usage: #{Gitlab::Metrics::System.memory_usage.to_f / 1024 / 1024} MiB" + puts "GC calls: #{GC.stat[:count]}" + puts "GC major calls: #{GC.stat[:major_gc_count]}" + puts "Label: #{Prometheus::PidProvider.worker_id}" + + sleep(30) + + puts 'Done!' 
+ rescue StandardError => e + puts "Exception: #{e.message}" + puts e.backtrace + exit 1 + end + end +end + +class GitlabProjectImport + def initialize(opts) + @project_path = opts.fetch(:project_path) + @file_path = opts.fetch(:file_path) + @namespace = Namespace.find_by_full_path(opts.fetch(:namespace_path)) + @current_user = User.find_by_username(opts.fetch(:username)) + end + + def import + show_import_start_message + + run_isolated_sidekiq_job + + show_import_failures_count + + if @project&.import_state&.last_error + puts "ERROR: #{@project.import_state.last_error}" + exit 1 + elsif @project.errors.any? + puts "ERROR: #{@project.errors.full_messages.join(', ')}" + exit 1 + else + puts 'Done!' + end + rescue StandardError => e + puts "Exception: #{e.message}" + puts e.backtrace + exit 1 + end + + private + + def with_request_store + RequestStore.begin! + yield + ensure + RequestStore.end! + RequestStore.clear! + end + + # We want to ensure that all Sidekiq jobs are executed + # synchronously as part of that process. + # This ensures that all expensive operations do not escape + # to general Sidekiq clusters/nodes. + def run_isolated_sidekiq_job + Sidekiq::Testing.fake! do + with_request_store do + @project = create_project + + execute_sidekiq_job + end + true + end + end + + def create_project + # We are disabling ObjectStorage for `import` + # as it is too slow to handle big archives: + # 1. DB transaction timeouts on upload + # 2. Download of archive before unpacking + disable_upload_object_storage do + service = Projects::GitlabProjectsImportService.new( + @current_user, + { + namespace_id: @namespace.id, + path: @project_path, + file: File.open(@file_path) + } + ) + + service.execute + end + end + + def execute_sidekiq_job + Sidekiq::Worker.drain_all + end + + def disable_upload_object_storage + overwrite_uploads_setting('background_upload', false) do + overwrite_uploads_setting('direct_upload', false) do + yield + end + end + end + + def overwrite_uploads_setting(key, value) + old_value = Settings.uploads.object_store[key] + Settings.uploads.object_store[key] = value + + yield + + ensure + Settings.uploads.object_store[key] = old_value + end + + def full_path + "#{@namespace.full_path}/#{@project_path}" + end + + def show_import_start_message + puts "Importing GitLab export: #{@file_path} into GitLab" \ + " #{full_path}" \ + " as #{@current_user.name}" + end + + def show_import_failures_count + return unless @project.import_failures.exists? + + puts "Total number of not imported relations: #{@project.import_failures.count}" + end +end diff --git a/lib/tasks/gitlab/import_export/import.rake b/lib/tasks/gitlab/import_export/import.rake index 1f38b31c1e08ae..1c0759338db48c 100644 --- a/lib/tasks/gitlab/import_export/import.rake +++ b/lib/tasks/gitlab/import_export/import.rake @@ -16,18 +16,53 @@ namespace :gitlab do # Load it here to avoid polluting Rake tasks with Sidekiq test warnings require 'sidekiq/testing' + sleep(15) + warn_user_is_not_gitlab if ENV['IMPORT_DEBUG'].present? ActiveRecord::Base.logger = Logger.new(STDOUT) + Gitlab::Metrics::Exporter::SidekiqExporter.instance.start + end + + def with_count_queries(&block) + count = 0 + + counter_f = ->(name, started, finished, unique_id, payload) { + unless payload[:name].in? 
%w[ CACHE SCHEMA ] + count += 1 + end + } + + ActiveSupport::Notifications.subscribed(counter_f, "sql.active_record", &block) + + puts "Number of SQL calls: #{count}" + end + + def with_measuretime + timing = Benchmark.realtime do + yield + end + puts "Time to finish: #{timing}" end - GitlabProjectImport.new( - namespace_path: args.namespace_path, - project_path: args.project_path, - username: args.username, - file_path: args.archive_path - ).import + with_count_queries do + with_measuretime do + GitlabProjectImport.new( + namespace_path: args.namespace_path, + project_path: args.project_path, + username: args.username, + file_path: args.archive_path + ).import + end + end + + puts "Memory usage: #{Gitlab::Metrics::System.memory_usage.to_f / 1024 / 1024} MiB" + puts "GC calls: #{GC.stat[:count]}" + puts "GC major calls: #{GC.stat[:major_gc_count]}" + puts "Label: #{Prometheus::PidProvider.worker_id}" + + sleep(30) end end end -- GitLab
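
A note on the in-place append trick that `LegacyWriter` implements above: the
file always stays a complete, valid JSON document, and each write seeks back
over the trailing '}' (or ']}' while an array is open) and overwrites it, so
items can be appended one at a time in constant memory. The following
standalone sketch shows the same technique; the file name and the "issues"
key are illustrative only and not part of the patch:

require 'json'

File.write('demo.json', '{}') # start with an empty, but valid, document

File.open('demo.json', 'r+b') do |file|
  # Open an array under a new key: rewind over the trailing '}'.
  file.seek(-1, IO::SEEK_END)
  file.write('"issues":[]}')

  # Append items one by one: rewind over ']}' and splice each in.
  [{ 'id' => 1 }, { 'id' => 2 }].each_with_index do |item, index|
    file.seek(-2, IO::SEEK_END)
    file.write(',') if index > 0
    file.write(item.to_json)
    file.write(']}')
  end
end

puts File.read('demo.json') # => {"issues":[{"id":1},{"id":2}]}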
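
For the ndjson side, the on-disk contract between `NdjsonWriter` and
`NdjsonReader` is one file per relation inside the export's `tree/` directory:
the root attributes are a single JSON line in `project.ndjson`, and every
relation item is one JSON document per line in `<relation>.ndjson`. Reading
such a file therefore needs only one line in memory at a time. A rough sketch
of the consuming side, with example directory and relation names that are
assumptions rather than the exporter's full contract:

require 'json'

# tree/project.ndjson => {"name":"demo","description":"..."}
# tree/issues.ndjson  => {"title":"first"}
#                        {"title":"second"}
def each_relation(dir, key)
  path = File.join(dir, "#{key}.ndjson")
  return unless File.exist?(path)

  # File.foreach streams the file line by line, so memory use stays
  # constant no matter how many items the relation holds.
  File.foreach(path).with_index do |line, index|
    yield JSON.parse(line), index
  end
end

each_relation('tree', 'issues') do |item, index|
  puts "issue #{index}: #{item['title']}"
end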