From 2708d5597a84328ab2333da8bd228a3d31960d5b Mon Sep 17 00:00:00 2001 From: ruslandoga <67764432+ruslandoga@users.noreply.github.com> Date: Thu, 14 Mar 2024 12:31:34 +0800 Subject: [PATCH 1/7] encode/decode date range in filenames --- config/.env.dev | 1 + config/.env.test | 1 + config/runtime.exs | 8 +- lib/plausible/imported/csv_importer.ex | 171 +++++++++++++----- lib/plausible/s3.ex | 61 +++++++ lib/workers/export_csv.ex | 19 +- test/plausible/config_test.exs | 20 +- test/plausible/imported/csv_importer_test.exs | 82 +++++---- test/plausible/s3_test.exs | 4 + 9 files changed, 265 insertions(+), 102 deletions(-) create mode 100644 test/plausible/s3_test.exs diff --git a/config/.env.dev b/config/.env.dev index 32ab90a455c0..58e4ca827d15 100644 --- a/config/.env.dev +++ b/config/.env.dev @@ -27,3 +27,4 @@ S3_SECRET_ACCESS_KEY=minioadmin S3_REGION=us-east-1 S3_ENDPOINT=http://localhost:10000 S3_EXPORTS_BUCKET=dev-exports +S3_IMPORTS_BUCKET=dev-imports diff --git a/config/.env.test b/config/.env.test index d41cde0ebf5b..8fe3862a767b 100644 --- a/config/.env.test +++ b/config/.env.test @@ -22,3 +22,4 @@ S3_SECRET_ACCESS_KEY=minioadmin S3_REGION=us-east-1 S3_ENDPOINT=http://localhost:10000 S3_EXPORTS_BUCKET=test-exports +S3_IMPORTS_BUCKET=test-imports diff --git a/config/runtime.exs b/config/runtime.exs index cd1fbbb75a86..ad64d357fa18 100644 --- a/config/runtime.exs +++ b/config/runtime.exs @@ -736,6 +736,10 @@ unless s3_disabled? do %{ name: "S3_EXPORTS_BUCKET", example: "my-csv-exports-bucket" + }, + %{ + name: "S3_IMPORTS_BUCKET", + example: "my-csv-imports-bucket" } ] @@ -771,5 +775,7 @@ unless s3_disabled? 
do host: s3_host, port: s3_port - config :plausible, Plausible.S3, exports_bucket: s3_env_value.("S3_EXPORTS_BUCKET") + config :plausible, Plausible.S3, + exports_bucket: s3_env_value.("S3_EXPORTS_BUCKET"), + imports_bucket: s3_env_value.("S3_IMPORTS_BUCKET") end diff --git a/lib/plausible/imported/csv_importer.ex b/lib/plausible/imported/csv_importer.ex index d2d02e5426bb..137a2353c4ce 100644 --- a/lib/plausible/imported/csv_importer.ex +++ b/lib/plausible/imported/csv_importer.ex @@ -20,7 +20,13 @@ defmodule Plausible.Imported.CSVImporter do @impl true def import_data(site_import, opts) do - %{id: import_id, site_id: site_id} = site_import + %{ + id: import_id, + site_id: site_id, + start_date: start_date, + end_date: end_date + } = site_import + uploads = Keyword.fetch!(opts, :uploads) %{access_key_id: s3_access_key_id, secret_access_key: s3_secret_access_key} = @@ -31,52 +37,36 @@ defmodule Plausible.Imported.CSVImporter do |> Keyword.replace!(:pool_size, 1) |> Ch.start_link() - ranges = - Enum.map(uploads, fn upload -> - %{"filename" => filename, "s3_url" => s3_url} = upload - - ".csv" = Path.extname(filename) - table = Path.rootname(filename) - ensure_importable_table!(table) - - s3_structure = input_structure!(table) - - statement = - """ - INSERT INTO {table:Identifier} \ - SELECT {site_id:UInt64} AS site_id, *, {import_id:UInt64} AS import_id \ - FROM s3({s3_url:String},{s3_access_key_id:String},{s3_secret_access_key:String},{s3_format:String},{s3_structure:String})\ - """ - - params = - %{ - "table" => table, - "site_id" => site_id, - "import_id" => import_id, - "s3_url" => s3_url, - "s3_access_key_id" => s3_access_key_id, - "s3_secret_access_key" => s3_secret_access_key, - "s3_format" => "CSVWithNames", - "s3_structure" => s3_structure - } - - Ch.query!(ch, statement, params, timeout: :infinity) - - %Ch.Result{rows: [[min_date, max_date]]} = - Ch.query!( - ch, - "SELECT min(date), max(date) FROM {table:Identifier} WHERE site_id = {site_id:UInt64} AND 
import_id = {import_id:UInt64}", - %{"table" => table, "site_id" => site_id, "import_id" => import_id} - ) - - Date.range(min_date, max_date) - end) - - {:ok, - %{ - start_date: Enum.min_by(ranges, & &1.first, Date).first, - end_date: Enum.max_by(ranges, & &1.last, Date).last - }} + Enum.each(uploads, fn upload -> + %{"filename" => filename, "s3_url" => s3_url} = upload + + {table, _, _} = parse_filename!(filename) + s3_structure = input_structure!(table) + + statement = + """ + INSERT INTO {table:Identifier} \ + SELECT {site_id:UInt64} AS site_id, *, {import_id:UInt64} AS import_id \ + FROM s3({s3_url:String},{s3_access_key_id:String},{s3_secret_access_key:String},{s3_format:String},{s3_structure:String}) \ + WHERE date >= {start_date:Date} AND date <= {end_date:Date}\ + """ + + params = + %{ + "table" => table, + "site_id" => site_id, + "import_id" => import_id, + "s3_url" => s3_url, + "s3_access_key_id" => s3_access_key_id, + "s3_secret_access_key" => s3_secret_access_key, + "s3_format" => "CSVWithNames", + "s3_structure" => s3_structure, + "start_date" => start_date, + "end_date" => end_date + } + + Ch.query!(ch, statement, params, timeout: :infinity) + end) rescue # we are cancelling on any argument or ClickHouse errors e in [ArgumentError, Ch.Error] -> @@ -103,12 +93,93 @@ defmodule Plausible.Imported.CSVImporter do "date Date, visitors UInt64, pageviews UInt64, bounces UInt64, visits UInt64, visit_duration UInt64" } + @doc """ + Extracts min/max date range from a list of uploads. 
+ + Examples: + + iex> date_range([ + ...> %{"filename" => "imported_devices_20190101_20210101.csv"}, + ...> "imported_pages_20200101_20220101.csv" + ...> ]) + Date.range(~D[2019-01-01], ~D[2022-01-01]) + + iex> date_range([]) + ** (ArgumentError) empty uploads + + """ + @spec date_range([String.t() | %{String.t() => String.t()}, ...]) :: Date.Range.t() + def date_range([_ | _] = uploads), do: date_range(uploads, _start_date = nil, _end_date = nil) + def date_range([]), do: raise(ArgumentError, "empty uploads") + + defp date_range([upload | uploads], prev_start_date, prev_end_date) do + filename = + case upload do + %{"filename" => filename} -> filename + filename when is_binary(filename) -> filename + end + + {_table, start_date, end_date} = parse_filename!(filename) + + start_date = + if prev_start_date do + min_date(start_date, prev_start_date) + else + start_date + end + + end_date = + if prev_end_date do + max_date(end_date, prev_end_date) + else + end_date + end + + date_range(uploads, start_date, end_date) + end + + defp date_range([], first, last), do: Date.range(first, last) + + defp min_date(d1, d2) do + if Date.compare(d1, d2) == :lt, do: d1, else: d2 + end + + defp max_date(d1, d2) do + if Date.compare(d1, d2) == :gt, do: d1, else: d2 + end + + @spec parse_date!(String.t()) :: Date.t() + defp parse_date!(date) do + date |> Timex.parse!("{YYYY}{0M}{0D}") |> NaiveDateTime.to_date() + end + + @doc """ + Extracts table name and min/max dates from the filename. 
+ + Examples: + + iex> parse_filename!("my_data.csv") + ** (ArgumentError) invalid filename + + iex> parse_filename!("imported_devices_00010101_20250101.csv") + {"imported_devices", ~D[0001-01-01], ~D[2025-01-01]} + + """ + @spec parse_filename!(String.t()) :: + {table :: String.t(), start_date :: Date.t(), end_date :: Date.t()} + def parse_filename!(filename) + for {table, input_structure} <- input_structures do defp input_structure!(unquote(table)), do: unquote(input_structure) - defp ensure_importable_table!(unquote(table)), do: :ok + + def parse_filename!( + <<unquote(table)::bytes, ?_, start_date::8-bytes, ?_, end_date::8-bytes, ".csv">> + ) do + {unquote(table), parse_date!(start_date), parse_date!(end_date)} + end end - defp ensure_importable_table!(table) do - raise ArgumentError, "table #{table} is not supported for data import" + def parse_filename!(_filename) do + raise ArgumentError, "invalid filename" end end diff --git a/lib/plausible/s3.ex b/lib/plausible/s3.ex index 9086e002e15e..2768ce9eff35 100644 --- a/lib/plausible/s3.ex +++ b/lib/plausible/s3.ex @@ -18,9 +18,64 @@ defmodule Plausible.S3 do @spec exports_bucket :: String.t() def exports_bucket, do: config(:exports_bucket) + @doc """ + Returns the pre-configured S3 bucket for CSV imports. + + config :plausible, Plausible.S3, + imports_bucket: System.fetch_env!("S3_IMPORTS_BUCKET") + + Example: + + iex> imports_bucket() + "test-imports" + + """ + @spec imports_bucket :: String.t() + def imports_bucket, do: config(:imports_bucket) + defp config, do: Application.fetch_env!(:plausible, __MODULE__) defp config(key), do: Keyword.fetch!(config(), key) + @doc """ + Presigns an upload for an imported file. + + In the current implementation the bucket always goes into the path component. 
+ + Example: + + iex> %{ + ...> s3_url: "http://localhost:10000/test-imports/123/imported_browsers.csv", + ...> presigned_url: "http://localhost:10000/test-imports/123/imported_browsers.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=minioadmin" <> _ + ...> } = import_presign_upload(_site_id = 123, _filename = "imported_browsers.csv") + + """ + def import_presign_upload(site_id, filename) do + config = ExAws.Config.new(:s3) + s3_path = Path.join(to_string(site_id), filename) + bucket = imports_bucket() + {:ok, presigned_url} = ExAws.S3.presigned_url(config, :put, bucket, s3_path) + %{s3_url: extract_s3_url(presigned_url), presigned_url: presigned_url} + end + + # to make ClickHouse see MinIO in dev and test envs we replace + # the host in the S3 URL with whatever's set in S3_CLICKHOUSE_HOST env var + if Mix.env() in [:dev, :test, :small_dev, :small_test] do + defp extract_s3_url(presigned_url) do + [s3_url, _] = String.split(presigned_url, "?") + + if ch_host = System.get_env("S3_CLICKHOUSE_HOST") do + URI.to_string(%URI{URI.parse(s3_url) | host: ch_host}) + else + s3_url + end + end + else + defp extract_s3_url(presigned_url) do + [s3_url, _] = String.split(presigned_url, "?") + s3_url + end + end + @doc """ Chunks and uploads Zip archive to the provided S3 destination. @@ -77,6 +132,12 @@ defmodule Plausible.S3 do @doc """ Returns `access_key_id` and `secret_access_key` to be used by ClickHouse during imports from S3. + + Example: + + iex> import_clickhouse_credentials() + %{access_key_id: "minioadmin", secret_access_key: "minioadmin"} + """ @spec import_clickhouse_credentials :: %{access_key_id: String.t(), secret_access_key: String.t()} diff --git a/lib/workers/export_csv.ex b/lib/workers/export_csv.ex index 1eb3d42e1ecc..e5c803e06466 100644 --- a/lib/workers/export_csv.ex +++ b/lib/workers/export_csv.ex @@ -25,13 +25,14 @@ defmodule Plausible.Workers.ExportCSV do |> Keyword.replace!(:pool_size, 1) |> Ch.start_link() + # NOTE: what if 1970-01-01? 
# NOTE: should we use site.timezone? - # %Ch.Result{rows: [[min_date, max_date]]} = - # Ch.query!( - # ch, - # "SELECT toDate(min(timestamp)), toDate(max(timestamp)) FROM events_v2 WHERE site_id={site_id:UInt64}", - # %{"site_id" => site_id} - # ) + %Ch.Result{rows: [[min_date, max_date]]} = + Ch.query!( + ch, + "SELECT toDate(min(timestamp)), toDate(max(timestamp)) FROM events_v2 WHERE site_id={site_id:UInt64}", + %{"site_id" => site_id} + ) download_url = DBConnection.run( @@ -39,8 +40,10 @@ defmodule Plausible.Workers.ExportCSV do fn conn -> conn |> Plausible.Exports.stream_archive( - # date_range: Date.range(min_date, max_date) - Plausible.Exports.export_queries(site_id, extname: ".csv"), + Plausible.Exports.export_queries(site_id, + date_range: Date.range(min_date, max_date), + extname: ".csv" + ), format: "CSVWithNames" ) |> Plausible.S3.export_upload_multipart(s3_bucket, s3_path, s3_config_overrides(args)) diff --git a/test/plausible/config_test.exs b/test/plausible/config_test.exs index 5ba0005bf812..acde4f58e5e9 100644 --- a/test/plausible/config_test.exs +++ b/test/plausible/config_test.exs @@ -198,7 +198,8 @@ defmodule Plausible.ConfigTest do {"S3_SECRET_ACCESS_KEY", nil}, {"S3_REGION", nil}, {"S3_ENDPOINT", nil}, - {"S3_EXPORTS_BUCKET", nil} + {"S3_EXPORTS_BUCKET", nil}, + {"S3_IMPORTS_BUCKET", nil} ] result = @@ -211,13 +212,14 @@ defmodule Plausible.ConfigTest do assert %ArgumentError{} = result assert Exception.message(result) == """ - Missing S3 configuration. Please set S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY, S3_REGION, S3_ENDPOINT, S3_EXPORTS_BUCKET environment variable(s): + Missing S3 configuration. 
Please set S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY, S3_REGION, S3_ENDPOINT, S3_EXPORTS_BUCKET, S3_IMPORTS_BUCKET environment variable(s): \tS3_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE \tS3_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY \tS3_REGION=us-east-1 \tS3_ENDPOINT=https://.r2.cloudflarestorage.com \tS3_EXPORTS_BUCKET=my-csv-exports-bucket + \tS3_IMPORTS_BUCKET=my-csv-imports-bucket """ end @@ -227,7 +229,8 @@ defmodule Plausible.ConfigTest do {"S3_SECRET_ACCESS_KEY", nil}, {"S3_REGION", "eu-north-1"}, {"S3_ENDPOINT", nil}, - {"S3_EXPORTS_BUCKET", "my-exports"} + {"S3_EXPORTS_BUCKET", "my-exports"}, + {"S3_IMPORTS_BUCKET", nil} ] result = @@ -240,10 +243,11 @@ defmodule Plausible.ConfigTest do assert %ArgumentError{} = result assert Exception.message(result) == """ - Missing S3 configuration. Please set S3_SECRET_ACCESS_KEY, S3_ENDPOINT environment variable(s): + Missing S3 configuration. Please set S3_SECRET_ACCESS_KEY, S3_ENDPOINT, S3_IMPORTS_BUCKET environment variable(s): \tS3_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY \tS3_ENDPOINT=https://.r2.cloudflarestorage.com + \tS3_IMPORTS_BUCKET=my-csv-imports-bucket """ end @@ -253,7 +257,8 @@ defmodule Plausible.ConfigTest do {"S3_SECRET_ACCESS_KEY", "minioadmin"}, {"S3_REGION", "us-east-1"}, {"S3_ENDPOINT", "http://localhost:6000"}, - {"S3_EXPORTS_BUCKET", "my-exports"} + {"S3_EXPORTS_BUCKET", "my-exports"}, + {"S3_IMPORTS_BUCKET", "my-imports"} ] config = runtime_config(env) @@ -266,8 +271,9 @@ defmodule Plausible.ConfigTest do s3: [scheme: "http://", host: "localhost", port: 6000] ] - assert get_in(runtime_config(env), [:plausible, Plausible.S3]) == [ - exports_bucket: "my-exports" + assert get_in(config, [:plausible, Plausible.S3]) == [ + exports_bucket: "my-exports", + imports_bucket: "my-imports" ] end end diff --git a/test/plausible/imported/csv_importer_test.exs b/test/plausible/imported/csv_importer_test.exs index 60322a3852e9..5b988483cd5d 100644 --- 
a/test/plausible/imported/csv_importer_test.exs +++ b/test/plausible/imported/csv_importer_test.exs @@ -4,6 +4,8 @@ defmodule Plausible.Imported.CSVImporterTest do alias Testcontainers.MinioContainer require SiteImport + doctest CSVImporter, import: true + @moduletag :minio setup_all do @@ -55,9 +57,12 @@ defmodule Plausible.Imported.CSVImporterTest do "imported_visitors" ] + start_date = "20231001" + end_date = "20240102" + uploads = Enum.map(tables, fn table -> - filename = "#{table}.csv" + filename = "#{table}_#{start_date}_#{end_date}.csv" %{ "filename" => filename, @@ -65,11 +70,12 @@ defmodule Plausible.Imported.CSVImporterTest do } end) + date_range = CSVImporter.date_range(uploads) + assert {:ok, job} = CSVImporter.new_import(site, user, - # to satisfy the non null constraints on the table I'm providing "0" dates (according to ClickHouse) - start_date: ~D[1970-01-01], - end_date: ~D[1970-01-01], + start_date: date_range.first, + end_date: date_range.last, uploads: uploads ) @@ -80,8 +86,8 @@ defmodule Plausible.Imported.CSVImporterTest do %{ id: ^import_id, source: :csv, - start_date: ~D[1970-01-01], - end_date: ~D[1970-01-01], + start_date: ~D[2023-10-01], + end_date: ~D[2024-01-02], status: SiteImport.pending() } ] = Plausible.Imported.list_all_imports(site) @@ -97,7 +103,7 @@ defmodule Plausible.Imported.CSVImporterTest do test "imports tables from S3", %{site: site, user: user, s3: s3, container: minio} do csvs = [ %{ - name: "imported_browsers.csv", + name: "imported_browsers_20211230_20211231.csv", body: """ "date","browser","visitors","visits","visit_duration","bounces" "2021-12-30","Amazon Silk",2,2,0,2 @@ -122,7 +128,7 @@ defmodule Plausible.Imported.CSVImporterTest do """ }, %{ - name: "imported_devices.csv", + name: "imported_devices_20211230_20220102.csv", body: """ "date","device","visitors","visits","visit_duration","bounces" "2021-12-30","Desktop",25,28,75,27 @@ -140,7 +146,7 @@ defmodule Plausible.Imported.CSVImporterTest do """ }, %{ - 
name: "imported_entry_pages.csv", + name: "imported_entry_pages_20211230_20211231.csv", body: """ "date","visitors","entrances","visit_duration","bounces","entry_page" "2021-12-30",6,6,0,6,"/14776416252794997127" @@ -173,7 +179,7 @@ defmodule Plausible.Imported.CSVImporterTest do """ }, %{ - name: "imported_exit_pages.csv", + name: "imported_exit_pages_20211230_20211231.csv", body: """ "date","visitors","exits","exit_page" "2021-12-30",6,6,"/14776416252794997127" @@ -198,7 +204,7 @@ defmodule Plausible.Imported.CSVImporterTest do """ }, %{ - name: "imported_locations.csv", + name: "imported_locations_20211230_20211231.csv", body: """ "date","country","region","city","visitors","visits","visit_duration","bounces" "2021-12-30","AU","",0,1,1,43,0 @@ -235,7 +241,7 @@ defmodule Plausible.Imported.CSVImporterTest do """ }, %{ - name: "imported_operating_systems.csv", + name: "imported_operating_systems_20211230_20220101.csv", body: """ "date","operating_system","visitors","visits","visit_duration","bounces" "2021-12-30","Android",25,26,254,24 @@ -255,7 +261,7 @@ defmodule Plausible.Imported.CSVImporterTest do """ }, %{ - name: "imported_pages.csv", + name: "imported_pages_20211230_20220101.csv", body: """ "date","visitors","pageviews","exits","time_on_page","hostname","page" "2021-12-30",1,1,0,43,"lucky.numbers.com","/14776416252794997127" @@ -277,7 +283,7 @@ defmodule Plausible.Imported.CSVImporterTest do """ }, %{ - name: "imported_sources.csv", + name: "imported_sources_20211230_20220106.csv", body: """ "date","source","utm_medium","utm_campaign","utm_content","utm_term","visitors","visits","visit_duration","bounces" "2021-12-30","","","","","",25,26,254,24 @@ -307,7 +313,7 @@ defmodule Plausible.Imported.CSVImporterTest do """ }, %{ - name: "imported_visitors.csv", + name: "imported_visitors_20111225_20111230.csv", body: """ "date","visitors","pageviews","bounces","visits","visit_duration" "2011-12-25",5,50,2,7,8640 @@ -327,13 +333,12 @@ defmodule 
Plausible.Imported.CSVImporterTest do %{"filename" => name, "s3_url" => minio_url(minio, "imports", key)} end + date_range = CSVImporter.date_range(uploads) + {:ok, job} = - CSVImporter.new_import( - site, - user, - # to satisfy the non null constraints on the table I'm providing "0" dates (according to ClickHouse) - start_date: ~D[1970-01-01], - end_date: ~D[1970-01-01], + CSVImporter.new_import(site, user, + start_date: date_range.first, + end_date: date_range.last, uploads: uploads ) @@ -341,7 +346,6 @@ defmodule Plausible.Imported.CSVImporterTest do assert :ok = Plausible.Workers.ImportAnalytics.perform(job) - # on successfull import the start and end dates are updated assert %SiteImport{ start_date: ~D[2011-12-25], end_date: ~D[2022-01-06], @@ -355,7 +359,7 @@ defmodule Plausible.Imported.CSVImporterTest do test "fails on invalid CSV", %{site: site, user: user, s3: s3, container: minio} do csvs = [ %{ - name: "imported_browsers.csv", + name: "imported_browsers_20211230_20211231.csv", body: """ "date","browser","visitors","visits","visit_duration","bounces" "2021-12-30","Amazon Silk",2,2,0,2 @@ -368,7 +372,7 @@ defmodule Plausible.Imported.CSVImporterTest do """ }, %{ - name: "imported_devices.csv", + name: "imported_devices_20211230_20211231.csv", body: """ "date","device","visitors","visit_duration","bounces" "2021-12-30","Desktop",28,ehhhh.... 
@@ -383,12 +387,12 @@ defmodule Plausible.Imported.CSVImporterTest do %{"filename" => name, "s3_url" => minio_url(minio, "imports", key)} end + date_range = CSVImporter.date_range(uploads) + {:ok, job} = - CSVImporter.new_import( - site, - user, - start_date: ~D[1970-01-01], - end_date: ~D[1970-01-01], + CSVImporter.new_import(site, user, + start_date: date_range.first, + end_date: date_range.last, uploads: uploads ) @@ -508,12 +512,12 @@ defmodule Plausible.Imported.CSVImporterTest do end) # run importer + date_range = CSVImporter.date_range(uploads) + {:ok, job} = - CSVImporter.new_import( - site, - user, - start_date: ~D[1970-01-01], - end_date: ~D[1970-01-01], + CSVImporter.new_import(site, user, + start_date: date_range.first, + end_date: date_range.last, uploads: uploads ) @@ -533,7 +537,13 @@ defmodule Plausible.Imported.CSVImporterTest do end defp minio_url(minio, bucket, key) do - port = minio |> MinioContainer.connection_opts() |> Keyword.fetch!(:port) - Path.join(["http://172.17.0.1:#{port}", bucket, key]) + arch = to_string(:erlang.system_info(:system_architecture)) + + if String.contains?(arch, "darwin") do + Path.join(["http://#{minio.ip_address}:9000", bucket, key]) + else + port = minio |> MinioContainer.connection_opts() |> Keyword.fetch!(:port) + Path.join(["http://172.17.0.1:#{port}", bucket, key]) + end end end diff --git a/test/plausible/s3_test.exs b/test/plausible/s3_test.exs new file mode 100644 index 000000000000..7a14b403bb4a --- /dev/null +++ b/test/plausible/s3_test.exs @@ -0,0 +1,4 @@ +defmodule Plausible.S3Test do + use ExUnit.Case, async: true + doctest Plausible.S3, import: true +end From 7b4a23532cf1b20be8ebe9bc0e2d26a75b03e866 Mon Sep 17 00:00:00 2001 From: ruslandoga Date: Tue, 19 Mar 2024 17:17:37 +0800 Subject: [PATCH 2/7] Update lib/plausible/imported/csv_importer.ex Co-authored-by: Adrian Gruntkowski --- lib/plausible/imported/csv_importer.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/lib/plausible/imported/csv_importer.ex b/lib/plausible/imported/csv_importer.ex index 137a2353c4ce..034e95862d6f 100644 --- a/lib/plausible/imported/csv_importer.ex +++ b/lib/plausible/imported/csv_importer.ex @@ -123,7 +123,7 @@ defmodule Plausible.Imported.CSVImporter do start_date = if prev_start_date do - min_date(start_date, prev_start_date) + Enum.min([start_date, prev_start_date], Date) else start_date end From d50d566bb29c9d5b455972fea1ea22633b27c402 Mon Sep 17 00:00:00 2001 From: ruslandoga Date: Tue, 19 Mar 2024 17:17:43 +0800 Subject: [PATCH 3/7] Update lib/plausible/imported/csv_importer.ex Co-authored-by: Adrian Gruntkowski --- lib/plausible/imported/csv_importer.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/plausible/imported/csv_importer.ex b/lib/plausible/imported/csv_importer.ex index 034e95862d6f..d9484b1d0adc 100644 --- a/lib/plausible/imported/csv_importer.ex +++ b/lib/plausible/imported/csv_importer.ex @@ -130,7 +130,7 @@ defmodule Plausible.Imported.CSVImporter do end_date = if prev_end_date do - max_date(end_date, prev_end_date) + Enum.max([end_date, prev_end_date], Date) else end_date end From 11207288a7a2efa60c1f99d1f810ae8b87e168a7 Mon Sep 17 00:00:00 2001 From: ruslandoga <67764432+ruslandoga@users.noreply.github.com> Date: Tue, 19 Mar 2024 17:25:18 +0800 Subject: [PATCH 4/7] drop unused functions --- lib/plausible/imported/csv_importer.ex | 8 -------- 1 file changed, 8 deletions(-) diff --git a/lib/plausible/imported/csv_importer.ex b/lib/plausible/imported/csv_importer.ex index d9484b1d0adc..2d10ca32e98c 100644 --- a/lib/plausible/imported/csv_importer.ex +++ b/lib/plausible/imported/csv_importer.ex @@ -140,14 +140,6 @@ defmodule Plausible.Imported.CSVImporter do defp date_range([], first, last), do: Date.range(first, last) - defp min_date(d1, d2) do - if Date.compare(d1, d2) == :lt, do: d1, else: d2 - end - - defp max_date(d1, d2) do - if Date.compare(d1, d2) == :gt, do: d1, else: d2 - end - @spec 
parse_date!(String.t()) :: Date.t() defp parse_date!(date) do date |> Timex.parse!("{YYYY}{0M}{0D}") |> NaiveDateTime.to_date() From 91a9505594449c486fa3aecc8415b34c7790013b Mon Sep 17 00:00:00 2001 From: ruslandoga <67764432+ruslandoga@users.noreply.github.com> Date: Tue, 19 Mar 2024 17:35:39 +0800 Subject: [PATCH 5/7] send failure email if there is no data to export --- lib/workers/export_csv.ex | 70 ++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 30 deletions(-) diff --git a/lib/workers/export_csv.ex b/lib/workers/export_csv.ex index e5c803e06466..a992005cfd2b 100644 --- a/lib/workers/export_csv.ex +++ b/lib/workers/export_csv.ex @@ -25,8 +25,6 @@ defmodule Plausible.Workers.ExportCSV do |> Keyword.replace!(:pool_size, 1) |> Ch.start_link() - # NOTE: what if 1970-01-01? - # NOTE: should we use site.timezone? %Ch.Result{rows: [[min_date, max_date]]} = Ch.query!( ch, @@ -34,37 +32,49 @@ defmodule Plausible.Workers.ExportCSV do %{"site_id" => site_id} ) - download_url = - DBConnection.run( - ch, - fn conn -> - conn - |> Plausible.Exports.stream_archive( - Plausible.Exports.export_queries(site_id, - date_range: Date.range(min_date, max_date), - extname: ".csv" - ), - format: "CSVWithNames" - ) - |> Plausible.S3.export_upload_multipart(s3_bucket, s3_path, s3_config_overrides(args)) - end, - timeout: :infinity + if max_date == ~D[1970-01-01] do + # NOTE: replace with proper Plausible.Email template + Plausible.Mailer.deliver_now!( + Bamboo.Email.new_email( + from: "plausible@email.com", + to: email, + subject: "EXPORT FAILURE", + text_body: "there is nothing to export" + ) ) + else + download_url = + DBConnection.run( + ch, + fn conn -> + conn + |> Plausible.Exports.stream_archive( + Plausible.Exports.export_queries(site_id, + date_range: Date.range(min_date, max_date), + extname: ".csv" + ), + format: "CSVWithNames" + ) + |> Plausible.S3.export_upload_multipart(s3_bucket, s3_path, s3_config_overrides(args)) + end, + timeout: :infinity + ) - 
# NOTE: replace with proper Plausible.Email template - Plausible.Mailer.deliver_now!( - Bamboo.Email.new_email( - from: "plausible@email.com", - to: email, - subject: "EXPORT SUCCESS", - text_body: """ - download it from #{download_url}! hurry up! you have 24 hours!" - """, - html_body: """ - download it from <a href="#{download_url}">here</a>! hurry up! you have 24 hours! - """ + # NOTE: replace with proper Plausible.Email template + Plausible.Mailer.deliver_now!( + Bamboo.Email.new_email( + from: "plausible@email.com", + to: email, + subject: "EXPORT SUCCESS", + text_body: """ + download it from #{download_url}! hurry up! you have 24 hours!" + """, + html_body: """ + download it from <a href="#{download_url}">here</a>! hurry up! you have 24 hours! + """ + ) ) - ) + end :ok end From 2011c66d1dd4e26b8396ee92068e716d497b55e7 Mon Sep 17 00:00:00 2001 From: ruslandoga <67764432+ruslandoga@users.noreply.github.com> Date: Tue, 19 Mar 2024 17:36:50 +0800 Subject: [PATCH 6/7] use PlausibleWeb.Email.mailer_email_from() --- lib/workers/export_csv.ex | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/workers/export_csv.ex b/lib/workers/export_csv.ex index a992005cfd2b..67a6ffc0b74a 100644 --- a/lib/workers/export_csv.ex +++ b/lib/workers/export_csv.ex @@ -36,7 +36,7 @@ defmodule Plausible.Workers.ExportCSV do # NOTE: replace with proper Plausible.Email template Plausible.Mailer.deliver_now!( Bamboo.Email.new_email( - from: "plausible@email.com", + from: PlausibleWeb.Email.mailer_email_from(), to: email, subject: "EXPORT FAILURE", text_body: "there is nothing to export" @@ -63,7 +63,7 @@ defmodule Plausible.Workers.ExportCSV do # NOTE: replace with proper Plausible.Email template Plausible.Mailer.deliver_now!( Bamboo.Email.new_email( - from: "plausible@email.com", + from: PlausibleWeb.Email.mailer_email_from(), to: email, subject: "EXPORT SUCCESS", text_body: """ From f8867780910fa1cd9645f1685aad1ebaae17ea08 Mon Sep 17 00:00:00 2001 From: ruslandoga <67764432+ruslandoga@users.noreply.github.com> Date: Tue, 19 
Mar 2024 17:37:17 +0800 Subject: [PATCH 7/7] ensure we get dates from minmax date query --- lib/workers/export_csv.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/workers/export_csv.ex b/lib/workers/export_csv.ex index 67a6ffc0b74a..5b46d4fd5757 100644 --- a/lib/workers/export_csv.ex +++ b/lib/workers/export_csv.ex @@ -25,7 +25,7 @@ defmodule Plausible.Workers.ExportCSV do |> Keyword.replace!(:pool_size, 1) |> Ch.start_link() - %Ch.Result{rows: [[min_date, max_date]]} = + %Ch.Result{rows: [[%Date{} = min_date, %Date{} = max_date]]} = Ch.query!( ch, "SELECT toDate(min(timestamp)), toDate(max(timestamp)) FROM events_v2 WHERE site_id={site_id:UInt64}",