From c14dcb804173e680baef0054df39fd3c45623c25 Mon Sep 17 00:00:00 2001 From: Blayne Chard Date: Tue, 14 Jan 2025 10:36:28 +1300 Subject: [PATCH] feat: improved cloudwatch analytics rollup BM-1092 (#3381) #### Motivation We currently get 10s of millions of requests per day, which are all logged by cloudwatch into a s3 bucket, these logs are very large when looking over the entire history of basemaps, so we rollup the logs into hourly summaries of the requests. The current implementation rolls up the logs into coarse buckets these buckets give a good overview of usage but do not allow us to dig into more detailed usage trends such as which datasets are used in what formats. ### Modification Introduces a new analytic lambda to run side by side with the existing lambda until we are comfortable to turn off the old lambda. The logs are now rolled aggregated on more fields such as - tiles - zoom level - matrix set (and requested matrix set, /3857/ vs /WebMercatorQuad/ ) - extension, png vs pbf - tile set - api key and type --- package-lock.json | 288 +++++++++++++++++- .../_infra/src/analytics/edge.analytics.ts | 35 ++- packages/_infra/src/config.ts | 5 + .../lambda-analytic-cloudfront/CHANGELOG.md | 0 packages/lambda-analytic-cloudfront/README.md | 5 + .../lambda-analytic-cloudfront/package.json | 42 +++ .../src/__test__/analytics.test.ts | 133 ++++++++ .../src/__test__/log.data.ts | 15 + .../lambda-analytic-cloudfront/src/bin.ts | 10 + .../lambda-analytic-cloudfront/src/date.ts | 33 ++ .../lambda-analytic-cloudfront/src/elastic.ts | 91 ++++++ .../lambda-analytic-cloudfront/src/handler.ts | 156 ++++++++++ .../lambda-analytic-cloudfront/src/index.ts | 6 + .../src/log.reader.ts | 167 ++++++++++ .../src/log.stats.ts | 76 +++++ .../src/log/__test__/tile.url.test.ts | 53 ++++ .../src/log/query.ts | 31 ++ .../src/log/referer.ts | 29 ++ .../src/log/tile.url.ts | 87 ++++++ .../src/useragent/__test__/parser.test.ts | 195 ++++++++++++ .../src/useragent/agent.ts | 9 + .../src/useragent/agents/gis.ts | 85 ++++++ .../src/useragent/agents/programming.ts | 25 ++ .../src/useragent/parser.ts | 106 +++++++ .../src/useragent/parser.types.ts | 14 + .../lambda-analytic-cloudfront/tsconfig.json | 10 + .../lambda-analytic-cloudfront/typedoc.json | 4 + packages/shared/src/const.ts | 14 +- 28 files changed, 1705 insertions(+), 19 deletions(-) create mode 100644 packages/lambda-analytic-cloudfront/CHANGELOG.md create mode 100644 packages/lambda-analytic-cloudfront/README.md create mode 100644 packages/lambda-analytic-cloudfront/package.json create mode 100644 packages/lambda-analytic-cloudfront/src/__test__/analytics.test.ts create mode 100644 packages/lambda-analytic-cloudfront/src/__test__/log.data.ts create mode 100644 packages/lambda-analytic-cloudfront/src/bin.ts create mode 100644 packages/lambda-analytic-cloudfront/src/date.ts create mode 100644 packages/lambda-analytic-cloudfront/src/elastic.ts create mode 100644 packages/lambda-analytic-cloudfront/src/handler.ts create mode 100644 packages/lambda-analytic-cloudfront/src/index.ts create mode 100644 packages/lambda-analytic-cloudfront/src/log.reader.ts create mode 100644 packages/lambda-analytic-cloudfront/src/log.stats.ts create mode 100644 packages/lambda-analytic-cloudfront/src/log/__test__/tile.url.test.ts create mode 100644 packages/lambda-analytic-cloudfront/src/log/query.ts create mode 100644 packages/lambda-analytic-cloudfront/src/log/referer.ts create mode 100644 packages/lambda-analytic-cloudfront/src/log/tile.url.ts create mode 100644 packages/lambda-analytic-cloudfront/src/useragent/__test__/parser.test.ts create mode 100644 packages/lambda-analytic-cloudfront/src/useragent/agent.ts create mode 100644 packages/lambda-analytic-cloudfront/src/useragent/agents/gis.ts create mode 100644 packages/lambda-analytic-cloudfront/src/useragent/agents/programming.ts create mode 100644 packages/lambda-analytic-cloudfront/src/useragent/parser.ts create mode 100644 packages/lambda-analytic-cloudfront/src/useragent/parser.types.ts create mode 100644 packages/lambda-analytic-cloudfront/tsconfig.json create mode 100644 packages/lambda-analytic-cloudfront/typedoc.json diff --git a/package-lock.json b/package-lock.json index 0921bb1be..bbea109fa 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1427,6 +1427,10 @@ "resolved": "packages/_infra", "link": true }, + "node_modules/@basemaps/lambda-analytic-cloudfront": { + "resolved": "packages/lambda-analytic-cloudfront", + "link": true + }, "node_modules/@basemaps/lambda-analytics": { "resolved": "packages/lambda-analytics", "link": true @@ -1614,6 +1618,41 @@ "streamx": "^2.15.0" } }, + "node_modules/@elastic/elasticsearch": { + "version": "8.16.2", + "resolved": "https://registry.npmjs.org/@elastic/elasticsearch/-/elasticsearch-8.16.2.tgz", + "integrity": "sha512-2ivc6uS97fbEeW4tNtg5mvh/Jy82ZLfcwQ1HhNhdYxyapNnQxIgZ83Zd8Ir+5jCPMDWKSYgwDb8t4GAINDDv2w==", + "dependencies": { + "@elastic/transport": "^8.9.1", + "apache-arrow": "^18.0.0", + "tslib": "^2.4.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@elastic/transport": { + "version": "8.9.1", + "resolved": "https://registry.npmjs.org/@elastic/transport/-/transport-8.9.1.tgz", + "integrity": "sha512-jasKNQeOb1vNf9aEYg+8zXmetaFjApDTSCC4QTl6aTixvyiRiSLcCiB8P6Q0lY9JIII/BhqNl8WbpFnsKitntw==", + "dependencies": { + "@opentelemetry/api": "1.x", + "debug": "^4.3.4", + "hpagent": "^1.0.0", + "ms": "^2.1.3", + "secure-json-parse": "^2.4.0", + "tslib": "^2.4.0", + "undici": "^6.12.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@elastic/transport/node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==" + }, "node_modules/@emnapi/runtime": { "version": "0.44.0", "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-0.44.0.tgz", @@ -3949,6 +3988,14 @@ "@octokit/openapi-types": "^18.0.0" } }, + "node_modules/@opentelemetry/api": { + "version": "1.9.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz", + "integrity": "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==", + "engines": { + "node": ">=8.0.0" + } + }, "node_modules/@pkgjs/parseargs": { "version": "0.11.0", "resolved": "https://registry.npmjs.org/@pkgjs/parseargs/-/parseargs-0.11.0.tgz", @@ -5181,12 +5228,11 @@ } }, "node_modules/@swc/helpers": { - "version": "0.5.3", - "resolved": "https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.3.tgz", - "integrity": "sha512-FaruWX6KdudYloq1AHD/4nU+UsMTdNE8CKyrseXWEcgjDAbvkwJg2QGPAnfIJLIWsjZOSPLOAykK6fuYp4vp4A==", - "dev": true, + "version": "0.5.15", + "resolved": "https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.15.tgz", + "integrity": "sha512-JQ5TuMi45Owi4/BIMAJBoSQoOJu12oOk/gADqlcUL9JEdHB8vyjUSsxqeNXnmXHjYKMi2WcYtezGEEhqUI/E2g==", "dependencies": { - "tslib": "^2.4.0" + "tslib": "^2.8.0" } }, "node_modules/@szhsin/react-menu": { @@ -5277,6 +5323,16 @@ "integrity": "sha512-IWmFpqnVDvskYWnNSiu/qlRn80XlIOU0Gy5rKCl/NjhnI95pV8qIHs6L5b+bpHhyzuOSzjLgBcwgFSXrC1nZWA==", "dev": true }, + "node_modules/@types/command-line-args": { + "version": "5.2.3", + "resolved": "https://registry.npmjs.org/@types/command-line-args/-/command-line-args-5.2.3.tgz", + "integrity": "sha512-uv0aG6R0Y8WHZLTamZwtfsDLVRnOa+n+n5rEvFWL5Na5gZ8V2Teab/duDPFzIIIhs9qizDpcavCusCLJZu62Kw==" + }, + "node_modules/@types/command-line-usage": { + "version": "5.0.4", + "resolved": "https://registry.npmjs.org/@types/command-line-usage/-/command-line-usage-5.0.4.tgz", + "integrity": "sha512-BwR5KP3Es/CSht0xqBcUXS3qCAUVXwpRKsV2+arxeb65atasuXG9LykC9Ab10Cw3s2raH92ZqOeILaQbsB2ACg==" + }, "node_modules/@types/geojson": { "version": "7946.0.15", "resolved": "https://registry.npmjs.org/@types/geojson/-/geojson-7946.0.15.tgz", @@ -5319,7 +5375,6 @@ "version": "20.14.8", "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.8.tgz", "integrity": "sha512-DO+2/jZinXfROG7j7WKFn/3C6nFwxy2lLpgLjEXJz+0XKphZlTLJ14mo8Vfg8X5BWN6XjyESXq+LcYdT7tR3bA==", - "dev": true, "dependencies": { "undici-types": "~5.26.4" } @@ -5890,6 +5945,25 @@ "integrity": "sha512-rr+OQyAjxze7GgWrSaJwydHStIhHq2lvY3BOC2Mj7KnzI7XK0Uw1TOOdI9lDoajEbSWLiYgoo4f1R51erQfhPQ==", "license": "MIT" }, + "node_modules/apache-arrow": { + "version": "18.1.0", + "resolved": "https://registry.npmjs.org/apache-arrow/-/apache-arrow-18.1.0.tgz", + "integrity": "sha512-v/ShMp57iBnBp4lDgV8Jx3d3Q5/Hac25FWmQ98eMahUiHPXcvwIMKJD0hBIgclm/FCG+LwPkAKtkRO1O/W0YGg==", + "dependencies": { + "@swc/helpers": "^0.5.11", + "@types/command-line-args": "^5.2.3", + "@types/command-line-usage": "^5.0.4", + "@types/node": "^20.13.0", + "command-line-args": "^5.2.1", + "command-line-usage": "^7.0.1", + "flatbuffers": "^24.3.25", + "json-bignum": "^0.0.3", + "tslib": "^2.6.2" + }, + "bin": { + "arrow2csv": "bin/arrow2csv.js" + } + }, "node_modules/aproba": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/aproba/-/aproba-2.0.0.tgz", @@ -5924,6 +5998,14 @@ "sprintf-js": "~1.0.2" } }, + "node_modules/array-back": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/array-back/-/array-back-3.1.0.tgz", + "integrity": "sha512-TkuxA4UCOvxuDK6NZYXCalszEzj+TLszyASooky+i742l9TqsOdYCMJJupxRic61hwquNtppB3hgcuq9SVSH1Q==", + "engines": { + "node": ">=6" + } + }, "node_modules/array-buffer-byte-length": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/array-buffer-byte-length/-/array-buffer-byte-length-1.0.1.tgz", @@ -7000,6 +7082,35 @@ "url": "https://github.com/chalk/chalk?sponsor=1" } }, + "node_modules/chalk-template": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/chalk-template/-/chalk-template-0.4.0.tgz", + "integrity": "sha512-/ghrgmhfY8RaSdeo43hNXxpoHAtxdbskUHjPpfqUWGttFgycUhYPGx3YZBCnUCvOa7Doivn1IZec3DEGFoMgLg==", + "dependencies": { + "chalk": "^4.1.2" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/chalk-template?sponsor=1" + } + }, + "node_modules/chalk-template/node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, "node_modules/chardet": { "version": "0.7.0", "resolved": "https://registry.npmjs.org/chardet/-/chardet-0.7.0.tgz", @@ -7279,6 +7390,50 @@ "node": ">= 0.8" } }, + "node_modules/command-line-args": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/command-line-args/-/command-line-args-5.2.1.tgz", + "integrity": "sha512-H4UfQhZyakIjC74I9d34fGYDwk3XpSr17QhEd0Q3I9Xq1CETHo4Hcuo87WyWHpAF1aSLjLRf5lD9ZGX2qStUvg==", + "dependencies": { + "array-back": "^3.1.0", + "find-replace": "^3.0.0", + "lodash.camelcase": "^4.3.0", + "typical": "^4.0.0" + }, + "engines": { + "node": ">=4.0.0" + } + }, + "node_modules/command-line-usage": { + "version": "7.0.3", + "resolved": "https://registry.npmjs.org/command-line-usage/-/command-line-usage-7.0.3.tgz", + "integrity": "sha512-PqMLy5+YGwhMh1wS04mVG44oqDsgyLRSKJBdOo1bnYhMKBW65gZF1dRp2OZRhiTjgUHljy99qkO7bsctLaw35Q==", + "dependencies": { + "array-back": "^6.2.2", + "chalk-template": "^0.4.0", + "table-layout": "^4.1.0", + "typical": "^7.1.1" + }, + "engines": { + "node": ">=12.20.0" + } + }, + "node_modules/command-line-usage/node_modules/array-back": { + "version": "6.2.2", + "resolved": "https://registry.npmjs.org/array-back/-/array-back-6.2.2.tgz", + "integrity": "sha512-gUAZ7HPyb4SJczXAMUXMGAvI976JoK3qEx9v1FTmeYuJj0IBiaKttG1ydtGKdkfqWkIkouke7nG8ufGy77+Cvw==", + "engines": { + "node": ">=12.17" + } + }, + "node_modules/command-line-usage/node_modules/typical": { + "version": "7.3.0", + "resolved": "https://registry.npmjs.org/typical/-/typical-7.3.0.tgz", + "integrity": "sha512-ya4mg/30vm+DOWfBg4YK3j2WD6TWtRkCbasOJr40CseYENzCUby/7rIvXA99JGsQHeNxLbnXdyLLxKSv3tauFw==", + "engines": { + "node": ">=12.17" + } + }, "node_modules/commander": { "version": "2.20.3", "resolved": "https://registry.npmjs.org/commander/-/commander-2.20.3.tgz", @@ -9749,6 +9904,17 @@ "node": ">=14" } }, + "node_modules/find-replace": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/find-replace/-/find-replace-3.0.0.tgz", + "integrity": "sha512-6Tb2myMioCAgv5kfvP5/PkZZ/ntTpVK39fHY7WkWBgvbeE+VHd/tZuZ4mrC+bxh4cfOZeYKVPaJIZtZXV7GNCQ==", + "dependencies": { + "array-back": "^3.0.1" + }, + "engines": { + "node": ">=4.0.0" + } + }, "node_modules/find-root": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/find-root/-/find-root-1.1.0.tgz", @@ -9823,6 +9989,11 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/flatbuffers": { + "version": "24.3.25", + "resolved": "https://registry.npmjs.org/flatbuffers/-/flatbuffers-24.3.25.tgz", + "integrity": "sha512-3HDgPbgiwWMI9zVB7VYBHaMrbOO7Gm0v+yD2FV/sCKj+9NDeVL7BOBYUuhWAQGKWOzBo8S9WdMvV0eixO233XQ==" + }, "node_modules/flatted": { "version": "3.3.1", "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.3.1.tgz", @@ -11071,6 +11242,14 @@ "node": ">=10" } }, + "node_modules/hpagent": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/hpagent/-/hpagent-1.2.0.tgz", + "integrity": "sha512-A91dYTeIB6NoXG+PxTQpCCDDnfHsW9kc06Lvpu1TEe9gnd6ZFeiBoRO9JvzEv6xK7EX97/dUE8g/vBMTqTS3CA==", + "engines": { + "node": ">=14" + } + }, "node_modules/http-cache-semantics": { "version": "4.1.1", "resolved": "https://registry.npmjs.org/http-cache-semantics/-/http-cache-semantics-4.1.1.tgz", @@ -12140,6 +12319,14 @@ "dev": true, "license": "Python-2.0" }, + "node_modules/json-bignum": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/json-bignum/-/json-bignum-0.0.3.tgz", + "integrity": "sha512-2WHyXj3OfHSgNyuzDbSxI1w2jgw5gkWSWhS7Qg4bWXx1nLk3jnbwfUeS0PSba3IzpTUWdHxBieELUzXRjQB2zg==", + "engines": { + "node": ">=0.8" + } + }, "node_modules/json-buffer": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.0.tgz", @@ -12994,6 +13181,11 @@ "dev": true, "license": "MIT" }, + "node_modules/lodash.camelcase": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/lodash.camelcase/-/lodash.camelcase-4.3.0.tgz", + "integrity": "sha512-TwuEnCnxbc3rAvhf/LbG7tJUDzhqXyFnv3dtzLOPgCG/hODL7WFnsbwktkD7yUV0RrreP/l1PALq/YSg6VvjlA==" + }, "node_modules/lodash.get": { "version": "4.4.2", "resolved": "https://registry.npmjs.org/lodash.get/-/lodash.get-4.4.2.tgz", @@ -16824,8 +17016,7 @@ "node_modules/secure-json-parse": { "version": "2.7.0", "resolved": "https://registry.npmjs.org/secure-json-parse/-/secure-json-parse-2.7.0.tgz", - "integrity": "sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw==", - "dev": true + "integrity": "sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw==" }, "node_modules/semver": { "version": "7.5.3", @@ -17862,6 +18053,26 @@ "url": "https://opencollective.com/unts" } }, + "node_modules/table-layout": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/table-layout/-/table-layout-4.1.1.tgz", + "integrity": "sha512-iK5/YhZxq5GO5z8wb0bY1317uDF3Zjpha0QFFLA8/trAoiLbQD0HUbMesEaxyzUgDxi2QlcbM8IvqOlEjgoXBA==", + "dependencies": { + "array-back": "^6.2.2", + "wordwrapjs": "^5.1.0" + }, + "engines": { + "node": ">=12.17" + } + }, + "node_modules/table-layout/node_modules/array-back": { + "version": "6.2.2", + "resolved": "https://registry.npmjs.org/array-back/-/array-back-6.2.2.tgz", + "integrity": "sha512-gUAZ7HPyb4SJczXAMUXMGAvI976JoK3qEx9v1FTmeYuJj0IBiaKttG1ydtGKdkfqWkIkouke7nG8ufGy77+Cvw==", + "engines": { + "node": ">=12.17" + } + }, "node_modules/tar": { "version": "6.1.11", "resolved": "https://registry.npmjs.org/tar/-/tar-6.1.11.tgz", @@ -18137,9 +18348,9 @@ } }, "node_modules/tslib": { - "version": "2.6.2", - "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.2.tgz", - "integrity": "sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==" + "version": "2.8.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", + "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==" }, "node_modules/tuf-js": { "version": "1.1.7", @@ -18506,10 +18717,18 @@ "node": ">=14.17" } }, + "node_modules/typical": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/typical/-/typical-4.0.0.tgz", + "integrity": "sha512-VAH4IvQ7BDFYglMd7BPRDfLgxZZX4O4TFcRDA6EN5X7erNJJq+McIEp8np9aVtxrCJ6qx4GTYVfOWNjcqwZgRw==", + "engines": { + "node": ">=8" + } + }, "node_modules/ua-parser-js": { - "version": "1.0.37", - "resolved": "https://registry.npmjs.org/ua-parser-js/-/ua-parser-js-1.0.37.tgz", - "integrity": "sha512-bhTyI94tZofjo+Dn8SN6Zv8nBDvyXTymAdM3LDI/0IboIUwTu1rEhW7v2TfiVsoYWgkQ4kOVqnI8APUFbIQIFQ==", + "version": "1.0.39", + "resolved": "https://registry.npmjs.org/ua-parser-js/-/ua-parser-js-1.0.39.tgz", + "integrity": "sha512-k24RCVWlEcjkdOxYmVJgeD/0a1TiSpqLg+ZalVGV9lsnr4yqu0w7tX/x2xX6G4zpkgQnRf89lxuZ1wsbjXM8lw==", "funding": [ { "type": "opencollective", @@ -18524,6 +18743,9 @@ "url": "https://github.com/sponsors/faisalman" } ], + "bin": { + "ua-parser-js": "script/cli.js" + }, "engines": { "node": "*" } @@ -18569,11 +18791,18 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/undici": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici/-/undici-6.21.0.tgz", + "integrity": "sha512-BUgJXc752Kou3oOIuU1i+yZZypyZRqNPW0vqoMPl8VaoalSfeR0D8/t4iAS3yirs79SSMTxTag+ZC86uswv+Cw==", + "engines": { + "node": ">=18.17" + } + }, "node_modules/undici-types": { "version": "5.26.5", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", - "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", - "dev": true + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==" }, "node_modules/unique-filename": { "version": "2.0.1", @@ -18895,6 +19124,14 @@ "dev": true, "license": "MIT" }, + "node_modules/wordwrapjs": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/wordwrapjs/-/wordwrapjs-5.1.0.tgz", + "integrity": "sha512-JNjcULU2e4KJwUNv6CHgI46UvDGitb6dGryHajXTDiLgg1/RiGoPSDw4kZfYnwGtEXf2ZMeIewDQgFGzkCB2Sg==", + "engines": { + "node": ">=12.17" + } + }, "node_modules/wrap-ansi": { "version": "6.2.0", "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-6.2.0.tgz", @@ -19332,6 +19569,25 @@ "dev": true, "license": "MIT" }, + "packages/lambda-analytic-cloudfront": { + "name": "@basemaps/lambda-analytic-cloudfront", + "version": "7.11.0", + "license": "MIT", + "dependencies": { + "@basemaps/config": "^7.11.0", + "@basemaps/geo": "^7.11.0", + "@basemaps/shared": "^7.11.0", + "@elastic/elasticsearch": "^8.16.2", + "@linzjs/lambda": "^4.0.0", + "ua-parser-js": "^1.0.39" + }, + "devDependencies": { + "@types/ua-parser-js": "^0.7.36" + }, + "engines": { + "node": ">=16.0.0" + } + }, "packages/lambda-analytics": { "name": "@basemaps/lambda-analytics", "version": "7.12.0", diff --git a/packages/_infra/src/analytics/edge.analytics.ts b/packages/_infra/src/analytics/edge.analytics.ts index 61558f54c..977ba7182 100644 --- a/packages/_infra/src/analytics/edge.analytics.ts +++ b/packages/_infra/src/analytics/edge.analytics.ts @@ -7,7 +7,10 @@ import { RetentionDays } from 'aws-cdk-lib/aws-logs'; import { BlockPublicAccess, Bucket } from 'aws-cdk-lib/aws-s3'; import { Construct } from 'constructs'; -const CODE_PATH = '../lambda-analytics/dist'; +import { getConfig } from '../config.js'; + +const CodePath = '../lambda-analytics/dist'; +const CodePathV2 = '../lambda-analytics-cloudfront/dist'; export interface EdgeAnalyticsProps extends StackProps { distributionId: string; @@ -36,7 +39,7 @@ export class EdgeAnalytics extends Stack { memorySize: 2048, timeout: Duration.minutes(10), handler: 'index.handler', - code: lambda.Code.fromAsset(CODE_PATH), + code: lambda.Code.fromAsset(CodePath), environment: { [Env.Analytics.CloudFrontId]: distributionId, [Env.Analytics.CacheBucket]: `s3://${cacheBucket.bucketName}`, @@ -53,5 +56,33 @@ export class EdgeAnalytics extends Stack { // Run this lambda function every hour const rule = new Rule(this, 'AnalyticRule', { schedule: Schedule.rate(Duration.hours(1)) }); rule.addTarget(new LambdaFunction(this.lambda)); + + const v2Lambda = new lambda.Function(this, 'AnalyticV2Lambda', { + runtime: lambda.Runtime.NODEJS_LATEST, + memorySize: 2048, + timeout: Duration.minutes(10), + handler: 'index.handler', + code: lambda.Code.fromAsset(CodePathV2), + environment: { + [Env.Analytics.CloudFrontId]: distributionId, + [Env.Analytics.CacheBucket]: `s3://${cacheBucket.bucketName}`, + [Env.Analytics.CloudFrontSourceBucket]: `s3://${logBucket.bucketName}`, + [Env.Analytics.MaxRecords]: String(24 * 7 * 4), + [Env.Analytics.ElasticId]: Env.get(Env.Analytics.ElasticId) ?? '', + [Env.Analytics.ElasticApiKey]: Env.get(Env.Analytics.ElasticApiKey) ?? '', + [Env.Analytics.ElasticIndexName]: getConfig().ElasticHistoryIndexName, + AWS_NODEJS_CONNECTION_REUSE_ENABLED: '1', + }, + logRetention: RetentionDays.ONE_MONTH, + loggingFormat: lambda.LoggingFormat.JSON, + }); + + cacheBucket.grantReadWrite(v2Lambda); + logBucket.grantRead(v2Lambda); + + // Run this lambda function every hour + new Rule(this, 'AnalyticV2Rule', { schedule: Schedule.rate(Duration.hours(1)) }).addTarget( + new LambdaFunction(v2Lambda), + ); } } diff --git a/packages/_infra/src/config.ts b/packages/_infra/src/config.ts index c82ab41a6..91662fb55 100644 --- a/packages/_infra/src/config.ts +++ b/packages/_infra/src/config.ts @@ -12,6 +12,9 @@ export interface BaseMapsConfig { /** AWS role config bucket */ AwsRoleConfigBucket: string; + + /** Elastic Index to use for basemaps history data */ + ElasticHistoryIndexName: string; } export const BaseMapsProdConfig: BaseMapsConfig = { @@ -26,6 +29,7 @@ export const BaseMapsProdConfig: BaseMapsConfig = { CloudFrontDns: ['basemaps.linz.govt.nz', 'tiles.basemaps.linz.govt.nz'], PublicUrlBase: 'https://basemaps.linz.govt.nz', AwsRoleConfigBucket: 'linz-bucket-config', + ElasticHistoryIndexName: 'basemaps-history', }; export const BaseMapsDevConfig: BaseMapsConfig = { @@ -33,6 +37,7 @@ export const BaseMapsDevConfig: BaseMapsConfig = { CloudFrontDns: ['dev.basemaps.linz.govt.nz', 'tiles.dev.basemaps.linz.govt.nz'], PublicUrlBase: 'https://dev.basemaps.linz.govt.nz', AwsRoleConfigBucket: 'linz-bucket-config', + ElasticHistoryIndexName: 'nonprod-basemaps-history', }; /** Is this deployment intended for production */ export const IsProduction = process.env['NODE_ENV'] === 'production'; diff --git a/packages/lambda-analytic-cloudfront/CHANGELOG.md b/packages/lambda-analytic-cloudfront/CHANGELOG.md new file mode 100644 index 000000000..e69de29bb diff --git a/packages/lambda-analytic-cloudfront/README.md b/packages/lambda-analytic-cloudfront/README.md new file mode 100644 index 000000000..7626825f0 --- /dev/null +++ b/packages/lambda-analytic-cloudfront/README.md @@ -0,0 +1,5 @@ +# @basemaps/lambda-analytic-cloudfront + +Generate analytics from CloudFront distribution statistics + +Every hour this lambda function runs and generates a rolled up summary of usage by API Key diff --git a/packages/lambda-analytic-cloudfront/package.json b/packages/lambda-analytic-cloudfront/package.json new file mode 100644 index 000000000..6d9998240 --- /dev/null +++ b/packages/lambda-analytic-cloudfront/package.json @@ -0,0 +1,42 @@ +{ + "name": "@basemaps/lambda-analytic-cloudfront", + "version": "7.11.0", + "private": true, + "repository": { + "type": "git", + "url": "https://github.com/linz/basemaps.git", + "directory": "packages/lambda-analytic-cloudfront" + }, + "author": { + "name": "Land Information New Zealand", + "url": "https://linz.govt.nz", + "organization": true + }, + "type": "module", + "engines": { + "node": ">=16.0.0" + }, + "license": "MIT", + "dependencies": { + "@basemaps/config": "^7.11.0", + "@basemaps/geo": "^7.11.0", + "@basemaps/shared": "^7.11.0", + "@elastic/elasticsearch": "^8.16.2", + "@linzjs/lambda": "^4.0.0", + "ua-parser-js": "^1.0.39" + }, + "scripts": { + "test": "node --test", + "bundle": "../../scripts/bundle.mjs package.json" + }, + "devDependencies": { + "@types/ua-parser-js": "^0.7.36" + }, + "bundle": { + "entry": "src/index.ts", + "outdir": "dist/", + "external": [ + "pino-pretty" + ] + } +} diff --git a/packages/lambda-analytic-cloudfront/src/__test__/analytics.test.ts b/packages/lambda-analytic-cloudfront/src/__test__/analytics.test.ts new file mode 100644 index 000000000..f31da1ba8 --- /dev/null +++ b/packages/lambda-analytic-cloudfront/src/__test__/analytics.test.ts @@ -0,0 +1,133 @@ +import assert from 'node:assert'; +import { afterEach, beforeEach, describe, it, TestContext } from 'node:test'; +import { gzipSync } from 'node:zlib'; + +import { Env, fsa, FsMemory, LogConfig } from '@basemaps/shared'; +import { Client } from '@elastic/elasticsearch'; +import { LambdaRequest } from '@linzjs/lambda'; +import { Context } from 'aws-lambda'; + +import { getYesterday } from '../date.js'; +import { Elastic } from '../elastic.js'; +import { main } from '../handler.js'; +import { LogStats } from '../log.stats.js'; +import { LogData } from './log.data.js'; + +interface IndexOperation { + index: { _index: string }; +} +type BulkOperation = (IndexOperation | LogStats)[]; + +export class FakeLambdaRequest extends LambdaRequest { + constructor() { + super({}, {} as Context, LogConfig.get()); + } +} + +describe('analytic lambda', () => { + const memory = new FsMemory(); + beforeEach(() => { + fsa.register('mem://', memory); + memory.files.clear(); + + Elastic.indexDelay = 1; // do not wait between requests + Elastic.minRequestCount = 0; // index everything + Elastic._client = undefined; + LogConfig.get().level = 'silent'; + }); + + afterEach(() => { + LogConfig.get().level = 'info'; + }); + + function setupEnv(t: TestContext): void { + t.mock.method(Env, 'get', (key: string): string => { + switch (key) { + case Env.Analytics.CacheBucket: + return 'mem://cache/'; + case Env.Analytics.CloudFrontSourceBucket: + return 'mem://source/'; + case Env.Analytics.CloudFrontId: + return 'cfid'; + case Env.Analytics.MaxRecords: + return '1'; + } + throw new Error(`Invalid test process.env access ${key}`); + }); + } + + it('should process some log data', async (t) => { + setupEnv(t); + + const operations: BulkOperation[] = []; + Elastic._client = { + bulk(op: { operations: BulkOperation }) { + operations.push(op.operations); + return Promise.resolve({}); + }, + } as unknown as Client; + + const YesterDay = getYesterday(); + const shortDate = YesterDay.toISOString().slice(0, 13).replace('T', '-'); + + await fsa.write(new URL(`mem://source/cfid.${shortDate}/data.txt.gz`), gzipSync(LogData)); + + await main(new FakeLambdaRequest()); + + // One call to insert + assert.equal(operations.length, 1); + + const op = operations[0]; + + const indexOpt = op[0] as IndexOperation; + const logOpt = op[1] as LogStats; + + // First Log line: /v1/tiles/aerial/EPSG:3857/19/516588/320039.webp + assert.equal(indexOpt.index._index, 'basemaps-history-2020'); + assert.equal(logOpt.apiType, 'd'); + assert.equal(logOpt.tileMatrix, 'EPSG:3857'); + assert.equal(logOpt.tileMatrixId, 'WebMercatorQuad'); + assert.equal(logOpt.tileSet, 'aerial'); + assert.equal(logOpt.z, 19); + assert.equal(logOpt.cacheHit, 1); + assert.equal(logOpt.cacheMiss, 0); + assert.equal(logOpt.total, 1); + + assert.deepEqual(logOpt.ua, { os: 'linux', name: 'chrome', version: '85', variant: 'unknown' }); + + const files = [...memory.files.keys()]; + assert.equal(files.length, 2); // two files one input one output + + assert.equal( + files[1], + `mem://cache/RollUpV3/${shortDate.slice(0, 4)}/${shortDate.slice(5, 7)}/${shortDate}.ndjson.gz`, + ); + }); + + it('should write errors to storage', async (t) => { + setupEnv(t); + + Elastic._client = { + bulk() { + return Promise.resolve({ errors: ['Hello'] }); + }, + } as unknown as Client; + + const YesterDay = getYesterday(); + const shortDate = YesterDay.toISOString().slice(0, 13).replace('T', '-'); + + await fsa.write(new URL(`mem://source/cfid.${shortDate}/data.txt.gz`), gzipSync(LogData)); + + const ret = await main(new FakeLambdaRequest()).catch((e: Error) => e); + + assert.equal(String(ret), 'Error: Failed to index'); + + const files = [...memory.files.keys()]; + assert.equal(files.length, 2); // two files one input one output + + assert.ok(files[1].startsWith(`mem://cache/errors-${new Date().toISOString().slice(0, 12)}`)); + + const data = await fsa.read(new URL(files[1])); + assert.ok(data.toString().includes(JSON.stringify('Hello'))); + }); +}); diff --git a/packages/lambda-analytic-cloudfront/src/__test__/log.data.ts b/packages/lambda-analytic-cloudfront/src/__test__/log.data.ts new file mode 100644 index 000000000..db0a049da --- /dev/null +++ b/packages/lambda-analytic-cloudfront/src/__test__/log.data.ts @@ -0,0 +1,15 @@ +import { ulid } from 'ulid'; + +export const DevApiKey = 'd' + ulid().toLowerCase(); +export const ClientApiKey = 'c' + ulid().toLowerCase(); + +export const LogData = `#Version: 1.0 +#Fields: date time x-edge-location sc-bytes c-ip cs-method cs(Host) cs-uri-stem sc-status cs(Referer) cs(User-Agent) cs-uri-query cs(Cookie) x-edge-result-type x-edge-request-id x-host-header cs-protocol cs-bytes time-taken x-forwarded-for ssl-protocol ssl-cipher x-edge-response-result-type cs-protocol-version fle-status fle-encrypted-fields c-port time-to-first-byte x-edge-detailed-result-type sc-content-type sc-content-len sc-range-start sc-range-end +2020-07-28 01:11:25 AKL50-C1 20753 255.255.255.141 GET d1mez8rta20vo0.cloudfront.net /v1/tiles/aerial/EPSG:3857/19/516588/320039.webp 200 https://bar.com/ Mozilla/5.0%20(X11;%20Linux%20x86_64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/85.0.4183.101%20Safari/537.36 api=${DevApiKey} - Hit sBUoz03SwR_hVZkdj0LVC1s_bKakd9ONcKTYRrQLuIR3VPBQUx5xog== basemaps.linz.govt.nz https 82 0.049 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 -- 21780 0.049 Hit image/webp 20320 - - +2020-07-28 01:16:13 SYD1-C2 156474 255.255.255.4 GET d1mez8rta20vo0.cloudfront.net /v1/tiles/aerial/NZTM2000Quad/19/516542/319785.png 200 https://www.bar.com/ Mozilla/5.0%20(Macintosh;%20Intel%20Mac%20OS%20X%2010_15_4)%20AppleWebKit/605.1.15%20(KHTML,%20like%20Gecko)%20Version/13.1.2%20Safari/605.1.15 api=${DevApiKey}&foo=bar - Hit 9KNnEESjZA-yVs62ffwtRYNaa0gpYKLeEEHH490dmO7AAu3ZxnPc8Q== basemaps.linz.govt.nz https 77 1.791 - TLSv1.3 TLS_AES_128_GCM_SHA256 Hit HTTP/2.0 - - 19468 0.028 Hit image/png 155886 - - +2020-07-28 01:16:21 SYD1-C2 21223 255.255.255.73 GET d1mez8rta20vo0.cloudfront.net /v1/tiles/topo50/3857/18/257866/162011.jpeg 200 https://bar.com/map/ Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/85.0.4183.102%20Safari/537.36 api=${DevApiKey} - Miss a5nrTCsdsP5EDQ9EXkUQQJMCJTlbRUz5JIxowZ-1kRriRDUmLPxvVQ== basemaps.linz.govt.nz https 76 0.222 - TLSv1.3 TLS_AES_128_GCM_SHA256 Miss HTTP/2.0 - - 57799 0.222 Miss image/jpeg 20797 - - +2020-07-28 01:13:33 SYD4-C2 2588 255.255.255.128 GET d1mez8rta20vo0.cloudfront.net /v1/tiles/topo50/EPSG:3857/WMTSCapabilities.xml 200 - Mozilla/5.0%20QGIS/31006 api=${ClientApiKey} - RefreshHit oflBr-vO5caoVpi2S23hGh9YWMUca-McU_Fl5oN9fqW_H9ea_iS-Kg== basemaps.linz.govt.nz https 243 0.051 - TLSv1.2 ECDHE-RSA-AES128-GCM-SHA256 RefreshHit HTTP/1.1 - - 55515 0.050 RefreshHit text/xml - +2020-07-28 01:13:33 SYD4-C2 2588 255.255.255.128 GET d1mez8rta20vo0.cloudfront.net /v1/tiles/topo50/EPSG:2193/18/257866/162011.pbf 200 - Mozilla/5.0%20QGIS/31006 api=${ClientApiKey} - RefreshHit oflBr-vO5caoVpi2S23hGh9YWMUca-McU_Fl5oN9fqW_H9ea_iS-Kg== basemaps.linz.govt.nz https 243 0.051 - TLSv1.2 ECDHE-RSA-AES128-GCM-SHA256 RefreshHit HTTP/1.1 - - 55515 0.050 RefreshHit text/xml - +2020-07-28 01:13:33 SYD4-C2 2588 255.255.255.128 GET d1mez8rta20vo0.cloudfront.net /v1/tiles/antipodes-islands-satellite-2019-2020-0.5m/NZTM2000Quad/18/257866/162011.webp 200 - Mozilla/5.0%20QGIS/31006 api=${ClientApiKey} - RefreshHit oflBr-vO5caoVpi2S23hGh9YWMUca-McU_Fl5oN9fqW_H9ea_iS-Kg== basemaps.linz.govt.nz https 243 0.051 - TLSv1.2 ECDHE-RSA-AES128-GCM-SHA256 RefreshHit HTTP/1.1 - - 55515 0.050 RefreshHit text/xml - +2020-07-28 01:13:33 SYD4-C2 2588 255.255.255.128 GET d1mez8rta20vo0.cloudfront.net /v1/tiles/elevation/WebMercatorQuad/18/257866/162011.png 200 - Mozilla/5.0%20QGIS/31006 api=${ClientApiKey}&pipeline=terrain-rgb - RefreshHit oflBr-vO5caoVpi2S23hGh9YWMUca-McU_Fl5oN9fqW_H9ea_iS-Kg== basemaps.linz.govt.nz https 243 0.051 - TLSv1.2 ECDHE-RSA-AES128-GCM-SHA256 RefreshHit HTTP/1.1 - - 55515 0.050 RefreshHit text/xml - +`.trim(); diff --git a/packages/lambda-analytic-cloudfront/src/bin.ts b/packages/lambda-analytic-cloudfront/src/bin.ts new file mode 100644 index 000000000..88d358e35 --- /dev/null +++ b/packages/lambda-analytic-cloudfront/src/bin.ts @@ -0,0 +1,10 @@ +import { LogConfig } from '@basemaps/shared'; +import { LambdaRequest } from '@linzjs/lambda'; +import { Context } from 'aws-lambda'; + +import { main } from './handler.js'; + +/** + * Manually run the lambda function, this can be helpful for debugging the analytic roll up process + */ +main(new LambdaRequest(null, {} as Context, LogConfig.get())).catch((e) => console.error(e)); diff --git a/packages/lambda-analytic-cloudfront/src/date.ts b/packages/lambda-analytic-cloudfront/src/date.ts new file mode 100644 index 000000000..a262e8ad9 --- /dev/null +++ b/packages/lambda-analytic-cloudfront/src/date.ts @@ -0,0 +1,33 @@ +export function getYesterday(): Date { + // Process up to about a day ago + const maxDate = new Date(); + maxDate.setUTCMinutes(0); + maxDate.setUTCSeconds(0); + maxDate.setUTCMilliseconds(0); + maxDate.setUTCDate(maxDate.getUTCDate() - 1); + return maxDate; +} + +export function* byDay(startDate: Date, endDate: Date): Generator { + const currentDate = new Date(startDate); + currentDate.setUTCMinutes(0); + currentDate.setUTCSeconds(0); + currentDate.setUTCMilliseconds(0); + while (true) { + yield currentDate.toISOString().slice(0, 10); + currentDate.setUTCDate(currentDate.getUTCDate() - 1); + if (currentDate.getTime() < endDate.getTime()) break; + } +} + +export function* byMonth(startDate: Date, endDate: Date): Generator { + const currentDate = new Date(startDate); + currentDate.setUTCMinutes(0); + currentDate.setUTCSeconds(0); + currentDate.setUTCMilliseconds(0); + while (true) { + yield currentDate.toISOString().slice(0, 7); + currentDate.setUTCMonth(currentDate.getUTCMonth() - 1); + if (currentDate.getTime() < endDate.getTime()) break; + } +} diff --git a/packages/lambda-analytic-cloudfront/src/elastic.ts b/packages/lambda-analytic-cloudfront/src/elastic.ts new file mode 100644 index 000000000..5f877519d --- /dev/null +++ b/packages/lambda-analytic-cloudfront/src/elastic.ts @@ -0,0 +1,91 @@ +import { Env, LogType } from '@basemaps/shared'; +import { Client } from '@elastic/elasticsearch'; + +import { LogStats } from './log.stats.js'; + +export class ElasticClient { + _client: Client | undefined; + /** Between index requests delay this amount */ + indexDelay: number = 200; + + /** + * Do not index analytics for buckets that contain less than this number of total requests + * + * @example + * `1` - drop all requests where total requests <= 1 + * + */ + minRequestCount: number = 1; + + get indexName(): string { + const indexName = Env.get(Env.Analytics.ElasticIndexName); + if (indexName == null) throw new Error(`$${Env.Analytics.ElasticIndexName} is unset`); + return indexName; + } + + get client(): Client { + if (this._client != null) return this._client; + + const id = Env.get(Env.Analytics.ElasticId); + const apiKey = Env.get(Env.Analytics.ElasticApiKey); + if (id == null) throw new Error(`$${Env.Analytics.ElasticId} is unset`); + if (apiKey == null) throw new Error(`$${Env.Analytics.ElasticApiKey} is unset`); + this._client = new Client({ cloud: { id }, auth: { apiKey } }); + return this._client; + } + + errors: unknown[] = []; + insertQueue: Promise = Promise.resolve(); + + async insert(prefix: string, combined: Iterable, log: LogType): Promise { + this.insertQueue = this.insertQueue.then(() => this._doInsert(prefix, combined, log)); + return this.insertQueue; + } + + async _doInsert(prefix: string, combined: Iterable, log: LogType): Promise { + const client = this.client; + let inserts = 0; + let skipHits = 0; + let operations: unknown[] = []; + + const startTime = performance.now(); + + const errors = this.errors; + const indexDelay = this.indexDelay; + const indexName = this.indexName; + + async function doInsert(): Promise { + inserts += operations.length / 2; + log.trace({ prefix, records: operations.length / 2, skipHits, total: inserts }, 'log:ingest'); + const ret = await client.bulk({ operations }); + + if (ret.errors) { + errors.push({ prefix, errors: ret.errors }); + throw new Error('Failed to index: ' + prefix); + } + // Give it a little bit of time to index + await new Promise((r) => setTimeout(r, indexDelay)); + operations = []; + } + + for (const rec of combined) { + // skip over roll ups that are less than + if (rec.total <= this.minRequestCount) { + skipHits++; + continue; + } + operations.push({ index: { _index: indexName + '-' + rec['@timestamp'].slice(0, 4), _id: rec.id } }, rec); + if (operations.length > 50_000) await doInsert(); + } + + if (operations.length > 0) await doInsert(); + + if (inserts > 0) { + log.info({ prefix, skipHits, total: inserts, duration: performance.now() - startTime }, 'log:ingest'); + } else { + log.trace({ prefix }, 'log:ingest:skip'); + } + } +} + +export const Elastic = new ElasticClient(); diff --git a/packages/lambda-analytic-cloudfront/src/handler.ts b/packages/lambda-analytic-cloudfront/src/handler.ts new file mode 100644 index 000000000..1ad03d6b5 --- /dev/null +++ b/packages/lambda-analytic-cloudfront/src/handler.ts @@ -0,0 +1,156 @@ +import { promisify } from 'node:util'; +import { gzip } from 'node:zlib'; + +import { Env, fsa } from '@basemaps/shared'; +import { LambdaRequest } from '@linzjs/lambda'; +import pLimit from 'p-limit'; +import { basename } from 'path'; + +import { byDay, getYesterday } from './date.js'; +import { Elastic } from './elastic.js'; +import { FileProcess, toFullDate } from './log.reader.js'; +import { LogStats } from './log.stats.js'; + +const gzipPromise = promisify(gzip); + +const OldestDate = new Date('2020-01-01T00:00:00.000Z'); + +/** + * extract a environment variable and parse it as a URL + * + * @throws if the env var is unset or not a URL + * @param env ENV var to lookup + * @returns parsed url from the environment + */ +function getEnvUrl(env: string): URL { + const val = Env.get(env); + if (val == null) throw new Error(`$${env} is unset`); + try { + return fsa.toUrl(val); + } catch (e) { + throw new Error(`$${env} is not a url`); + } +} + +export async function main(req: LambdaRequest): Promise { + const SourceLocation = getEnvUrl(Env.Analytics.CloudFrontSourceBucket); + const CacheLocation = getEnvUrl(Env.Analytics.CacheBucket); + const CloudFrontId = Env.get(Env.Analytics.CloudFrontId); + + const MaxToProcess = Env.getNumber(Env.Analytics.MaxRecords, 24 * 7 * 4); // Process 4 weeks of logs by default + + req.log.info( + { source: SourceLocation.href, cacheLocation: CacheLocation.href, cloudFrontId: CloudFrontId }, + 'log:index:start', + ); + if (CloudFrontId == null) throw new Error(`Missing $${Env.Analytics.CloudFrontId}`); + + // Limit hours to be processed 5 at a time and log files to 5 at a time, which gives upto 25 logs files concurrency + // as often hours are skipped + const hourQ = pLimit(5); + const fileQ = pLimit(5); + + let processedCount = 0; + for (const prefixByDay of byDay(getYesterday(), OldestDate)) { + if (processedCount > MaxToProcess) break; + const todo = []; + + for (let hour = 23; hour >= 0; hour--) { + processedCount++; + if (processedCount > MaxToProcess) break; + const hourOfDay = String(hour).padStart(2, '0'); + const prefix = `${prefixByDay}-${hourOfDay}`; + + const targetDate = new Date(toFullDate(prefixByDay + 'T' + hourOfDay)); + const dateDiff = Date.now() - targetDate.getTime(); + // Do not process anything within a hour of the current time as some logs take a while to propagate into the bucket + if (dateDiff < 60 * 60 * 1000) continue; + + // Create a folder structure of /YYYY/MM/ + const cacheFolderParts = prefix.slice(0, 7).replace('-', '/'); + + const cacheUrl = new URL(`./RollUpV3/${cacheFolderParts}/${prefix}.ndjson.gz`, CacheLocation); + + const promise = hourQ(async () => { + // Cache file exists skip processing + if (await fsa.exists(cacheUrl)) { + req.log.debug({ prefix }, 'log:prefix:skip'); + return; + } + + const startTime = performance.now(); + req.log.trace({ prefix }, 'log:prefix:start'); + const logPrefix = new URL(`${CloudFrontId}.${prefix}`, SourceLocation); + + const stats = new Map(); + + const logFiles = await fsa.toArray(fsa.list(logPrefix)); + if (logFiles.length === 0) { + req.log.info({ prefix }, 'log:prefix:no-files'); + return; + } + + let lines = 0; + let fileCount = 0; + const filePromises = logFiles.map((lf) => { + return fileQ(async () => { + const fileStartTime = performance.now(); + + const fileLines = await FileProcess.process(lf, stats); + req.log.trace( + { + prefix: prefix, + file: basename(lf.pathname), + lines: fileLines, + remaining: logFiles.length - fileCount, + duration: performance.now() - fileStartTime, + }, + 'log:file:done', + ); + lines += fileLines; + fileCount++; + }); + }); + + // Process all the log files + await Promise.all(filePromises); + + // Extract the values + const allStats = [...stats.values()]; + await Elastic.insert(prefix, allStats, req.log); + // Ensure everything is indexed into elasticsearch before writing the cache to disk + await fsa.write(cacheUrl, await gzipPromise(JSON.stringify(allStats))); + + req.log.info( + { + prefix: prefix, + files: logFiles.length, + lines, + records: stats.size, + duration: performance.now() - startTime, + }, + 'log:prefix:done', + ); + }); + + todo.push(promise); + } + + const rets = await Promise.allSettled(todo); + + // If anything fails to index write the errors out to a log file at the cache location + if (Elastic.errors.length > 0) { + const errorLocation = new URL(`./errors-${new Date().toISOString()}.json`, CacheLocation); + req.log.fatal({ errorLocation: errorLocation.href }, 'log:index:failed'); + await fsa.write(errorLocation, JSON.stringify(Elastic.errors)); + } + + let failed = false; + for (const ret of rets) { + if (ret.status !== 'rejected') continue; + req.log.fatal({ err: ret.reason }, 'log:index:failed'); + failed = true; + } + if (failed) throw new Error('Failed to index'); + } +} diff --git a/packages/lambda-analytic-cloudfront/src/index.ts b/packages/lambda-analytic-cloudfront/src/index.ts new file mode 100644 index 000000000..8fe889cc7 --- /dev/null +++ b/packages/lambda-analytic-cloudfront/src/index.ts @@ -0,0 +1,6 @@ +import { LogConfig } from '@basemaps/shared'; +import { lf } from '@linzjs/lambda'; + +import { main } from './handler.js'; + +export const handler = lf.handler(main, { tracePercent: 0, rejectOnError: true }, LogConfig.get()); diff --git a/packages/lambda-analytic-cloudfront/src/log.reader.ts b/packages/lambda-analytic-cloudfront/src/log.reader.ts new file mode 100644 index 000000000..bc0e82c69 --- /dev/null +++ b/packages/lambda-analytic-cloudfront/src/log.reader.ts @@ -0,0 +1,167 @@ +import { createInterface } from 'node:readline/promises'; +import { createGunzip } from 'node:zlib'; + +import { sha256base58 } from '@basemaps/config'; +import { fsa } from '@chunkd/fs'; + +import { LogStats } from './log.stats.js'; +import { parseQueryString } from './log/query.js'; +import { getUrlHost } from './log/referer.js'; +import { parseTileUrl } from './log/tile.url.js'; +import { UaParser } from './useragent/agent.js'; + +/** +00 'date': '2017-02-09', +01 'time': '17:50:17', +02 'x-edge-location': 'MUC51', +03 'sc-bytes': '2797', // Number of bytes to viewer +04 'c-ip': '192.168.0.123', +05 'cs-method': 'GET', +06 'cs-host': 'yourdistribution.cloudfront.net', +07 'cs-uri-stem': '/', +08 'sc-status': '200', +09 'cs-referer': '-', +10 'cs-user-agent': 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)', +11 'cs-uri-query': '-', +12 'cs-cookie': '-', +13 'x-edge-result-type': 'Hit', +14 'x-edge-request-id': 'sjXpb8nMq_1ewovZ6nrojpvxIETPbo7EhF2RNtPZ_zfd0MtOW6pjlg==', +15 'x-host-header': 'example.com', +16 'cs-protocol': 'https', +17 'cs-bytes': '148', +18 'time-taken': '0.002', +19 'x-forwarded-for': '-', +20 'ssl-protocol': 'TLSv1.2', +21 'ssl-cipher': 'ECDHE-RSA-AES128-GCM-SHA256', +22 'x-edge-response-result-type': 'Hit', +23 'cs-protocol-version': 'HTTP/1.1' +*/ + +// tiles with full alpha or single solid color are approx these size +const EmptyTileSizes: Record = { + webp: 214, + png: 355, + jpeg: 650, +}; + +const IsoDateMonth = 7; // 2023-06 +const IsoDateDay = 10; // 2023-06-12 +const IsoDateHour = 13; // 2023-06-12:T01 + +/** + * Hide the full API key from the log analytics + */ +function hideApiKey(str: string): string { + if (str.startsWith('d')) return 'd' + str.slice(str.length - 6); + if (str.startsWith('c')) return 'c' + str.slice(str.length - 6); + return str; +} + +const empty: Record = { webp: 0, png: 0, jpeg: 0 }; +export function toFullDate(x: string): string { + if (x.length === IsoDateMonth) return `${x}-01T00:00:00.000Z`; + if (x.length === IsoDateDay) return `${x}T00:00:00.000Z`; + if (x.length === IsoDateHour) return `${x}:00:00.000Z`; + throw new Error('Unknown date:' + x); +} + +export const FileProcess = { + process(fileName: URL, stats: Map): Promise { + let count = 0; + const lineReader = createInterface({ input: fsa.readStream(fileName).pipe(createGunzip()), terminal: false }); + + function processLine(line: string): void { + if (line.startsWith('#')) return; + const lineData = line.split('\t'); + const status = Number(lineData[8]); + + // Ignore requests that were not actually served + if (status > 399) return; + if (status < 200) return; + // No data was served ignore! + if (status === 204) return; + + // Ignore files where no bytes were served + const bytes = Number(lineData[3]); + if (isNaN(bytes)) return; + + // Ignore anything that is not /v1/tiles + const url = lineData[7]; + if (!url.startsWith('/v1/tiles')) return; + + const date = lineData[0]; + const time = lineData[1]; + const dateTime = `${date}T${time}Z`; + + const contentLength = Number(lineData[30]); + + const { api, pipeline } = parseQueryString(lineData[11]); + + // Slice the request to the hour 2023-06-12T01 + const dateAggregate = dateTime.slice(0, IsoDateHour); + const hit = lineData[13] === 'Hit' || lineData[13] === 'RefreshHit'; + const referer = getUrlHost(lineData[9]); + + const userAgent = UaParser.parse(lineData[10]); + + const ret = parseTileUrl(status, url); + if (ret == null) return; // Couldn't parse tileInformation out!? + + // Aggregation date, api and referer + const trackId = [dateAggregate, api, referer]; + // Aggregate on useragent + if (userAgent) trackId.push(...Object.values(userAgent).map((m) => String(m))); + + let isEmpty = false; + trackId.push(ret.tileMatrix, ret.extension, String(ret.webMercatorZoom)); + if (pipeline) trackId.push(pipeline); + + // If the bytes served back to the user is low, it could be a empty tile + // compare it to known empty tile sizes + const emptyBytes = EmptyTileSizes[ret.extension]; + if (emptyBytes && contentLength === emptyBytes) { + empty[ret.extension]++; + isEmpty = true; + } + + const trackingId = trackId.join('_'); + let existing = stats.get(trackingId) as LogStats; + if (existing == null) { + existing = { + '@timestamp': toFullDate(dateAggregate), + api: hideApiKey(api), + apiType: api?.slice(0, 1), + tileMatrix: ret.tileMatrix, + tileMatrixId: ret.tileMatrixId, + tileSet: ret.tileSet, + z: ret.webMercatorZoom, + referer, + extension: ret?.extension, + ua: userAgent, + pipeline, + cacheHit: 0, + cacheMiss: 0, + total: 0, + bytes: 0, + empty: 0, + id: sha256base58(trackingId), + }; + stats.set(trackingId, existing); + } + + existing.bytes += bytes; + existing.total++; + if (isEmpty) existing.empty++; + if (hit) existing.cacheHit++; + else existing.cacheMiss++; + + count++; + } + + return new Promise((resolve, reject) => { + lineReader.on('error', (err) => reject(err)); + lineReader.on('close', () => resolve(count)); + lineReader.on('line', processLine); + }); + }, +}; diff --git a/packages/lambda-analytic-cloudfront/src/log.stats.ts b/packages/lambda-analytic-cloudfront/src/log.stats.ts new file mode 100644 index 000000000..17cb5c062 --- /dev/null +++ b/packages/lambda-analytic-cloudfront/src/log.stats.ts @@ -0,0 +1,76 @@ +import { UserAgentInfo } from './useragent/parser.types.js'; + +export interface LogStats { + '@timestamp': string; + + /** + * Cut down API key consisting of the first character and the last 6 characters + */ + api: string; + + /** + * API Key type "c" | "d" + */ + apiType: string; + + /** + * Raw tile matrix name from the URL + */ + tileMatrix: string; + + /** + * Actual tile matrix used + */ + tileMatrixId: 'NZTM2000Quad' | 'WebMercatorQuad' | 'NZTM2000'; + + /** + * Name of the tile set + * + * @example 'aerial' or 'topographic' + */ + tileSet: string; + + /** + * zoom level of the request served + */ + z: number; + + /** Host that referred the request */ + referer: string; + + /** Extension that was served */ + extension: string; + + /** User agent information */ + ua?: UserAgentInfo; + /** + * Rendering pipeline if used + * + * @example "rgba" or "color-ramp" + */ + pipeline?: string; + /** + * Number of hits that were cache hits + */ + cacheHit: number; + /** + * Number of hits that were cache misses + */ + cacheMiss: number; + /** + * Total number of requests + */ + total: number; + /** + * Total bytes served + */ + bytes: number; + /** + * Total number of tiles that were empty + */ + empty: number; + /** + * Unique ID for the tracking information + */ + id: string; +} diff --git a/packages/lambda-analytic-cloudfront/src/log/__test__/tile.url.test.ts b/packages/lambda-analytic-cloudfront/src/log/__test__/tile.url.test.ts new file mode 100644 index 000000000..86b2d2a56 --- /dev/null +++ b/packages/lambda-analytic-cloudfront/src/log/__test__/tile.url.test.ts @@ -0,0 +1,53 @@ +import assert from 'node:assert'; +import { describe, it } from 'node:test'; + +import { parseTileUrl } from '../tile.url.js'; + +describe('tile.url', () => { + it('should parse tile requests', () => { + assert.deepEqual(parseTileUrl(200, '/v1/tiles/aerial/NZTM2000Quad/16/32237/31326.jpeg'), { + extension: 'jpeg', + tileMatrix: 'NZTM2000Quad', + tileMatrixId: 'NZTM2000Quad', + tileSet: 'aerial', + webMercatorZoom: 18, + z: 16, + }); + + assert.deepEqual(parseTileUrl(200, '/v1/tiles/aerial/WebMercatorQuad/16/32237/31326.webp'), { + extension: 'webp', + tileMatrix: 'WebMercatorQuad', + tileMatrixId: 'WebMercatorQuad', + tileSet: 'aerial', + webMercatorZoom: 16, + z: 16, + }); + }); + + it('should parse tile matrix sets', () => { + assert.deepEqual(parseTileUrl(200, '/v1/tiles/aerial/3857/1/1/1.webp'), { + extension: 'webp', + tileMatrix: '3857', + tileMatrixId: 'WebMercatorQuad', + tileSet: 'aerial', + webMercatorZoom: 1, + z: 1, + }); + assert.deepEqual(parseTileUrl(200, '/v1/tiles/aerial/EPSG:3857/1/1/1.webp'), { + extension: 'webp', + tileMatrix: 'EPSG:3857', + tileMatrixId: 'WebMercatorQuad', + tileSet: 'aerial', + webMercatorZoom: 1, + z: 1, + }); + assert.deepEqual(parseTileUrl(200, '/v1/tiles/topographic/2193/1/1/1.pbf'), { + extension: 'pbf', + tileMatrix: '2193', + tileMatrixId: 'NZTM2000', + tileSet: 'topographic', + webMercatorZoom: 5, + z: 1, + }); + }); +}); diff --git a/packages/lambda-analytic-cloudfront/src/log/query.ts b/packages/lambda-analytic-cloudfront/src/log/query.ts new file mode 100644 index 000000000..9b1663f58 --- /dev/null +++ b/packages/lambda-analytic-cloudfront/src/log/query.ts @@ -0,0 +1,31 @@ +export interface QueryStringInfo { + api: string; + pipeline?: string; +} +function getQuery(str: string): QueryStringInfo { + const urlSearch = new URLSearchParams(str); + const api = _getApi(urlSearch); + const pipeline = urlSearch.get('pipeline') ?? undefined; + return { api, pipeline }; +} + +function _getApi(url: URLSearchParams): string { + const api = url.get('api') ?? ''; + // api keys are 27 chars starting with d or c + if (api.length !== 27) return 'invalid'; + if (api.startsWith('d')) return api; + if (api.startsWith('c')) return api; + return 'invalid'; +} +const QueryMap = new Map(); + +export function parseQueryString(str: string): QueryStringInfo { + let existing = QueryMap.get(str); + if (existing == null) { + existing = getQuery(str); + QueryMap.set(str, existing); + } + // This can get very very large so periodically clear it + if (QueryMap.size > 5_000_000) QueryMap.clear(); + return existing; +} diff --git a/packages/lambda-analytic-cloudfront/src/log/referer.ts b/packages/lambda-analytic-cloudfront/src/log/referer.ts new file mode 100644 index 000000000..fcf1cd6e9 --- /dev/null +++ b/packages/lambda-analytic-cloudfront/src/log/referer.ts @@ -0,0 +1,29 @@ +const hostCache = new Map(); + +export function getUrlHost(ref: string): string { + let existing = hostCache.get(ref); + if (existing == null) { + existing = _getUrlHost(ref); + hostCache.set(ref, existing); + } + return existing; +} +/** Extract the hostname from a url */ +export function _getUrlHost(ref: string): string { + if (ref == null) return 'unknown'; + if (ref === '-') return 'unknown'; + // console.log(ref) + + try { + const { hostname } = new URL(ref); + if (hostname == null) return ref; + if (hostname.startsWith('www.')) return hostname.slice(4); + return hostname; + } catch (e) { + if (!ref.startsWith('http')) return _getUrlHost('https://' + ref); + // Ignore invalid referer hostname + // eslint-disable-next-line no-console + console.log(ref); + } + return 'unknown'; +} diff --git a/packages/lambda-analytic-cloudfront/src/log/tile.url.ts b/packages/lambda-analytic-cloudfront/src/log/tile.url.ts new file mode 100644 index 000000000..04f871d79 --- /dev/null +++ b/packages/lambda-analytic-cloudfront/src/log/tile.url.ts @@ -0,0 +1,87 @@ +import { GoogleTms, TileMatrixSet, TileMatrixSets } from '@basemaps/geo'; + +function isValidExt(ext: string): boolean { + switch (ext) { + case 'webp': + case 'jpeg': + case 'png': + case 'avif': + // Vector + case 'pbf': + return true; + } + return false; +} + +const tileMatrixLookup = new Map(); + +// Validate +// - /v1/tiles/aerial/EPSG:2193/12.67876636397893/11737/18011.jpeg + +export interface TileUrlInfo { + extension: string; + + tileSet: string; + + /** + * Raw tile matrix used + * + * @example "3857" "EPSG:3857" + */ + tileMatrix: string; + /** + * Tile Matrix Used + * @example "NZTM2000Quad" + */ + tileMatrixId: 'NZTM2000Quad' | 'WebMercatorQuad' | 'NZTM2000'; + + /** Zoom used from in the tile matrix */ + z: number; + + /** closes zoom level in web mercator quad */ + webMercatorZoom: number; +} +export function parseTileUrl(status: number, url: string): TileUrlInfo | undefined { + if (!url.startsWith('/v1/tiles')) return; + if (status > 399) return; + + // /v1/tiles/topographic/EPSG:3857/tile.json + // /v1/tiles/topographic/EPSG:3857/style/topolite.json + const lastDot = url.lastIndexOf('.'); + if (lastDot === -1) return; // no extension ignore + + let ext = url.slice(lastDot + 1); + if (ext === 'jpg') ext = 'jpeg'; // standardise "jpg" into "jpeg" + const tileSetType = isValidExt(ext); + if (tileSetType == null) return; + + // /v1/tiles/:tileSet/:tileMatrixId/:z/:x/:y.:ext + const urlPart = url.split('/'); + const tileSet = urlPart[3]; + const tileMatrixPart = urlPart[4]; + + let tileMatrix = tileMatrixLookup.get(tileMatrixPart); + if (tileMatrix === undefined) { + tileMatrix = TileMatrixSets.find(tileMatrixPart); + tileMatrixLookup.set(tileMatrixPart, tileMatrix); + } + if (tileMatrix == null) return; // TileMatrix not found + + const z = Number.parseInt(urlPart[5]); + + // Check tile is in valid ranges + if (isNaN(z)) return; + if (z < 0) return; + + // Convert the zoom to webmercator zoom scales + const webMercatorZoom = TileMatrixSet.convertZoomLevel(z, tileMatrix, GoogleTms); + + return { + extension: ext, + tileSet, + tileMatrix: tileMatrixPart, + tileMatrixId: tileMatrix.identifier as TileUrlInfo['tileMatrixId'], + z, + webMercatorZoom, + }; +} diff --git a/packages/lambda-analytic-cloudfront/src/useragent/__test__/parser.test.ts b/packages/lambda-analytic-cloudfront/src/useragent/__test__/parser.test.ts new file mode 100644 index 000000000..61d631e0c --- /dev/null +++ b/packages/lambda-analytic-cloudfront/src/useragent/__test__/parser.test.ts @@ -0,0 +1,195 @@ +import assert from 'node:assert'; +import { describe, it } from 'node:test'; + +import { UaParser } from '../agent.js'; + +describe('UserAgents', () => { + it('should parse common browsers', () => { + assert.deepEqual( + UaParser.parse( + 'Mozilla/5.0%20(X11;%20Linux%20x86_64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/85.0.4183.101%20Safari/537.36', + ), + { name: 'chrome', os: 'linux', variant: 'unknown', version: '85' }, + ); + assert.deepEqual( + UaParser.parse( + `Mozilla/5.0 (Linux; U; Android 2.2.1; de-de; LG-P350 Build/FRG83) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 MMS/LG-Android-MMS-V1.0/1.2`, + ), + { name: 'androidbrowser', os: 'android', variant: 'unknown', version: '4' }, + ); + assert.deepEqual( + UaParser.parse('Mozilla/5.0%20(Windows%20NT%2010.0;%20WOW64;%20rv:48.0)%20Gecko/20100101%20Firefox/48.0'), + { name: 'firefox', os: 'unknown', variant: 'unknown', version: '48' }, + ); + assert.deepEqual( + UaParser.parse( + 'Mozilla/5.0%20(Linux;%20Android%2011;%20SM-A025F)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/96.0.4664.104%20Mobile%20Safari/537.36', + ), + { name: 'chrome', os: 'android', variant: 'unknown', version: '96' }, + ); + + assert.deepEqual( + UaParser.parse( + 'Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/98.0.4758.80%20Safari/537.36%20Edg/98.0.1108.43', + ), + { name: 'edge', os: 'unknown', variant: 'unknown', version: '98' }, + ); + }); + + it('should parse qgis', () => { + assert.deepEqual(UaParser.parse('QGIS/31613'), { + name: 'qgis', + version: '3.16', + os: 'unknown', + variant: 'unknown', + }); + assert.deepEqual(UaParser.parse('Mozilla/5.0%20QGIS/31006'), { + name: 'qgis', + version: '3.10', + os: 'unknown', + variant: 'unknown', + }); + assert.deepEqual(UaParser.parse('Mozilla/5.0%20QGIS/31607'), { + name: 'qgis', + version: '3.16', + os: 'unknown', + variant: 'unknown', + }); + assert.deepEqual(UaParser.parse('Mozilla/5.0%20QGIS/32402/macOS%2012.4'), { + name: 'qgis', + version: '3.24', + os: 'macos', + variant: 'unknown', + }); + assert.deepEqual(UaParser.parse('Mozilla/5.0%20QGIS/2.14.9-Essen'), { + name: 'qgis', + version: '2.14', + os: 'unknown', + variant: 'unknown', + }); + assert.deepEqual(UaParser.parse('Mozilla/5.0%20QGIS/32601/Windows%2010%20Version%202009'), { + name: 'qgis', + version: '3.26', + os: 'windows', + variant: 'unknown', + }); + assert.deepEqual(UaParser.parse('Mozilla/5.0%20QGIS/3.10.1-A%20Coru%C3%B1a'), { + name: 'qgis', + version: '3.10', + os: 'unknown', + variant: 'unknown', + }); + }); + + it('should parse ArcGIS', () => { + assert.deepEqual(UaParser.parse('ArcGIS%20Pro%202.7.3%20(00000000000)%20-%20ArcGISPro'), { + name: 'arcgis', + os: 'windows', + variant: 'pro', + version: '2.7', + }); + assert.deepEqual(UaParser.parse('ArcGIS%20Pro%203.0.0%20(00000000000)%20-%20ArcGISPro'), { + name: 'arcgis', + os: 'windows', + variant: 'pro', + version: '3.0', + }); + assert.deepEqual(UaParser.parse('ArcGIS%20Pro%202.9.3%20(00000000000)%20-%20ArcGISPro'), { + name: 'arcgis', + os: 'windows', + variant: 'pro', + version: '2.9', + }); + assert.deepEqual(UaParser.parse('ArcGIS%20Pro%202.9.2%20(00000000000)%20-%20ArcGISPro'), { + name: 'arcgis', + os: 'windows', + variant: 'pro', + version: '2.9', + }); + assert.deepEqual(UaParser.parse('ArcGIS%20Pro%202.8.0%20(00000000000)%20-%20ArcGISPro'), { + name: 'arcgis', + os: 'windows', + variant: 'pro', + version: '2.8', + }); + }); + + it('should handle software', () => { + assert.deepEqual(UaParser.parse('python-requests/2.23.0'), { + name: 'python', + os: 'unknown', + variant: 'requests', + version: '2.23.0', + }); + assert.deepEqual(UaParser.parse('MapProxy-1.12.0'), { + name: 'map-proxy', + os: 'unknown', + variant: 'unknown', + version: '1.12', + }); + assert.deepEqual(UaParser.parse('MapProxy-1.13.2'), { + name: 'map-proxy', + os: 'unknown', + variant: 'unknown', + version: '1.13', + }); + assert.deepEqual(UaParser.parse('okhttp/3.12.3'), { + name: 'okhttp', + os: 'unknown', + variant: 'unknown', + version: '3.12', + }); + assert.deepEqual(UaParser.parse('axios/0.21.1'), { + name: 'axios', + os: 'unknown', + variant: 'unknown', + version: '0.21', + }); + assert.deepEqual(UaParser.parse('Dart/2.16 (dart:io) '), { + name: 'dart', + os: 'unknown', + variant: 'unknown', + version: '2.16', + }); + assert.deepEqual(UaParser.parse('Apache-HttpClient/4.5.13'), { + name: 'apache', + os: 'unknown', + variant: 'http', + version: '4.5', + }); + }); + + it('should parse gis software', () => { + // assert.deepEqual( + // UaParser.parse('MapFishPrint/3.29.2%20Apache-HttpClient/4.5.13%20(Java/1.8.0_312)'), + // 'mapfishprint_3.29', + // ); + assert.deepEqual( + UaParser.parse( + 'FME/2022.7.43.22343%20%20libcurl/7.79.1%20(OpenSSL/1.1.1n)%20Schannel%20zlib/1.2.11%20WinIDN%20libssh2/1.10.0%20nghttp2/1.44.0', + ), + { name: 'fme', os: 'unknown', variant: 'unknown', version: '2022.7' }, + ); + + assert.deepEqual(UaParser.parse('JOSM/1.5 (18513 en) Windows 10 64-Bit Java/11.0.15'), { + name: 'josm', + os: 'unknown', + variant: 'unknown', + version: '1.5', + }); + + assert.deepEqual(UaParser.parse('GDAL WMS driver (http://www.gdal.org/frmt_wms.html)'), { + name: 'gdal', + os: 'unknown', + variant: 'wms', + version: 'unknown', + }); + + assert.deepEqual(UaParser.parse('MapInfoPro/21.0.0.0172 (MapInfoPro.exe) '), { + name: 'unknown', + os: 'unknown', + variant: 'unknown', + version: 'unknown', + }); + }); +}); diff --git a/packages/lambda-analytic-cloudfront/src/useragent/agent.ts b/packages/lambda-analytic-cloudfront/src/useragent/agent.ts new file mode 100644 index 000000000..4e67400b1 --- /dev/null +++ b/packages/lambda-analytic-cloudfront/src/useragent/agent.ts @@ -0,0 +1,9 @@ +import { Gis } from './agents/gis.js'; +import { Bot, Programming } from './agents/programming.js'; +import { UserAgentParsers } from './parser.js'; + +export const UaParser = new UserAgentParsers(); + +Object.entries(Programming).forEach(([key, create]) => UaParser.addParser(key, create)); +Object.entries(Gis).forEach(([key, create]) => UaParser.addParser(key, create)); +Object.entries(Bot).forEach(([key, create]) => UaParser.addParser(key, create)); diff --git a/packages/lambda-analytic-cloudfront/src/useragent/agents/gis.ts b/packages/lambda-analytic-cloudfront/src/useragent/agents/gis.ts new file mode 100644 index 000000000..243bbe128 --- /dev/null +++ b/packages/lambda-analytic-cloudfront/src/useragent/agents/gis.ts @@ -0,0 +1,85 @@ +import { isValidOs, UserAgentOs, UserAgentParser } from '../parser.types.js'; + +// Mozilla/5.0 QGIS/32400/Windows 10 Version 2009 +function guessQgisOs(ua: string): UserAgentOs | undefined { + if (ua.includes('/windows')) return 'windows'; + if (ua.includes('/mac')) return 'macos'; +} + +const ArcGis: Record = { + // ArcGISRuntime-NET/100.11.2 (Windows..... + // ArcGISRuntime-Qt/100.10 ... + 'ArcGISRuntime-': (ua: string) => { + const chunks = ua.split('/'); + const variant = chunks[0].slice('ArcGISRuntime-'.length).toLowerCase(); + const version = chunks[1].slice(0, chunks[1].indexOf('.')); + const os = chunks[1].split(' ')[1].slice(1); + if (isValidOs(os)) return { name: 'arcgis', variant, version, os }; + return { name: 'arcgis', variant, version }; + }, + + // ArcGIS Pro 2.7.3 (00000000000) - ArcGISPro + // ArcGIS Pro 3.0.0 (00000000000) - ArcGISPro + 'ArcGIS Pro': (ua: string) => { + return { + name: 'arcgis', + variant: 'pro', + version: ua.slice('ArcGIS Pro'.length, ua.lastIndexOf('.')).trim(), + os: 'windows', + }; // assume arcgis is windows + }, + + 'ArcGIS Client': () => { + return { name: 'arcgis', variant: 'client' }; + }, +}; + +function guessJosmOs(ua: string): { os: 'linux' } | undefined { + if (ua.includes('linux')) return { os: 'linux' }; + return; +} + +export const Gis: Record = { + ...ArcGis, + // QGIS/31613 + 'QGIS/': (ua) => { + const qgisVersion = Number(ua.split('/')[1]); + if (isNaN(qgisVersion)) return { name: 'qgis', os: guessQgisOs(ua) }; + return { name: 'qgis', version: (qgisVersion / 10000).toFixed(2), os: guessQgisOs(ua) }; + }, + 'Mozilla/5.0 QGIS/': (ua) => { + const chunk = ua.slice('Mozilla/5.0 QGIS/'.length).split('/')[0]; + if (chunk == null) return { name: 'qgis', os: guessQgisOs(ua) }; + // Mozilla/5.0 QGIS/2.18.22 + // Mozilla/5.0 QGIS/2.14.9-Essen + if (chunk.includes('.')) + return { name: 'qgis', version: chunk.slice(0, chunk.lastIndexOf('.')), os: guessQgisOs(ua) }; + + // Mozilla/5.0 QGIS/31400 + // Mozilla/5.0 QGIS/32400/Windows 10 Version 2009 + const qgisVersion = Number(chunk); + if (isNaN(qgisVersion)) return { name: 'qgis', os: guessQgisOs(ua) }; + return { name: 'qgis', version: `${(qgisVersion / 10000).toFixed(2)}`, os: guessQgisOs(ua) }; + }, + + // FME/2022.7.43.22343 libcurl/7.79.1 (OpenSSL/1.1.1n) Schannel zlib/1.2.11 WinIDN libssh2/1.10.0 nghttp2/1.44.0 + 'FME/': (ua) => { + return { name: 'fme', version: ua.slice('FME/'.length, ua.indexOf('.', 9)) }; + }, + + // GDAL WMS driver (http://www.gdal.org/frmt_wms.html) + 'GDAL WMS': () => { + return { name: 'gdal', variant: 'wms' }; + }, + + // MapProxy-1.12.0 + 'MapProxy-': (ua) => { + return { name: 'map-proxy', version: ua.slice('MapProxy-'.length, ua.lastIndexOf('.')) }; + }, + + // JOSM/1.5 (18700 en) Linux Freedesktop.org SDK 22.08 (Flatpak runtime) Java/17.0.6 29147 + // JOSM/1.5 (18700 en) Linux Mint 20.3 Java/17.0.5 + 'JOSM/': (ua) => { + return { name: 'josm', version: ua.slice('JOSM/'.length, ua.indexOf(' ')), ...guessJosmOs(ua) }; + }, +}; diff --git a/packages/lambda-analytic-cloudfront/src/useragent/agents/programming.ts b/packages/lambda-analytic-cloudfront/src/useragent/agents/programming.ts new file mode 100644 index 000000000..d85ce05a8 --- /dev/null +++ b/packages/lambda-analytic-cloudfront/src/useragent/agents/programming.ts @@ -0,0 +1,25 @@ +import { UserAgentParser } from '../parser.types.js'; + +// Programming languages internal http clients +export const Programming: Record = { + 'python-requests/': (ua) => ({ name: 'python', variant: 'requests', version: ua.slice('python-requests/'.length) }), + 'python-urllib/': (ua) => ({ name: 'python', variant: 'urllib', version: ua.slice('python-urllib/'.length) }), + 'python/': (ua) => ({ name: 'python', version: ua.slice('python/'.length, ua.lastIndexOf('.')) }), + 'java/': (ua) => ({ name: 'java', version: ua.slice('java/'.length, ua.lastIndexOf('.')) }), + 'axios/': (ua) => ({ name: 'axios', version: ua.slice('axios/'.length, ua.lastIndexOf('.')).replace('/', '_') }), + 'okhttp/': (ua) => ({ name: 'okhttp', version: ua.slice('okhttp/'.length, ua.lastIndexOf('.')).replace('/', '_') }), + 'Go-http-client/': (ua) => ({ name: 'go', variant: 'http', version: ua.slice('Go-http-client/'.length) }), + 'Dart/': (ua) => ({ name: 'dart', version: ua.split(' ')[0].slice('Dart/'.length) }), + 'Apache-HttpClient/': (ua) => ({ + name: 'apache', + variant: 'http', + version: ua.slice('Apache-HttpClient/'.length, ua.lastIndexOf('.')), + }), + flutter_: () => ({ name: 'flutter' }), +}; + +// Bots +export const Bot: Record = { + 'Googlebot-Image/': (ua) => ({ name: 'bot', variant: 'google', version: ua.split('/').at(1) }), + 'AdsBot-Google': () => ({ name: 'bot', variant: 'google' }), +}; diff --git a/packages/lambda-analytic-cloudfront/src/useragent/parser.ts b/packages/lambda-analytic-cloudfront/src/useragent/parser.ts new file mode 100644 index 000000000..b53cd56e6 --- /dev/null +++ b/packages/lambda-analytic-cloudfront/src/useragent/parser.ts @@ -0,0 +1,106 @@ +import UA from 'ua-parser-js'; + +import { UserAgentInfo, UserAgentOs, UserAgentParser } from './parser.types.js'; + +const OsMap: Record = { ubuntu: 'linux' }; +const UaParser: ParserConfig = { name: 'ua-parser-js', hit: 0 }; +const Skipped: ParserConfig = { name: 'skipped', hit: 0 }; +interface ParserConfig { + name: string; + hit: number; + create?: UserAgentParser; +} +interface ParserCache { + hit: number; + parser: ParserConfig; + value?: UserAgentInfo; +} + +export class UserAgentParsers { + parsers = new Map(); + cache = new Map(); + + addParser(value: string, create: UserAgentParser): void { + const char = value[0].toLowerCase(); + const parser = this.parsers.get(char) ?? []; + parser.push({ name: value.toLowerCase(), create, hit: 0 }); + this.parsers.set(char, parser); + } + + parse(userAgent: string): UserAgentInfo | undefined { + const existing = this.cache.get(userAgent); + if (existing) { + existing.hit++; + existing.parser.hit++; + return existing.value; + } + const ret = this._parse(userAgent); + if (ret.value) { + if (ret.value.version == null) ret.value.version = 'unknown'; + if (ret.value.variant == null) ret.value.variant = 'unknown'; + if (ret.value.os == null) ret.value.os = 'unknown'; + } + ret.hit++; + ret.parser.hit++; + this.cache.set(userAgent, ret); + return ret.value; + } + + _parse(userAgent: string | undefined): ParserCache { + if (userAgent == null || userAgent === '' || userAgent === '-' || userAgent === 'Mozilla/5.0') { + return { value: { name: 'empty' }, parser: Skipped, hit: 0 }; + } + const parsedName = decodeURI(userAgent); + const lowered = parsedName.toLowerCase(); + + // Is there a custom parser for the user agent string + const parsers = this.parsers.get(lowered[0]) ?? []; + for (const parser of parsers) { + if (lowered.startsWith(parser.name)) return { value: parser.create?.(lowered), parser, hit: 0 }; + } + + // No custom parser attempt to pull the information from a generic user agent parser + const ua = UA(userAgent); + const output: Partial = {}; + + if (ua.os.name) { + output.os = ua.os.name.toLowerCase().replace(/ /g, '') as UserAgentOs; + if (OsMap[output.os]) output.os = OsMap[output.os]; + } + + if (ua.browser.name) { + output.name = ua.browser.name.replace(/ /g, '').toLowerCase(); + if (ua.browser.version) output.version = ua.browser.version.slice(0, ua.browser.version.indexOf('.')); + return { value: output as UserAgentInfo, parser: UaParser, hit: 0 }; + } + + if (ua.os.name === 'Android') { + const slashIndex = parsedName.indexOf('/'); + if (slashIndex > -1) { + output.name = 'android'; + output.variant = lowered.slice(0, slashIndex); + return { value: output as UserAgentInfo, parser: UaParser, hit: 0 }; + } + } + // IOS apps + // Tracks%2520NZ/1 CFNetwork/1335.0.3 Darwin/21.6.0' + // com.spatialnetworks.fulcrum/4.0.1%20iPhone/16.4.1%20hw/iPhone13_2 + if ( + ua.os.name === 'iOS' || + lowered.includes('iphone/') || + lowered.includes('ios/') || + lowered.includes('ipad/') || + lowered.includes('ios simulator/') + ) { + const slashIndex = parsedName.indexOf('/'); + if (slashIndex > -1) { + output.os = output.os ?? 'ios'; + output.name = 'ios'; + output.variant = decodeURIComponent(lowered.slice(0, slashIndex)).replace(/ /g, ''); + return { value: output as UserAgentInfo, parser: UaParser, hit: 0 }; + } + } + + return { value: { name: 'unknown' }, parser: Skipped, hit: 0 }; + } +} diff --git a/packages/lambda-analytic-cloudfront/src/useragent/parser.types.ts b/packages/lambda-analytic-cloudfront/src/useragent/parser.types.ts new file mode 100644 index 000000000..262ac81bc --- /dev/null +++ b/packages/lambda-analytic-cloudfront/src/useragent/parser.types.ts @@ -0,0 +1,14 @@ +export type UserAgentParser = (ua: string) => UserAgentInfo | undefined; +export interface UserAgentInfo { + name: string; + variant?: string; + version?: string; + os?: UserAgentOs; +} + +export type UserAgentOs = 'windows' | 'macos' | 'ios' | 'android' | 'linux' | 'unknown'; +export const ValidOs = new Set(['windows', 'macos', 'ios', 'android', 'linux']); + +export function isValidOs(os: string): os is UserAgentOs { + return ValidOs.has(os); +} diff --git a/packages/lambda-analytic-cloudfront/tsconfig.json b/packages/lambda-analytic-cloudfront/tsconfig.json new file mode 100644 index 000000000..cdf874685 --- /dev/null +++ b/packages/lambda-analytic-cloudfront/tsconfig.json @@ -0,0 +1,10 @@ +{ + "extends": "../../tsconfig.base.json", + + "compilerOptions": { + "outDir": "./build", + "rootDir": "./src" + }, + "include": ["src"], + "references": [{ "path": "../config" }, { "path": "../geo" }, { "path": "../shared" }] +} diff --git a/packages/lambda-analytic-cloudfront/typedoc.json b/packages/lambda-analytic-cloudfront/typedoc.json new file mode 100644 index 000000000..6bf46780c --- /dev/null +++ b/packages/lambda-analytic-cloudfront/typedoc.json @@ -0,0 +1,4 @@ +{ + "entryPoints": ["./src/**/*.ts"], + "exclude": ["./**/__tests__/*.ts"] +} diff --git a/packages/shared/src/const.ts b/packages/shared/src/const.ts index 052bb2046..1f3f35a9d 100644 --- a/packages/shared/src/const.ts +++ b/packages/shared/src/const.ts @@ -53,8 +53,20 @@ export const Env = { Analytics: { CloudFrontId: 'ANALYTICS_CLOUD_FRONT_ID', CloudFrontSourceBucket: 'ANALYTICS_CLOUD_FRONT_SOURCE_BUCKET', + + /** Where to store the analytic cache data */ CacheBucket: 'ANALYTICS_CACHE_BUCKET', - }, + + /** Max number of records to process in the analytics process */ + MaxRecords: 'ANALYTICS_MAX_RECORDS', + + /** Elastic server Id */ + ElasticId: 'ELASTIC_ID', + /** ElasticSearch's API key */ + ElasticApiKey: 'ELASTIC_API_KEY', + /** Index to use for storing analytic data */ + ElasticIndexName: 'ELASTIC_INDEX_NAME', + } as const, /** Load a environment var defaulting to defaultOutput if it does not exist */ get(envName: string): string | undefined {