From 4361e5d4e142882e15d6506d6ec7ee05ac5fb971 Mon Sep 17 00:00:00 2001 From: "d.loavid-loe" Date: Thu, 7 Dec 2023 13:11:20 +0100 Subject: [PATCH] rewrite parser --- .gitignore | 1 - .vscode/settings.json | 3 + backend/data/parser.ts | 171 ++++++++++++++++++++++++++++++------- backend/data/run-parser.ts | 4 + backend/db.ts | 91 +++++--------------- backend/package.json | 4 +- lump-sum-update.md | 2 +- 7 files changed, 171 insertions(+), 105 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 backend/data/run-parser.ts diff --git a/.gitignore b/.gitignore index 600d2d33..e69de29b 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +0,0 @@ -.vscode \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..22d90404 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "typescript.tsdk": "backend/node_modules/typescript/lib" +} diff --git a/backend/data/parser.ts b/backend/data/parser.ts index ba145b3d..0930275c 100644 --- a/backend/data/parser.ts +++ b/backend/data/parser.ts @@ -1,13 +1,47 @@ import fs from 'fs' +import { CountryLumpSum } from '../../common/types.js' +import Country from '../models/country.js' -export function loadLumpSums() { - const lumpSums: { validFrom: Date; data: any }[] = [] - fs.readdirSync('./data').forEach((file) => { +interface RawLumpSum { + country: string + catering8: string + catering24: string + overnight: string +} + +interface RawLumpSumWithCities extends RawLumpSum { + spezials?: { + city: string + catering8: string + catering24: string + overnight: string + }[] +} + +export type LumpSumsJSON = { data: LumpSumWithCountryCode[]; validFrom: number }[] +type LumpSumWithCountryCode = Omit & { countryCode: string } +type LumpSumWithCountryName = Omit & { country: string } + +function isRawLumpSum(data: any): data is RawLumpSum { + return typeof data.country === 'string' +} + +function assertAreRawLumpSums(data: any[]): asserts data is RawLumpSum[] { + for (const item of data) { + if (!isRawLumpSum(item)) { + throw TypeError('Raw lump sums are of wrong type: ' + item) + } + } +} + +export async function parseLumpSumsFiles() { + const lumpSums: LumpSumsJSON = [] + fs.readdirSync('./data').forEach(async function (file) { const matched = file.match(/lumpSums_(\d{4}-\d{2}-\d{2})\.tsv/i) if (matched && matched.length > 1) { const dataStr = fs.readFileSync('./data/' + file, 'utf8') - const validFrom = new Date(matched[1]) - const data = parseRawLumpSums(dataStr) + const validFrom = new Date(matched[1]).valueOf() + const data = await parseRawLumpSums(dataStr) lumpSums.push({ validFrom, data }) } }) @@ -15,10 +49,89 @@ export function loadLumpSums() { return lumpSums } +export async function parseRawLumpSums(dataStr: string): Promise { + const refinedString = await fixTableSpezialties(dataStr) + const data = csvToObjects(refinedString, '\t', ',', '') + assertAreRawLumpSums(data) + const rawLumpSums = combineSpezials(data) + const lumpSums: LumpSumWithCountryCode[] = [] + for (const rawLumpSum of rawLumpSums) { + lumpSums.push(await findCountryCode(convertRawLumpSum(rawLumpSum))) + } + return lumpSums +} + +function combineSpezials(rawLumpSums: (RawLumpSum & { city?: string })[]): RawLumpSumWithCities[] { + const general = /im Übrigen/i + const spezialStart = /^–\s{2,}(.*)/i + var spezials = [] + for (var i = rawLumpSums.length - 1; i >= 0; i--) { + const matched = rawLumpSums[i].country.match(spezialStart) + if (matched && matched.length > 1) { + rawLumpSums[i].city = matched[1] + delete (rawLumpSums[i] as any).country + spezials.push(rawLumpSums[i]) + rawLumpSums.splice(i, 1) + } else if (spezials.length > 0) { + for (var j = spezials.length - 1; j >= 0; j--) { + if (general.test(spezials[j].city as string)) { + delete spezials[j].city + Object.assign(rawLumpSums[i], spezials[j]) + spezials.splice(j, 1) + break + } + } + ;(rawLumpSums[i] as any).spezials = spezials + spezials = [] + } + } + return rawLumpSums +} + +async function findCountryCode(lumpSum: LumpSumWithCountryName, countryNameLanguage = 'de'): Promise { + const conditions: any = {} + conditions.$or = [{}, {}] + conditions.$or[0]['name.' + countryNameLanguage] = lumpSum.country + conditions.$or[1]['alias.' + countryNameLanguage] = lumpSum.country + const country = await Country.findOne(conditions).lean() + if (!country) { + throw Error('"' + lumpSum.country + '" not found!') + } + const lumpSumWithCode: LumpSumWithCountryCode = Object.assign(lumpSum, { countryCode: country._id, country: undefined }) + return lumpSumWithCode +} + +function convertRawLumpSum(raw: RawLumpSumWithCities): LumpSumWithCountryName { + const spezials: CountryLumpSum['spezials'] = [] + if (raw.spezials) { + for (const spezial of raw.spezials) { + spezials.push({ + catering24: parseInt(spezial.catering24, 10), + catering8: parseInt(spezial.catering8, 10), + overnight: parseInt(spezial.overnight, 10), + city: spezial.city + }) + } + } + + return { + country: raw.country, + catering24: parseInt(raw.catering24, 10), + catering8: parseInt(raw.catering8, 10), + overnight: parseInt(raw.overnight, 10), + spezials + } +} + /** * @returns Array of JS Objects */ -export function csvToObjects(csv: string, separator = '\t', arraySeparator = ', '): { [key: string]: string | string[] }[] { +export function csvToObjects( + csv: string, + separator = '\t', + arraySeparator = ', ', + resultForEmptyString: any = undefined +): { [key: string]: string | string[] }[] { var lines = csv.split('\n') var result = [] var headers = lines[0].split(separator) @@ -28,11 +141,16 @@ export function csvToObjects(csv: string, separator = '\t', arraySeparator = ', break } var currentline = lines[i].split(separator) + if (currentline.length !== headers.length) { + throw Error('Line (#' + (i + 1) + ') has other length than header: ' + lines[i]) + } for (var j = 0; j < headers.length; j++) { // search for [] to identify arrays const match = currentline[j].match(/^\[(.*)\]$/) if (match === null) { - if (currentline[j] !== '') { + if (currentline[j] === '') { + obj[headers[j]] = resultForEmptyString + } else { obj[headers[j]] = currentline[j] } } else { @@ -44,30 +162,21 @@ export function csvToObjects(csv: string, separator = '\t', arraySeparator = ', return result } -export function parseRawLumpSums(dataStr: string) { - const general = /im Übrigen/i - const spezialStart = /^–\s{2,}(.*)/i - const data: { [key: string]: string | string[] | { [key: string]: string | string[] }[] }[] = csvToObjects(dataStr) - var spezials = [] - for (var i = data.length - 1; i >= 0; i--) { - const matched = (data[i].country as string).match(spezialStart) - if (matched && matched.length > 1) { - data[i].city = matched[1] - delete data[i].country - spezials.push(data[i]) - data.splice(i, 1) - } else if (spezials.length > 0) { - for (var j = spezials.length - 1; j >= 0; j--) { - if (general.test(spezials[j].city as string)) { - delete spezials[j].city - Object.assign(data[i], spezials[j]) - spezials.splice(j, 1) - break - } - } - data[i].spezials = spezials as { [key: string]: string | string[] }[] - spezials = [] +async function fixTableSpezialties(dataStr: string): Promise { + // Remove empty Lines + var result = dataStr.replace(/^\t+\n/gm, '') + + // Remove line breaks inside quotes + const escapedCells = result.matchAll(/".*(\n).*"/dgm) + for await (const match of escapedCells) { + if ((match as any).indicies) { + const m = (match as any).indicies[1] + result = result.slice(0, m[0]) + result.slice(m[1]) } } - return data + //Remove quotes + var result = result.replace(/"/gm, '') + + console.log(result) + return result } diff --git a/backend/data/run-parser.ts b/backend/data/run-parser.ts new file mode 100644 index 00000000..7961f30f --- /dev/null +++ b/backend/data/run-parser.ts @@ -0,0 +1,4 @@ +import '../db.js' +import { parseLumpSumsFiles } from './parser.js' + +parseLumpSumsFiles() diff --git a/backend/db.ts b/backend/db.ts index 591e9837..cdb261e5 100644 --- a/backend/db.ts +++ b/backend/db.ts @@ -12,6 +12,7 @@ import mongoose, { Model } from 'mongoose' import i18n from './i18n.js' import { CountryLumpSum } from '../common/types.js' import Organisation from './models/organisation.js' +import { LumpSumsJSON } from './data/parser.js' await connectDB() @@ -43,7 +44,7 @@ async function initDB() { } await initer(Currency, 'currencies', currencies) await initer(Country, 'countries', countries) - await addAllLumpSums() + //await addLumpSumsToCountries(iLumpSums) initer(HealthInsurance, 'health insurances', healthInsurances) initer(Organisation, 'organisation', organisations) } @@ -58,78 +59,28 @@ async function initer(model: Model, name: string, data: T[]) { } } -async function addAllLumpSums() { - iLumpSums.sort((a, b) => new Date(a.validFrom).valueOf() - new Date(b.validFrom).valueOf()) - for (const lumpSum of iLumpSums) { - const result = await addLumpSumsToCountries(lumpSum.data, new Date(lumpSum.validFrom), 'de') - console.log( - 'Lump sum from ' + - lumpSum.validFrom + - ': ' + - result.success.length + - ' updated - ' + - result.noUpdate.length + - ' not updated - ' + - result.noCountryFound.length + - ' no country found' - ) - for (const notFound of result.noCountryFound) { - console.log(notFound.country) - } - } -} - -export async function addLumpSumsToCountries(lumpSums: (typeof iLumpSums)[0]['data'], validFrom: Date, countryNameLanguage = 'de') { - const conditions: any = {} - const noCountryFound = [] - const success = [] - const noUpdate = [] - for (const lumpSum of lumpSums) { - conditions.$or = [{}, {}] - conditions.$or[0]['name.' + countryNameLanguage] = lumpSum.country - conditions.$or[1]['alias.' + countryNameLanguage] = lumpSum.country - - const country = await Country.findOne(conditions) - if (country) { - var newData = true - for (const countrylumpSums of country.lumpSums) { - if ((countrylumpSums.validFrom as Date).valueOf() >= validFrom.valueOf()) { - newData = false - break +export async function addLumpSumsToCountries(lumpSumsJSON: LumpSumsJSON) { + lumpSumsJSON.sort((a, b) => a.validFrom - b.validFrom) + for (const lumpSums of lumpSumsJSON) { + for (const lumpSum of lumpSums.data) { + const country = await Country.findOne({ _id: lumpSum.countryCode }) + if (country) { + var newData = true + for (const countrylumpSums of country.lumpSums) { + if ((countrylumpSums.validFrom as Date).valueOf() >= lumpSums.validFrom) { + newData = false + break + } + } + if (newData) { + const newLumpSum: CountryLumpSum = Object.assign({ validFrom: new Date(lumpSums.validFrom) }, lumpSum) + country.lumpSums.push(newLumpSum) + country.markModified('lumpSums') + country.save() } - } - if (newData) { - const newLumpSum: CountryLumpSum = Object.assign({ validFrom }, convertRawLumpSum(lumpSum)) - country.lumpSums.push(newLumpSum) - country.markModified('lumpSums') - success.push(await country.save()) } else { - noUpdate.push(country) + throw Error('No Country with id "' + lumpSum.countryCode + '" found') } - } else { - noCountryFound.push(lumpSum) } } - return { success, noUpdate, noCountryFound } -} - -function convertRawLumpSum(raw: (typeof iLumpSums)[0]['data'][0]): Omit { - const spezials: CountryLumpSum['spezials'] = [] - if (raw.spezials) { - for (const spezial of raw.spezials as { [key: string]: string | string[] }[]) { - spezials.push({ - catering24: parseInt(spezial.catering24 as string, 10), - catering8: parseInt(spezial.catering8 as string, 10), - overnight: parseInt(spezial.overnight as string, 10), - city: spezial.city as string - }) - } - } - - return { - catering24: parseInt(raw.catering24 as string, 10), - catering8: parseInt(raw.catering8 as string, 10), - overnight: parseInt(raw.overnight as string, 10), - spezials - } } diff --git a/backend/package.json b/backend/package.json index af0a76fb..e9e2e83b 100644 --- a/backend/package.json +++ b/backend/package.json @@ -48,7 +48,7 @@ "test:production": "npm run build && ava --serial dist/build/tests/**/*", "test:development": "npm run build && ava --serial dist/app/tests/**/*", "parse-lumpsums": "npm run parse-lumpsums:$NODE_ENV", - "parse-lumpsums:production": "npm run build && node dist/build/data/parser.js", - "parse-lumpsums:development": "npm run build && node dist/app/data/parser.js" + "parse-lumpsums:production": "npm run build && node dist/build/data/run-parser.js", + "parse-lumpsums:development": "npm run build && node dist/app/data/run-parser.js" } } \ No newline at end of file diff --git a/lump-sum-update.md b/lump-sum-update.md index c9c1a7f6..1081e766 100644 --- a/lump-sum-update.md +++ b/lump-sum-update.md @@ -8,4 +8,4 @@ country catering24 catering8 overnight ``` -4. run `docker-compose exec -T backend npm parse-lumpsums` +4. run `docker compose exec -T backend npm run parse-lumpsums`