Skip to content

Commit

Permalink
rewrite parser
Browse files Browse the repository at this point in the history
  • Loading branch information
david-loe committed Dec 7, 2023
1 parent 7cc07cf commit 4361e5d
Show file tree
Hide file tree
Showing 7 changed files with 171 additions and 105 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
.vscode
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"typescript.tsdk": "backend/node_modules/typescript/lib"
}
171 changes: 140 additions & 31 deletions backend/data/parser.ts
Original file line number Diff line number Diff line change
@@ -1,24 +1,137 @@
import fs from 'fs'
import { CountryLumpSum } from '../../common/types.js'
import Country from '../models/country.js'

export function loadLumpSums() {
const lumpSums: { validFrom: Date; data: any }[] = []
fs.readdirSync('./data').forEach((file) => {
/**
 * One row of the lump-sum table as it comes out of the CSV/TSV parser, before any
 * numeric conversion. Every field is still raw text; the three amounts are turned
 * into integers later by convertRawLumpSum.
 *
 * `country` holds the value of the `country` column. combineSpezials treats values
 * that start with two or more whitespace characters as city rows rather than countries.
 */
interface RawLumpSum {
country: string
catering8: string
catering24: string
overnight: string
}

/**
 * A country row with its city-specific rows attached.
 * combineSpezials fills `spezials` with the indented rows that follow a country row;
 * rows without such entries have no `spezials` property. Amounts are still raw text.
 */
interface RawLumpSumWithCities extends RawLumpSum {
spezials?: {
city: string
catering8: string
catering24: string
overnight: string
}[]
}

/**
 * Shape of the data serialized to ./data/lumpSums.json: one entry per parsed TSV file.
 * `validFrom` is the numeric value of the date taken from the file name (`Date.valueOf()`).
 */
export type LumpSumsJSON = { data: LumpSumWithCountryCode[]; validFrom: number }[]
/** Converted lump sum identified by a country code (set from the matching Country document's `_id`). */
type LumpSumWithCountryCode = Omit<CountryLumpSum, 'validFrom'> & { countryCode: string }
/** Converted lump sum still identified by the country name read from the table. */
type LumpSumWithCountryName = Omit<CountryLumpSum, 'validFrom'> & { country: string }

/**
 * Type guard for a single parsed table row.
 *
 * Returns true only for a non-null object whose `country`, `catering8`, `catering24`
 * and `overnight` properties are all strings, so that the narrowed type matches what
 * RawLumpSum actually declares. Never throws, whatever value is passed in.
 *
 * @param data value to check (typically one element of the csvToObjects result)
 * @returns whether `data` can safely be treated as a RawLumpSum
 */
function isRawLumpSum(data: any): data is RawLumpSum {
  // Guard first: property access on null/undefined would throw instead of returning false.
  if (typeof data !== 'object' || data === null) {
    return false
  }
  return (
    typeof data.country === 'string' &&
    typeof data.catering8 === 'string' &&
    typeof data.catering24 === 'string' &&
    typeof data.overnight === 'string'
  )
}

/**
 * Asserts that every element of `data` passes isRawLumpSum.
 *
 * @param data rows to validate
 * @throws TypeError for the first element that is not a RawLumpSum; the message contains
 *         the offending row serialized as JSON so it can be located in the source table
 */
function assertAreRawLumpSums(data: any[]): asserts data is RawLumpSum[] {
  for (const item of data) {
    if (!isRawLumpSum(item)) {
      // Plain string concatenation would render an object as "[object Object]".
      throw new TypeError('Raw lump sums are of wrong type: ' + JSON.stringify(item))
    }
  }
}

/**
 * Parses every `lumpSums_YYYY-MM-DD.tsv` file found in ./data.
 *
 * For each matching file the date from the file name becomes `validFrom` (as a numeric
 * timestamp) and the file content is parsed with parseRawLumpSums. If ./data/lumpSums.json
 * does not exist yet, the combined result is written there; an existing file is left untouched.
 *
 * Files are processed one after another and each parse is awaited, so the result is
 * complete before it is written and returned.
 *
 * @returns one entry per parsed file, in directory listing order
 * @throws whatever parseRawLumpSums throws (e.g. malformed rows or unknown countries)
 */
export async function parseLumpSumsFiles(): Promise<LumpSumsJSON> {
  const lumpSums: LumpSumsJSON = []
  // A plain loop is required here: Array.prototype.forEach does not wait for async callbacks.
  for (const file of fs.readdirSync('./data')) {
    const matched = file.match(/lumpSums_(\d{4}-\d{2}-\d{2})\.tsv/i)
    if (!matched || matched.length <= 1) {
      continue
    }
    const dataStr = fs.readFileSync('./data/' + file, 'utf8')
    const validFrom = new Date(matched[1]).valueOf()
    const data = await parseRawLumpSums(dataStr)
    lumpSums.push({ validFrom, data })
  }
  if (!fs.existsSync('./data/lumpSums.json')) {
    fs.writeFileSync('./data/lumpSums.json', JSON.stringify(lumpSums), 'utf-8')
  }
  return lumpSums
}

/**
 * Turns the text of one lump-sum table into converted lump sums keyed by country code.
 *
 * Steps: clean up the raw text, split it into row objects (tab separated, empty cells
 * become ''), validate the rows, attach city rows to their country, then convert each
 * country row and resolve its country code one at a time.
 *
 * @param dataStr content of a lump-sum TSV file
 * @returns the converted lump sums in table order
 */
export async function parseRawLumpSums(dataStr: string): Promise<LumpSumWithCountryCode[]> {
  const rows = csvToObjects(await fixTableSpezialties(dataStr), '\t', ',', '')
  assertAreRawLumpSums(rows)

  const result: LumpSumWithCountryCode[] = []
  for (const combined of combineSpezials(rows)) {
    const converted = convertRawLumpSum(combined)
    result.push(await findCountryCode(converted))
  }
  return result
}

/**
 * Folds city rows into the country row that precedes them.
 *
 * A row whose `country` value starts with two or more whitespace characters is a city row:
 * the text after the indentation becomes its `city`, its `country` is removed and the row
 * is taken out of the list. When the walk (which runs from the last row to the first)
 * reaches a regular row while city rows are pending, the pending row matching "im Übrigen"
 * (if any) is merged into that regular row and the remaining ones are stored as `spezials`.
 *
 * The input array and its row objects are modified in place; the same array is returned.
 *
 * @param rawLumpSums validated table rows in file order
 * @returns the same array, now containing only country rows
 */
function combineSpezials(rawLumpSums: (RawLumpSum & { city?: string })[]): RawLumpSumWithCities[] {
  const remainderPattern = /im Übrigen/i
  const indentedPattern = /^\s{2,}(.*)/i
  let pending: (RawLumpSum & { city?: string })[] = []

  let index = rawLumpSums.length
  while (index-- > 0) {
    const row = rawLumpSums[index]
    const cityMatch = row.country.match(indentedPattern)

    if (cityMatch && cityMatch.length > 1) {
      // City row: remember it and drop it from the main list.
      row.city = cityMatch[1]
      delete (row as any).country
      pending.push(row)
      rawLumpSums.splice(index, 1)
      continue
    }

    if (pending.length === 0) {
      continue
    }

    // Country row with pending cities: the "im Übrigen" entry supplies the country's own values.
    let k = pending.length
    while (k-- > 0) {
      if (remainderPattern.test(pending[k].city as string)) {
        delete pending[k].city
        Object.assign(row, pending[k])
        pending.splice(k, 1)
        break
      }
    }
    ;(row as any).spezials = pending
    pending = []
  }

  return rawLumpSums
}

/**
 * Resolves the country of a lump sum to its country code.
 *
 * Queries the Country model for a document whose `name.<language>` or `alias.<language>`
 * equals `lumpSum.country`. On success the passed-in object itself is extended with
 * `countryCode` (the document's `_id`) and its `country` is set to undefined.
 *
 * @param lumpSum converted lump sum carrying a country name
 * @param countryNameLanguage language key used for the name/alias lookup
 * @returns the same object, now carrying `countryCode`
 * @throws Error if no matching country exists
 */
async function findCountryCode(lumpSum: LumpSumWithCountryName, countryNameLanguage = 'de'): Promise<LumpSumWithCountryCode> {
  const nameKey = 'name.' + countryNameLanguage
  const aliasKey = 'alias.' + countryNameLanguage
  const query: any = { $or: [{ [nameKey]: lumpSum.country }, { [aliasKey]: lumpSum.country }] }

  const match = await Country.findOne(query).lean()
  if (match) {
    const tagged: LumpSumWithCountryCode = Object.assign(lumpSum, { countryCode: match._id, country: undefined })
    return tagged
  }
  throw Error('"' + lumpSum.country + '" not found!')
}

/**
 * Converts a raw (text) lump sum into its numeric form.
 *
 * The three amounts of the country and of every city entry are parsed as base-10
 * integers with parseInt; text that does not start with a number yields NaN.
 * `spezials` is always an array in the result, empty when the raw row has none.
 *
 * @param raw country row with optional city rows, amounts as strings
 * @returns the country name together with the parsed amounts and city entries
 */
function convertRawLumpSum(raw: RawLumpSumWithCities): LumpSumWithCountryName {
  const toInt = (text: string) => parseInt(text, 10)

  const spezials: CountryLumpSum['spezials'] = raw.spezials
    ? raw.spezials.map((entry) => ({
        catering24: toInt(entry.catering24),
        catering8: toInt(entry.catering8),
        overnight: toInt(entry.overnight),
        city: entry.city
      }))
    : []

  return {
    country: raw.country,
    catering24: toInt(raw.catering24),
    catering8: toInt(raw.catering8),
    overnight: toInt(raw.overnight),
    spezials
  }
}

/**
* @returns Array of JS Objects
*/
export function csvToObjects(csv: string, separator = '\t', arraySeparator = ', '): { [key: string]: string | string[] }[] {
export function csvToObjects(
csv: string,
separator = '\t',
arraySeparator = ', ',
resultForEmptyString: any = undefined
): { [key: string]: string | string[] }[] {
var lines = csv.split('\n')
var result = []
var headers = lines[0].split(separator)
Expand All @@ -28,11 +141,16 @@ export function csvToObjects(csv: string, separator = '\t', arraySeparator = ',
break
}
var currentline = lines[i].split(separator)
if (currentline.length !== headers.length) {
throw Error('Line (#' + (i + 1) + ') has other length than header: ' + lines[i])
}
for (var j = 0; j < headers.length; j++) {
// search for [] to identify arrays
const match = currentline[j].match(/^\[(.*)\]$/)
if (match === null) {
if (currentline[j] !== '') {
if (currentline[j] === '') {
obj[headers[j]] = resultForEmptyString
} else {
obj[headers[j]] = currentline[j]
}
} else {
Expand All @@ -44,30 +162,21 @@ export function csvToObjects(csv: string, separator = '\t', arraySeparator = ',
return result
}

export function parseRawLumpSums(dataStr: string) {
const general = /im Übrigen/i
const spezialStart = /^\s{2,}(.*)/i
const data: { [key: string]: string | string[] | { [key: string]: string | string[] }[] }[] = csvToObjects(dataStr)
var spezials = []
for (var i = data.length - 1; i >= 0; i--) {
const matched = (data[i].country as string).match(spezialStart)
if (matched && matched.length > 1) {
data[i].city = matched[1]
delete data[i].country
spezials.push(data[i])
data.splice(i, 1)
} else if (spezials.length > 0) {
for (var j = spezials.length - 1; j >= 0; j--) {
if (general.test(spezials[j].city as string)) {
delete spezials[j].city
Object.assign(data[i], spezials[j])
spezials.splice(j, 1)
break
}
}
data[i].spezials = spezials as { [key: string]: string | string[] }[]
spezials = []
/**
 * Normalizes the raw text of a lump-sum table so it can be split into rows and cells.
 *
 * 1. Drops lines that consist only of tab characters (empty table rows).
 * 2. Removes line breaks that occur inside a double-quoted cell, so a cell that was
 *    wrapped onto several lines ends up on a single row again.
 * 3. Removes all double quotes.
 *
 * @param dataStr raw table text
 * @returns the cleaned-up table text
 */
async function fixTableSpezialties(dataStr: string): Promise<string> {
  // Remove empty lines
  const withoutEmptyLines = dataStr.replace(/^\t+\n/gm, '')

  // Remove line breaks inside quotes. Each quoted cell is matched on its own (from one
  // quote to the next), so row breaks between two quoted cells are preserved, and the
  // replace callback avoids working with offsets that go stale while the text shrinks.
  const withoutCellBreaks = withoutEmptyLines.replace(/"[^"]*"/g, (cell) => cell.replace(/\n/g, ''))

  // Remove quotes
  return withoutCellBreaks.replace(/"/g, '')
}
4 changes: 4 additions & 0 deletions backend/data/run-parser.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// Imported for its side effects only: db.ts runs `await connectDB()` at module top level.
import '../db.js'
import { parseLumpSumsFiles } from './parser.js'

// Entry point of the `parse-lumpsums` npm scripts. The returned promise is not awaited here.
parseLumpSumsFiles()
91 changes: 21 additions & 70 deletions backend/db.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import mongoose, { Model } from 'mongoose'
import i18n from './i18n.js'
import { CountryLumpSum } from '../common/types.js'
import Organisation from './models/organisation.js'
import { LumpSumsJSON } from './data/parser.js'

await connectDB()

Expand Down Expand Up @@ -43,7 +44,7 @@ async function initDB() {
}
await initer<any>(Currency, 'currencies', currencies)
await initer<any>(Country, 'countries', countries)
await addAllLumpSums()
//await addLumpSumsToCountries(iLumpSums)
initer(HealthInsurance, 'health insurances', healthInsurances)
initer<any>(Organisation, 'organisation', organisations)
}
Expand All @@ -58,78 +59,28 @@ async function initer<T>(model: Model<T>, name: string, data: T[]) {
}
}

/**
 * Applies every entry of `iLumpSums` to the countries, oldest `validFrom` first.
 *
 * `iLumpSums` is sorted in place. For each entry one summary line with the number of
 * updated, not updated and unmatched countries is logged, followed by one line per
 * country name that could not be matched.
 */
async function addAllLumpSums() {
  const epochOf = (entry: (typeof iLumpSums)[0]) => new Date(entry.validFrom).valueOf()
  iLumpSums.sort((first, second) => epochOf(first) - epochOf(second))

  for (const entry of iLumpSums) {
    const { success, noUpdate, noCountryFound } = await addLumpSumsToCountries(entry.data, new Date(entry.validFrom), 'de')
    const summary =
      'Lump sum from ' +
      entry.validFrom +
      ': ' +
      success.length +
      ' updated - ' +
      noUpdate.length +
      ' not updated - ' +
      noCountryFound.length +
      ' no country found'
    console.log(summary)
    noCountryFound.forEach((missing) => {
      console.log(missing.country)
    })
  }
}

export async function addLumpSumsToCountries(lumpSums: (typeof iLumpSums)[0]['data'], validFrom: Date, countryNameLanguage = 'de') {
const conditions: any = {}
const noCountryFound = []
const success = []
const noUpdate = []
for (const lumpSum of lumpSums) {
conditions.$or = [{}, {}]
conditions.$or[0]['name.' + countryNameLanguage] = lumpSum.country
conditions.$or[1]['alias.' + countryNameLanguage] = lumpSum.country

const country = await Country.findOne(conditions)
if (country) {
var newData = true
for (const countrylumpSums of country.lumpSums) {
if ((countrylumpSums.validFrom as Date).valueOf() >= validFrom.valueOf()) {
newData = false
break
/**
 * Adds parsed lump sums to their Country documents.
 *
 * Entries are processed in ascending `validFrom` order. A lump sum is appended to a
 * country only if that country has no stored lump sum with the same or a later
 * `validFrom`; otherwise the country is left unchanged. Every save is awaited, so the
 * returned promise resolves only after all changes are written and rejects if a save fails.
 *
 * @param lumpSumsJSON parsed lump sums grouped by validity date (not modified)
 * @throws Error if a lump sum references a country code for which no Country exists
 */
export async function addLumpSumsToCountries(lumpSumsJSON: LumpSumsJSON): Promise<void> {
  // Sort a copy so the caller's array keeps its order.
  const ordered = [...lumpSumsJSON].sort((a, b) => a.validFrom - b.validFrom)
  for (const lumpSums of ordered) {
    for (const lumpSum of lumpSums.data) {
      const country = await Country.findOne({ _id: lumpSum.countryCode })
      if (!country) {
        throw Error('No Country with id "' + lumpSum.countryCode + '" found')
      }
      let newData = true
      for (const countrylumpSums of country.lumpSums) {
        if ((countrylumpSums.validFrom as Date).valueOf() >= lumpSums.validFrom) {
          newData = false
          break
        }
      }
      if (newData) {
        const newLumpSum: CountryLumpSum = Object.assign({ validFrom: new Date(lumpSums.validFrom) }, lumpSum)
        country.lumpSums.push(newLumpSum)
        country.markModified('lumpSums')
        // Await the write so failures surface here instead of as unhandled rejections.
        await country.save()
      }
    }
  }
}

/**
 * Converts one raw lump sum entry (all values as text) into its numeric form.
 *
 * The amounts of the entry and of each of its `spezials` are parsed as base-10 integers
 * with parseInt. The result always carries a `spezials` array, empty if the raw entry
 * has none; `validFrom` is not part of the result.
 *
 * @param raw one element of an `iLumpSums` data list
 * @returns the parsed amounts and city entries
 */
function convertRawLumpSum(raw: (typeof iLumpSums)[0]['data'][0]): Omit<CountryLumpSum, 'validFrom'> {
  const toInt = (text: string | string[]) => parseInt(text as string, 10)

  const spezials: CountryLumpSum['spezials'] = raw.spezials
    ? (raw.spezials as { [key: string]: string | string[] }[]).map((entry) => ({
        catering24: toInt(entry.catering24),
        catering8: toInt(entry.catering8),
        overnight: toInt(entry.overnight),
        city: entry.city as string
      }))
    : []

  return {
    catering24: toInt(raw.catering24 as string),
    catering8: toInt(raw.catering8 as string),
    overnight: toInt(raw.overnight as string),
    spezials
  }
}
4 changes: 2 additions & 2 deletions backend/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
"test:production": "npm run build && ava --serial dist/build/tests/**/*",
"test:development": "npm run build && ava --serial dist/app/tests/**/*",
"parse-lumpsums": "npm run parse-lumpsums:$NODE_ENV",
"parse-lumpsums:production": "npm run build && node dist/build/data/parser.js",
"parse-lumpsums:development": "npm run build && node dist/app/data/parser.js"
"parse-lumpsums:production": "npm run build && node dist/build/data/run-parser.js",
"parse-lumpsums:development": "npm run build && node dist/app/data/run-parser.js"
}
}
2 changes: 1 addition & 1 deletion lump-sum-update.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@
country catering24 catering8 overnight
```

4. run `docker-compose exec -T backend npm parse-lumpsums`
4. run `docker compose exec -T backend npm run parse-lumpsums`

0 comments on commit 4361e5d

Please sign in to comment.