@enum CSVType invaliddata namedtupledata singledata pairdata assocdata groupdata
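#in-memory @REPLACE templates: when parsed, they erase any previously read values of the
#listed parameters (k, or epsilon_assoc/bondvol) without adding new ones (see the
#@REPLACE prefix description in the getparams docstring below).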
const NO_KIJ = """@REPLACE Clapeyron Database File
no Parameters [csvtype = unlike]
species1,species2,k
"""
const NO_ASSOC = """@REPLACE Clapeyron Database File
no Parameters [csvtype = assoc]
species1,species2,site1,site2,epsilon_assoc,bondvol
"""
include("database_rawparam.jl")
include("database_utils.jl")
"""
params, sites = getparams(components,locations;kwargs...)
Returns a `Dict{String,ClapeyronParam}` containing all the parameters found for the list of components
in the available CSVs. `locations` are paths relative to the `Clapeyron` database. The available keywords are the ones used in [`ParamOptions`](@ref).
If `return_sites` is set to false, `getparams` will only return the found params.
## Single to Pair promotion
When reading multiple CSVs, if a parameter name appears both in a single parameter file and in a pair parameter file, the single parameter values will be promoted to be the diagonal values of the pair interaction matrix:
**`my_parameter_single.csv`**
```
Clapeyron Database File
like parameters
species,a
sp1,1000
sp2,700
sp3,850
```
**`my_parameter_pair.csv`**
```
Clapeyron Database File
pair parameters
species1,species2,a
sp1,sp2,875
sp2,sp3,792
sp3,sp1,960
```

```julia-repl
julia> res = getparams(["sp1","sp2"],userlocations = ["my_parameter_single.csv","my_parameter_pair.csv"])
Dict{String, Clapeyron.ClapeyronParam} with 1 entry:
"a" => PairParam{Int64}("a")["sp1", "sp2"]
julia> res["a"].values
2×2 Matrix{Int64}:
1000 875
875 700
```
This promotion only happens in Single-Pair combinations; it fails otherwise.
## In-memory CSV parsing
If you pass any string starting with `Clapeyron Database File`, it will be parsed as a CSV instead of being used as a filepath:
```julia
julia> x = \"\"\"Clapeyron Database File,
in memory like parameters
species,a,b
sp1,1000,0.05
sp2,700,0.41
\"\"\"
"Clapeyron Database File,\nin memory parameters [csvtype = like,grouptype = in_memory_read]\nspecies,a,b\nsp1,1000,0.05\nsp2,700,0.41\n"
julia> Clapeyron.getparams(["sp1","sp2"],userlocations = [x])
Dict{String, Clapeyron.ClapeyronParam} with 2 entries:
"b" => SingleParam{Float64}("b")["sp1", "sp2"]
"a" => SingleParam{Int64}("a")["sp1", "sp2"]
```
## Special prefixes
There are some special prefixes used by the parser to signal specific behaviour at parsing time, for one CSV or a group of them:
- `@DB`: replaces the path by the current Clapeyron default database. When doing `getparams(components,["location"])`, the paths are lowered to `getparams(components,userlocations = ["@DB/location"])`.
In a way, it is a path shortcut used internally by Clapeyron to parse its own database. You can change the path that `@DB` points to (or add other path shortcuts) by adding a corresponding entry to the `Clapeyron.SHORT_PATHS` Dict.
- `@REPLACE`: Any filepath starting with `@REPLACE` will clear all previous appearances of the parameter names found in the CSV that contains the prefix.
- `@REMOVEDEFAULTS`: used alone, it needs to be passed in the first position of the `userlocations` vector. It will skip parsing of the default parameters.
The effect of these prefixes on the parser can be summarized by the following examples:
```
model = PCSAFT(["water"],userlocations = ["@REMOVEDEFAULTS"]) #fails, no parameters found, no CSV parsed
model = PCSAFT(["water"],userlocations = ["@REPLACE/empty_params.csv"]) #fails, no parameters found, default parameters parsed and then removed
model = PCSAFT(["water"],userlocations = ["@REPLACE/my_pcsaft_kij.csv"]) #success, default kij parameters replaced by the ones on `my_pcsaft_kij.csv`
model = PCSAFT(["water"],userlocations = ["@REMOVEDEFAULTS","@DB/SAFT/PCSAFT","@DB/properties/molarmass.csv"]) #sucess. default parameters csv removed, and parsed again, using the @DB prefix to point to the default database.
```
You can use the `@REPLACE` keyword in an in-memory CSV by adding it at the start of the string, followed by a space:
```
#This will replace all previously parsed occurrences of `a` and `b`
x_replace = \"\"\"@REPLACE Clapeyron Database File,
in memory like parameters
species,a,b
sp1,1000,0.05
sp2,700,0.41
\"\"\"
```
## CSV type detection and group type
The second line of the CSV is used for comments and to identify the type of CSV used. For example:
```
x = \"\"\"Clapeyron Database File
in memory like parameters
species,a,b
sp1,1000,0.05
sp2,700,0.41
\"\"\"
```
Will be parsed as a table with single parameter data. If you want more flexibility, you can instead pass the csvtype between brackets:
```
x = \"\"\"Clapeyron Database File
I can write anything here, unlike, association [csvtype = like] but the csv type is already specified.
species,a,b
sp1,1000,0.05
sp2,700,0.41
\"\"\"
```
Additionally, there are some cases where you want to be absolutely sure that your types don't clash with the default values. This is the case with the different group parametrizations of UNIFAC (Dortmund, VTPR, PSRK):
```
julia> model = UNIFAC(["methanol","ethanol"])
UNIFAC{PR{BasicIdeal, PRAlpha, NoTranslation, vdW1fRule}} with 2 components:
"methanol": "CH3OH" => 1
"ethanol": "CH2" => 1, "CH3" => 1, "OH (P)" => 1
Group Type: UNIFACDortmund
Contains parameters: A, B, C, R, Q
julia> model = PSRKUNIFAC(["methanol","ethanol"])
UNIFAC{BasicIdeal} with 2 components:
"methanol": "CH3OH" => 1
"ethanol": "CH2" => 1, "CH3" => 1, "OH" => 1
Group Type: PSRK
Contains parameters: A, B, C, R, Q
```
The models are the same (`UNIFAC`), but the group parametrizations are different. This is specified with the `grouptype` keyword. For example, if we look at `UNIFAC_groups.csv`, it starts with:
```
Clapeyron Database File,
modified UNIFAC (Dortmund) Groups [csvtype = groups,grouptype = UNIFACDortmund]
species,groups
ethane,"[""CH3"" => 2]"
propane,"[""CH3"" => 2, ""CH2"" => 1]"
butane,"[""CH3"" => 2, ""CH2"" => 2]"
...
```
For compatibility reasons, if you pass a CSV without a grouptype, it will be accepted, but two CSVs with different specified group types cannot be merged:
```
x1 = \"\"\"Clapeyron Database File
parameterization 1 [csvtype = like,grouptype = param1]
species,a,b
sp1,1000,0.05
sp2,700,0.41
\"\"\"
x2 = \"\"\"Clapeyron Database File
fitted to data [csvtype = like,grouptype = fitted]
species,a,b
sp1,912,0.067
sp2,616,0.432
\"\"\"
```
If we pass the same parameters with different group types, the parser will fail:
```julia-repl
julia> Clapeyron.getparams(["sp1","sp2"],userlocations = [x1,x2])
ERROR: cannot join two databases with different group types:
current group type: param1
incoming group type: fitted
```
Note that the parser will not fail if you pass different parameters with different group types (for example, if `a` has the `param1` group type and `b` has the `fitted` group type).
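A minimal sketch with hypothetical values:
```julia
#`a` carries group type param1 while `b` carries group type fitted; since the group
#types belong to different parameters, no merge is attempted and parsing succeeds.
xa = \"\"\"Clapeyron Database File
parameters a [csvtype = like,grouptype = param1]
species,a
sp1,1000
sp2,700
\"\"\"
xb = \"\"\"Clapeyron Database File
parameters b [csvtype = like,grouptype = fitted]
species,b
sp1,0.067
sp2,0.432
\"\"\"
Clapeyron.getparams(["sp1","sp2"],userlocations = [xa,xb]) #no group type clash
```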
"""
function getparams(components,
locations::Array{String,1}=String[];
userlocations = String[],
asymmetricparams::Vector{String}=String[],
ignore_missing_singleparams::Vector{String}=String[],
ignore_headers::Vector{String} = IGNORE_HEADERS,
verbose::Bool=false,
species_columnreference::String="species",
source_columnreference::String="source",
site_columnreference::String="site",
normalisecomponents::Bool=true,
return_sites::Bool = true,
component_delimiter::String = "~|~"
)
options = ParamOptions(;userlocations,
asymmetricparams,
ignore_missing_singleparams,
ignore_headers,
verbose,
species_columnreference,
source_columnreference,
site_columnreference,
normalisecomponents,
return_sites,
component_delimiter)
# locations is a list of paths relative to the Clapeyron database directory.
# userlocations is a list of paths input by the user.
# If parameters exist in multiple files, Clapeyron gives priority to files in later paths.
# asymmetricparams is a list of parameters for which matrix reflection is disabled.
# ignore_missing_singleparams gives users the option to disable the component existence check in single params.
return getparams(format_components(components),locations,options)
end
function getparams(components::Vector{String},locations::Vector{String},options::ParamOptions)
#generate one string of params
filepaths = flattenfilepaths(locations,options.userlocations)
#merge all found params
allparams,allnotfoundparams = createparams(components, filepaths, options)
#generate sites, if any
sites = buildsites(components,allparams,allnotfoundparams,options)# Union{SiteParam,Nothing}
#generate ClapeyronParams
result = compile_params(components,allparams,allnotfoundparams,sites,options)
#check values
for v ∈ values(result)
is_valid_param(v,options)
end
return result
end
function buildsites(components,allparams,allnotfoundparams,options)
#if we are asked not to build sites, do nothing
options.return_sites || return nothing
#if there aren't any assoc data files, return nothing
assoc_data_found = any(x -> x.type == assocdata,values(allparams))
assoc_data_notfound = any(x -> x == assocdata,values(allnotfoundparams))
!assoc_data_found && !assoc_data_notfound && return nothing
#if there are no sites in the assoc files, return an empty SiteParam
anysites(allparams,components) || return SiteParam(components)
#Find the names of all possible sites for each component.
allcomponentsites = findsites(allparams,components)
#Unique sites
v = String[]
for sitei ∈ allcomponentsites
append!(v,sitei)
end
unique!(v)
@assert length(v) != 0 #this should not be false, we already checked with anysites
#=
build our own dict, by using the transformation X => n_X
=#
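#(e.g. the number of sites of a site named "e" is read from a column named "n_e")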
if isempty(options.n_sites_columns)
n_sites_columns = Dict{String,String}(vi => string("n_",vi) for vi in v)
else
n_sites_columns = options.n_sites_columns
end
#check for missing number of sites
nc = length(components)
n_sites_dict = Dict{String,Vector{Int}}()
for vi ∈ v
ki = n_sites_columns[vi]
if haskey(allparams,ki)
n_sites_dict[vi] = compile_single_vec(components,allparams[ki])
else
options.verbose && @warn("no columns found containing number of sites of type $(error_color(vi)). assuming zero sites")
n_sites_dict[vi] = zeros(Int,nc)
end
end
#we compile the existing params directly into a vector; this way, incomplete sites get assigned a value of zero.
#n_sites_dict = Dict{String,Vector{Int}}(vi => compile_single_vec(components,allparams[n_sites_columns[vi]]) for vi ∈ v)
n_sites = [[n_sites_dict[allcomponentsites[i][j]][i] for j ∈ 1:length(allcomponentsites[i])] for i ∈ 1:nc] # or groupsites
sourcecsvs = String[]
for vi ∈ v
ki = n_sites_columns[vi]
if haskey(allparams,ki)
csv_vi = allparams[ki].csv
csv_vi != nothing && append!(sourcecsvs,csv_vi)
#remove single params used in sites, those were already consumed
delete!(allparams,ki)
end
end
unique!(sourcecsvs)
res = SiteParam(components,allcomponentsites,n_sites,sourcecsvs)
return res
end
function getparams(groups::GroupParameter, locations::Vector{String}=String[],options::ParamOptions=DefaultOptions)
return getparams(groups.flattenedgroups, locations,options)
end
function anysites(data,components)
for raw ∈ values(data)
if raw.type === assocdata
if length(raw.component_info) != 0
return true
end
end
end
return false
end
function findsites(data::Dict,components::Vector;verbose = false)
sites = Dict(components .=> [Set{String}() for _ ∈ 1:length(components)])
for raw ∈ values(data)
if raw.type === assocdata
for (c1,c2,s1,s2) ∈ raw.component_info
push!(sites[c1], s1)
push!(sites[c2], s2)
end
end
end
output = Array{Array{String,1}}(undef, 0)
for component ∈ components
push!(output, collect(sites[component]))
end
verbose && @info("Found sites for $components are $(output).")
return output
end
#hooks to transform arbitrary data formats into namedtuples or dicts
to_nt(x) = x
#hook to check if a struct can be transformed into a named tuple
can_nt(x) = false
can_nt(x::AbstractDict) = true
can_nt(x::NamedTuple) = true
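#e.g. can_nt((;Mw = [18.02])) == true; such userlocations are routed through
#findparamsinnt instead of the CSV parser (see createparams below).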
@nospecialize
function createparams(components::Vector{String},
filepaths::Vector{String},
options::ParamOptions = DefaultOptions,
parsegroups = :off)
allparams = Dict{String,RawParam}()
allnotfoundparams = Dict{String,CSVType}()
#in case of NamedTuple or Dict user-provided params, the filepath string should be empty.
#but if it's not, parse those anyway.
for filepath ∈ filepaths
_replace = startswith(filepath,"@REPLACE")
if _replace
filepath = chop(filepath,head = 9, tail = 0)
end
csv_options = read_csv_options(filepath)
csvtype = csv_options.csvtype
if csvtype == groupdata && parsegroups != :group
continue
end
if csvtype == assocdata && !options.return_sites
if options.verbose
__verbose_findparams_skipassoc(filepath)
end
continue
end
if csvtype == invaliddata
if options.verbose
__verbose_findparams_invaliddata(filepath)
end
continue
end
foundparams, notfoundparams = findparamsincsv(components,filepath,options,parsegroups,csv_options)
merge_allparams!(allparams,allnotfoundparams,foundparams,notfoundparams,_replace)
end
if can_nt(options.userlocations)
foundparams, notfoundparams = findparamsinnt(components,options,parsegroups,NT_CSV_OPTIONS)
merge_allparams!(allparams,allnotfoundparams,foundparams,notfoundparams,false)
end
#clean not found params.
for (kk,vv) ∈ allparams
delete!(allnotfoundparams,kk)
end
return allparams,allnotfoundparams
end
#helper function, merges params into the main list
function merge_allparams!(allparams,allnotfoundparams,foundparams,notfoundparams,_replace)
for vv ∈ foundparams
kk = vv.name
#we merge if the filepath is not set to replace the current values
if haskey(allparams,kk) && !_replace
vv2 = allparams[kk]
vv = joindata!(vv2,vv)
end
allparams[kk] = vv
end
#Merge not found data
for (kk,vv) ∈ pairs(notfoundparams)
if haskey(allnotfoundparams,kk)
vv2 = allnotfoundparams[kk]
vv, success = joindata!(vv2,vv)
!success && error_clashing_headers(vv2,vv,kk) #Clashing headers error
end
allnotfoundparams[kk] = vv
end
if _replace #if the parameter is not found, that means we want to erase that param.
for (kk,vv) ∈ pairs(notfoundparams)
delete!(allparams,kk)
end
end
return nothing
end
@specialize
function compile_params(components,allparams,allnotfoundparams,sites,options)
#Compile Params
result = Dict{String,ClapeyronParam}()
for (k,v) ∈ allparams
if !(v.type == assocdata && !options.return_sites)
result[k] = compile_param(components,k,v,sites,options)
end
end
for (kk,vv) ∈ allnotfoundparams
result[kk] = compile_param(components,kk,vv,sites,options)
end
#add missing single params, if not in the input databases.
for prop in options.ignore_missing_singleparams
get!(result,prop) do
options.verbose && __verbose_missing_singleparams_added(prop)
SingleParam(prop,components)
end
end
if sites != nothing && options.return_sites
haskey(result,"sites") && throw(error("cannot overwrite \"sites\" key, already exists!"))
result["sites"] = sites
end
return result
end
@noinline function __verbose_missing_singleparams_added(prop)
color_prop = info_color(prop)
@info "Optional Single Parameter $color_prop not found in any databases. Adding empty placeholder"
end
@noinline function _col_indices_error(header)
throw(error("Header ", header, " not found."))
end
function col_indices(csvtype,headernames,options=DefaultOptions)
columnreference = options.species_columnreference
normalised_columnreference = normalisestring(columnreference)
idx_species = 0
idx_species1 = 0
idx_species2 = 0
idx_sites1 = 0
idx_sites2 = 0
if csvtype === singledata || csvtype == groupdata
lookupcolumnindex = findfirst(isequal(normalised_columnreference), headernames)
isnothing(lookupcolumnindex) && _col_indices_error(normalised_columnreference)
idx_species = lookupcolumnindex
#=
if csvtype == groupdata
groupcolumnreference = options.group_columnreference
normalised_groupcolumnreference = normalisestring(groupcolumnreference)
lookup_group_columnindex = findfirst(isequal(normalised_groupcolumnreference), headernames)
lookup_intragroup_columnindex = findfirst(isequal("intragroups"), headernames)
if isnothing(lookup_group_columnindex)
_col_indices_error(normalised_groupcolumnreference)
end
idx_groups = lookupgroupcolumnindex
else
idx_groups = 0
end =#
elseif csvtype === pairdata || csvtype == assocdata
normalised_columnreference1 = normalised_columnreference * '1'
normalised_columnreference2 = normalised_columnreference * '2'
lookupcolumnindex1 = findfirst(isequal(normalised_columnreference1), headernames)
lookupcolumnindex2 = findfirst(isequal(normalised_columnreference2), headernames)
isnothing(lookupcolumnindex1) && _col_indices_error(normalised_columnreference1)
isnothing(lookupcolumnindex2) && _col_indices_error(normalised_columnreference2)
idx_species1 = lookupcolumnindex1
idx_species2 = lookupcolumnindex2
if csvtype == assocdata
sitecolumnreference = options.site_columnreference
normalised_sitecolumnreference = normalisestring(sitecolumnreference)
normalised_sitecolumnreference1 = normalised_sitecolumnreference * '1'
normalised_sitecolumnreference2 = normalised_sitecolumnreference * '2'
lookupsitecolumnindex1 = findfirst(isequal(normalised_sitecolumnreference1), headernames)
lookupsitecolumnindex2 = findfirst(isequal(normalised_sitecolumnreference2), headernames)
isnothing(lookupsitecolumnindex1) && _col_indices_error(normalised_sitecolumnreference1)
isnothing(lookupsitecolumnindex2) && _col_indices_error(normalised_sitecolumnreference2)
idx_sites1 = lookupsitecolumnindex1
idx_sites2 = lookupsitecolumnindex2
end
end
_single = idx_species
_pair = (idx_species1,idx_species2)
_assoc = (idx_sites1,idx_sites2)
return (_single,_pair,_assoc)
end
function read_csv(filepath,options::ParamOptions,sep = :auto)::CSV.File
#actual reading
ignorelist = deepcopy(options.ignore_headers)
map!(normalisestring,ignorelist,ignorelist)
push!(ignorelist,"column") #autogenerated name by CSV.jl
function _drop(i,name)
norm_header = normalisestring(string(name))
normalisestring(norm_header; tofilter=r"[ \-\_\d]") ∈ ignorelist
end
if sep == :auto
sep = read_csv_options(filepath)[:sep]
end
_delims = (comma = ',',space = ' ')
if sep isa Symbol
_delim = get(_delims,sep,string(sep))
else
_delim = sep
end
if is_inline_csv(filepath)
df = CSV.File(IOBuffer(filepath); header=3, pool=0,silencewarnings=true,drop = _drop, stringtype = String, delim = _delim, ntasks = 1,buffer_in_memory = true)
else
df = CSV.File(filepath; header=3, pool=0,silencewarnings=true,drop = _drop, stringtype = String,delim = _delim, ntasks = 1)
end
return df
end
function read_csv(path;relativetodatabase = true)
path_norm = getpath(path;relativetodatabase)
return read_csv(path_norm,ParamOptions(ignore_headers = String[]))
end
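#convenience method, e.g. read_csv("properties/molarmass.csv") should read a CSV
#relative to the default database (path shown for illustration only).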
function findparamsincsv(components,filepath,
options::ParamOptions = DefaultOptions,
parsegroups = :off,
csv_file_options = read_csv_options(filepath) #we do a preliminary reading of the CSV here
)
sourcecolumnreference = options.source_columnreference
verbose = options.verbose
normalisecomponents = options.normalisecomponents
component_delimiter = options.component_delimiter
csvtype = csv_file_options.csvtype
no_parsegroups = parsegroups == :off
correct_group = (parsegroups == :group && csvtype == groupdata)
grouptype = csv_file_options.grouptype
sep = get(csv_file_options,:sep,:comma)
df = read_csv(filepath,options,sep)
csvheaders = String.(Tables.columnnames(df))
headerparams = valid_headerparams(csvheaders,options) #removes all ignored header params
normalised_components = normalisestring.(components,normalisecomponents)
components_dict = Dict(v => k for (k,v) ∈ pairs(normalised_components))
normalised_csvheaders = normalisestring.(csvheaders)
normalised_headerparams = normalisestring.(headerparams)
if normalised_headerparams ⊈ normalised_csvheaders
error("Headers ", setdiff(normalised_headerparams, normalised_csvheaders), " not present ∈ csv header.")
end
#function output
foundvalues = Vector{RawParam}(undef,0)
notfoundvalues = Dict{String,CSVType}(strip(headerparam) => csvtype for headerparam ∈ headerparams)
normalised_sourcecolumnreference = normalisestring(sourcecolumnreference)
getsources = false
if normalised_sourcecolumnreference ∈ normalised_csvheaders
getsources = true
sourcecolumn = findfirst(isequal(normalised_sourcecolumnreference), normalised_csvheaders)::Int
else
sourcecolumn = 0
end
single_idx,pair_idx,assoc_idx = col_indices(csvtype,normalised_csvheaders,options)
lookupcolumnindex = single_idx
lookupcolumnindex1,lookupcolumnindex2 = pair_idx
lookupsitecolumnindex1,lookupsitecolumnindex2 = assoc_idx
headerparams_indices = zeros(Int,length(normalised_headerparams))
map!(i -> findfirst(isequal(i),normalised_csvheaders)::Int,headerparams_indices,normalised_headerparams)
#headerparams_indices = [findfirst(isequal(i),normalised_csvheaders) for i ∈ normalised_headerparams]
lookupcolumnindex = max(lookupcolumnindex,lookupcolumnindex1)
verbose && __verbose_findparams_start(filepath,components,headerparams,parsegroups,csvtype,grouptype)
#list of all species
species_list::Vector{String} = normalisestring.(Tables.getcolumn(df,lookupcolumnindex),normalisecomponents)
#indices where data could be (they could be missing)
#on pair and assoc, this is just the first component, we need to reduce the valid indices again
found_indices0,comp_indices = _indexin(components_dict,species_list,component_delimiter,1:length(species_list))
dfR = df
EMPTY_STR = ""
if csvtype == singledata || correct_group
found_indices = found_indices0
l = length(found_indices)
_data = dfR[found_indices]
_sources = fill(EMPTY_STR,l)
_csv = fill(filepath,l)
_comp = Vector{NTuple{4,String}}(undef,l)
for li ∈ 1:l
_c = components[comp_indices[li]]
_comp[li] = (_c,EMPTY_STR,EMPTY_STR,EMPTY_STR)
end
elseif csvtype == pairdata && no_parsegroups
species2_list::Vector{String} = normalisestring.(Tables.getcolumn(df,lookupcolumnindex2)[found_indices0],normalisecomponents)
found_indices2,comp_indices2 = _indexin(components_dict,species2_list,component_delimiter,1:length(species2_list))
comp_indices1 = comp_indices[found_indices2]
found_indices2 = found_indices0[found_indices2]
l = length(found_indices2)
_data = dfR[found_indices2]
_comp = [(components[c1],components[c2],EMPTY_STR,EMPTY_STR) for (c1,c2) ∈ zip(comp_indices1,comp_indices2)]
_sources = fill(EMPTY_STR,l)
_csv = fill(filepath,l)
elseif csvtype == assocdata && no_parsegroups
species2_list = normalisestring.(Tables.getcolumn(df,lookupcolumnindex2)[found_indices0],normalisecomponents)
found_indices2,comp_indices2 = _indexin(components_dict,species2_list,component_delimiter,1:length(species2_list))
comp_indices1 = comp_indices[found_indices2]
found_indices2 = found_indices0[found_indices2]
l = length(found_indices2)
_data = dfR[found_indices2]
_site1 = Vector{String}(undef,l)
_site2 = similar(_site1)
_comp = Vector{NTuple{4,String}}(undef,l)
for li ∈ 1:l
_s1 = _data[li][lookupsitecolumnindex1]
_s2 = _data[li][lookupsitecolumnindex2]
_c1 = components[comp_indices1[li]]
_c2 = components[comp_indices2[li]]
_site1[li] = _s1
_site2[li] = _s2
_comp[li] = (_c1,_c2,_s1,_s2)
end
_sources = fill(EMPTY_STR,l)
_csv = fill(filepath,l)
elseif csvtype == groupdata && no_parsegroups
return foundvalues, notfoundvalues
else
error("Filepath $filepath is of type ", string(csvtype), " and cannot be read with this function.")
end
#if getsources, then we actually put the sources inside the preallocated _sources vector
if getsources
_fill_sources!(_sources,getindex.(_data,sourcecolumn),EMPTY_STR)
end
#with the raw data preallocated, we now store it in a RawParam.
for (headerparam,idx) ∈ zip(headerparams,headerparams_indices)
_vals = getindex.(_data,idx)
raw::Any = build_raw_param(headerparam,_comp,_vals,_sources,_csv,csvtype,grouptype)
if !iszero(length(raw))
push!(foundvalues,raw)
end
end
#keep only the headers that didn't have a result.
for rawparam ∈ foundvalues
delete!(notfoundvalues,rawparam.name)
end
verbose && __verbose_findparams_found(foundvalues) #print all found values
verbose && __verbose_findparams_not_found(notfoundvalues) #print all not found values
return foundvalues, notfoundvalues
end
#find params in a named tuple; transforms the named tuple into a Vector{RawParam}
function findparamsinnt(components,
options::ParamOptions,
parsegroups = :off,
csv_file_options = NT_CSV_OPTIONS) #default options
verbose = options.verbose
nt = to_nt(options.userlocations)
foundvalues = Vector{RawParam}(undef,0)
notfoundvalues = Dict{String,CSVType}()
#this algorithm is less strict than what we have in CSVs, but it allows us to parse named tuples
for (k,v) in pairs(nt)
ks = string(k)
if k == :groups && parsegroups == :groups
param = RawParam(ks,nothing,copy(v),nothing,nothing,groupdata,:unknown)
push!(foundvalues,param)
elseif (k == :epsilon_assoc || k == :bondvol) && parsegroups == :off && v === nothing #TODO: what to do here in case of other assoc names?
notfoundvalues[ks] = assocdata
elseif v isa Vector && parsegroups == :off
param = RawParam(ks,nothing,copy(v),nothing,nothing,singledata,:unknown)
push!(foundvalues,param)
elseif v isa Matrix && parsegroups == :off
param = RawParam(ks,nothing,vec(copy(v)),nothing,nothing,pairdata,:unknown)
push!(foundvalues,param)
elseif v isa Number && parsegroups == :off && length(components) == 1
param = RawParam(ks,nothing,[v],nothing,nothing,singledata,:unknown)
push!(foundvalues,param)
elseif v isa Dict{Tuple{Tuple{String,String},Tuple{String,String}}}
param = RawParam(ks,Vector{NTuple{4,String}}(undef,0),Vector{valtype(v)}(undef,0),String[],String[],assocdata,:unknown)
empty_string = ""
for (k_dict,v_dict) in pairs(v)
sp1,s1 = first(k_dict)
sp2,s2 = last(k_dict)
push!(param.component_info,(sp1,sp2,s1,s2))
push!(param.data,v_dict)
push!(param.sources,empty_string)
push!(param.csv,empty_string)
end
push!(foundvalues,param)
else
throw(error("cannot parse combination key = $k, value = $v as a valid parameter."))
end
end
# verbose && __verbose_findparams_found(foundvalues) #print all found values
return foundvalues, notfoundvalues
end
function _fill_sources!(input,allsources,tofill)
for i in eachindex(input)
input[i] = coalesce(allsources[i],tofill)
end
return input
end
function build_raw_param(name,comps,vals,sources,csv,csvtype,grouptype)
s::Vector{Int} = findall(!ismissing,vals)
ls = length(s)
_vals = Vector{nonmissingtype(eltype(vals))}(undef,ls)
_sources = Vector{String}(undef,ls)
_comps = similar(comps,ls)
_csv = Vector{String}(undef,ls)
for (i,j) ∈ pairs(s)
_comps[i] = comps[j]
_vals[i] = vals[j]
_sources[i] = sources[j]
_csv[i] = csv[j]
end
return RawParam(string(strip(name)),_comps,_vals,_sources,_csv,csvtype,grouptype)
end
#verbose functionality, executed for each csv when verbose == true
function __verbose_findparams_invaliddata(filepath)
@warn "Skipping $filepath, cannot infer correct csv type. Check line 2 of the CSV to see if it has valid information."
end
function __verbose_findparams_skipassoc(filepath)
@warn "Skipping association file $filepath, the option return_sites was set to false."
end
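#formats an association pair for printing, e.g. ("water","ethanol","e","H")
#becomes "(water,e) >=< (ethanol, H)"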
function __assoc_string(pair)
"($(pair[1]),$(pair[3])) >=< ($(pair[2]), $(pair[4]))"
end
function __verbose_findparams_start(filepath,components,headerparams,parsegroups,csvtype,grouptype)
csv_string = Symbol(csvtype)
no_parsegroups = parsegroups == :off
if no_parsegroups
if csvtype == groupdata
@info("Skipping $csv_string csv $filepath")
else
@info("Searching for $csv_string headers $headerparams for query $components at $filepath ...")
end
else
if csvtype == groupdata
@info("Searching for groups for components $components at $filepath ...")
else
@info("Skipping $csv_string csv $filepath")
end
end
if grouptype != :unknown
@info("group type: $grouptype")
end
end
function __verbose_findparams_not_found(notfoundvalues)
for k ∈ keys(notfoundvalues)
kk = info_color(k)
@info("No data found for $kk")
end
end
function __verbose_findparams_found(foundvalues)
for v ∈ foundvalues
if v.type == singledata
io = IOBuffer()
show_pairs(io,first.(v.component_info),v.data," => ",quote_string = false)
vals = String(take!(io))
kk = info_color(v.name)
@info("""Found single component data: $kk with values:
$vals
""")
elseif v.type == pairdata
io = IOBuffer()
show_pairs(io,first.(v.component_info,2),v.data," => ",quote_string = false)
vals = String(take!(io))
kk = info_color(v.name)
@info("""Found pair component data: $kk with values:
$vals
""")
elseif v.type == assocdata
io = IOBuffer()
show_pairs(io,__assoc_string.(v.component_info),v.data," => ",quote_string = false)
vals = String(take!(io))
kk = info_color(v.name)
@info("""Found association component data: $kk with values:
$vals
""")
elseif v.type == groupdata
#println(val) for val ∈ v.data)
vals = "Dict("
for (pair,val) ∈ zip(v.component_info,v.data)
pairi = pair[1] * " => " * string(val)
vals = vals * pairi * ", "
end
vals = chop(vals,tail=2) *")"
@info("""Found group data:
$vals
""")
#@info("TODO: parse intragroup data for debug")
end
end
end
const readcsvtype_keywords = ["like", "single", "unlike", "pair", "assoc", "association", "group", "groups","intragroup","intragroups"]
function read_csv_options(filepath)
return _read_csv_options(getline(String(filepath), 2))
end
function _read_csv_options(line::String)
re = r"\[.*\]"
maybe_opts = match(re,line)
# Searches for type from second line of CSV.
has_csv_options = !isnothing(maybe_opts)
if has_csv_options
opts = chop(maybe_opts.match,head = 1,tail = 1)
return __get_options(opts)
else
keywords = readcsvtype_keywords
words = split(lowercase(strip(line, ',')), ' ')
foundkeywords = intersect(words, keywords)
_species = intersect(words,["species"])
_estimator = intersect(words,["method"])
return (csvtype = _readcsvtype(foundkeywords),grouptype = :unknown,estimator = _estimator, species = _species,sep = :comma)
end
end
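#illustrative example of the options parsing above, assuming _parse_kv trims whitespace:
#_read_csv_options("my data [csvtype = like,grouptype = myGroups]") returns
#(csvtype = singledata, grouptype = :myGroups, estimator = :error, species = ["all"], sep = :comma)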
const NT_CSV_OPTIONS = (csvtype = namedtupledata,grouptype = :unknown,estimator = :no_estimator, species = ["all"],sep = :comma)
function _readcsvtype(collection)
length(collection) != 1 && return invaliddata
key = only(collection)
return _readcsvtype(key)
end
function _readcsvtype(key::AbstractString)
key == "single" && return singledata
key == "like" && return singledata
key == "pair" && return pairdata
key == "unlike" && return pairdata
key == "assoc" && return assocdata
key == "group" && return groupdata
key == "groups" && return groupdata
key == "intragroup" && return groupdata
key == "intragroups" && return groupdata
key == "invalid" && return invaliddata
return invaliddata
end
function __get_options(data)
opts = eachsplit(data,',')
opts_dict = Dict{String,String}()
for opt in opts
k,v = _parse_kv(opt,"=")
opts_dict[k] = v
end
_csvtype = _readcsvtype(get(opts_dict,"csvtype","invalid"))
_grouptype = Symbol(get(opts_dict,"grouptype","unknown"))
_estimator = Symbol(get(opts_dict,"method","error"))
_species = String.(split(get(opts_dict,"species","all")," "))
_sep = Symbol(get(opts_dict,"sep","comma"))
return (csvtype = _csvtype,grouptype = _grouptype,estimator = _estimator, species = _species, sep = _sep)
end
function valid_headerparams(csvheaders, options::ParamOptions = DefaultOptions)
ignorelist = deepcopy(options.ignore_headers)
push!(ignorelist,options.species_columnreference)
push!(ignorelist,options.source_columnreference)
push!(ignorelist,options.site_columnreference)
map!(normalisestring,ignorelist,ignorelist)
result = filter(csvheaders) do header
norm_header = normalisestring(header)
#that regex allows us to ignore things like "species1" or "SMILES1"
normalisestring(norm_header; tofilter=r"[ \-\_\d]") ∉ ignorelist
end
end
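#e.g. with default options, valid_headerparams(["species","a","source","site1"]) should
#return ["a"]: the species/source columns and the numbered site column are filtered out.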
include("database_group.jl")
include("database_writer.jl")
export getparams