-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnstandr.src.org
11963 lines (10806 loc) · 699 KB
/
nstandr.src.org
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#+title: Name Standardization in R
#+author: Stas Vlasov
#+email: [email protected]
#+r-pkg-version: 0.0.0.9000
#+r-pkg-url: https://stasvlasov.github.io/nstandr/
#+r-pkg-bug-reports: https://github.com/stasvlasov/nstandr/issues
#+PROPERTY: header-args:R :comments link :session :mkdirp yes :padline yes
# - Reference from [[associate-id:org:29i4s7t0y5i0][nstandr R package]] on [2021-06-05 Sat 22:51]
* Frontpage
:PROPERTIES:
:export_file_name: README.md
:export_options: toc:nil
:export_options+: author:nil
:export_options+: title:nil
:END:
#+begin_export markdown
# nstandr <img src="img/logo.png" align="right" alt="" width="120"/>
#+end_export
#+begin: md-badges :codecov-token OQVJ7NRXO5
#+begin_export markdown
[![R-CMD-check](https://github.com/stasvlasov/nstandr/workflows/R-CMD-check/badge.svg)](https://github.com/stasvlasov/nstandr/actions)
[![codecov](https://codecov.io/gh/stasvlasov/nstandr/branch/master/graph/badge.svg?token=OQVJ7NRXO5)](https://codecov.io/gh/stasvlasov/nstandr)
![GitHub code size in bytes](https://img.shields.io/github/languages/code-size/stasvlasov/nstandr)
#+end_export
#+end:
#+name: r-pkg-description
#+begin_src org :exports results :results replace
(Organizational) Names STANDardization in R. Reproduces procedures described in Thoma et al. (2010), Magerman et al. (2006), Cockburn et al. (2009), Wasi & Flaaen (2015) and more.
#+end_src
#+RESULTS: r-pkg-description
A package that does (Organizational) Names STANDardization in R.
=nstandr= reproduces procedures described in Thoma et al. (2010), Magerman et al. (2006), Cockburn et al. (2009), Wasi & Flaaen (2015) and more.
*** Installation
#+BEGIN_SRC R
devtools::install_github("stasvlasov/nstandr")
#+END_SRC
*** Usage
The package provides its main function =standardize=. The function expect character vector of organization names as input and returns its standardized version.
For the standardization methods described in Magerman et al. (2006) and Cockburn et al. (2009) you can use =standardize_magerman= and =standardize_cockburn= respectively. These functions are similar to =standardize(x, procedures=nstandr:::magerman_procedures_list))= and =standardize(x, procedures=nstandr:::cockburn_procedures_list))= but with additional options for tweaking original procedures and with more documentation.
Here is an example of =standardize_magerman= usage
#+BEGIN_SRC R
textConnection("SGS-THOMSON MICROELECTRONICS
S.G.S. THOMSON MICROELECTRONICS S.R.L.
S.G.S. THOMSON MICROELECTRONICS, S.R.L.
S.G.S.-THOMSON MICROELECTRONICS S.R.L.
SGS - THOMSON MICROELECTRONICS S.A.
SGS - THOMSON MICROELECTRONICS S.R.L.
SGS - THOMSON MICROELECTRONICS, INC.
SGS - THOMSON MICROELECTRONICS, S.R.L.
SGS THOMSON MICROELECTRONICS S.A.
SGS THOMSON MICROELECTRONICS S.R.L.
SGS THOMSON MICROELECTRONICS SA
SGS THOMSON MICROELECTRONICS SRL
SGS THOMSON MICROELECTRONICS, INC.
SGS THOMSON MICROELECTRONICS, S.A.
SGS- THOMSON MICROELECTRONICS, S.A.
SGS THOMSON MICROELECTRONICS, S.R.L.
SGS- THOMSON MICROELECTRONICS<BR>(PTE) LTD.
SGS THOMSON-MICROELECTRONICS SA
SGS-THOMSON MICROELECTRONIC S.A.
SGS-THOMSON MICROELECTRONICS
SGS-THOMSON MICROELECTRONICS GMBH
SGS-THOMSON MICROELECTRONICS INC.
SGS-THOMSON MICROELECTRONICS LIMITED
SGS-THOMSON MICROELECTRONICS LTD.
SGS-THOMSON MICROELECTRONICS PTE LTD
SGS-THOMSON MICROELECTRONICS PTE LTD.
SGS-THOMSON MICROELECTRONICS PTE. LIMITED
SGS-THOMSON MICROELECTRONICS PTE. LTD.
SGS-THOMSON MICROELECTRONICS S. R. L.
SGS-THOMSON MICROELECTRONICS S.A
SGS-THOMSON MICROELECTRONICS S.A.
SGS-THOMSON MICROELECTRONICS S.P.A.
SGS-THOMSON MICROELECTRONICS S.R. L.
SGS-THOMSON MICROELECTRONICS S.R.L
SGS-THOMSON MICROELECTRONICS S.R.L.
SGS--THOMSON MICROELECTRONICS S.R.L.
SGS-THOMSON MICROELECTRONICS SA
SGS-THOMSON MICROELECTRONICS SPA
SGS-THOMSON MICROELECTRONICS SRL
SGS-THOMSON MICROELECTRONICS SRL.
SGS-THOMSON MICROELECTRONICS, GMBH
SGS-THOMSON MICROELECTRONICS, INC
SGS-THOMSON MICROELECTRONICS, INC.
SGS-THOMSON MICROELECTRONICS, LTD.
SGS-THOMSON MICROELECTRONICS, PTE LTD.
SGS-THOMSON MICROELECTRONICS, S.A.
SGS-THOMSON MICROELECTRONICS, S.R.L.
SGS-THOMSON MICROELECTRONICS, S.RL
SGS-THOMSON MICROELECTRONICS, SA
SGS-THOMSON MICROELECTRONICS, SA.
SGS-THOMSON MICROELECTRONICS, SRL
SGS-THOMSON MICROELECTRONICS,S.R.L.") |>
readLines() |>
standardize_magerman(output_placement = "append_to_x")
#
# Applying standardization procedures:
# -----------------------------------------------------------------
#
# * Upper casing DONE
# * Cleaning spaces DONE
# * Removing HTML codes DONE
# * Cleaning spaces (2) DONE
# * Replacing SGML coded characters DONE
# * Replacing proprietary characters DONE
# * Detecting Umlauts DONE
# * Replacing accented characters DONE
# * Removing special characters DONE
# * Fixing quotation irregularities DONE
# * Removing double quotations DONE
# * Removing non alphanumeric characters (1) DONE
# * Removing non alphanumeric characters (2) DONE
# * Fixing comma and period irregularities DONE
# * Removing legal form DONE
# * Removing common words DONE
# * Fixing spelling variations DONE
# * Condensing DONE
# * Fixing umlaut variations DONE
#
# -----------------------------------------------------------------
# Standardization is done!
#
# x std_x
# 1: SGS-THOMSON MICROELECTRONICS SGSTHOMSONMICROELECTRONIC
# 2: S.G.S. THOMSON MICROELECTRONICS S.R.L. SGSTHOMSONMICROELECTRONIC
# 3: S.G.S. THOMSON MICROELECTRONICS, S.R.L. SGSTHOMSONMICROELECTRONIC
# 4: S.G.S.-THOMSON MICROELECTRONICS S.R.L. SGSTHOMSONMICROELECTRONIC
# 5: SGS - THOMSON MICROELECTRONICS S.A. SGSTHOMSONMICROELECTRONIC
# 6: SGS - THOMSON MICROELECTRONICS S.R.L. SGSTHOMSONMICROELECTRONIC
# 7: SGS - THOMSON MICROELECTRONICS, INC. SGSTHOMSONMICROELECTRONIC
# 8: SGS - THOMSON MICROELECTRONICS, S.R.L. SGSTHOMSONMICROELECTRONIC
# 9: SGS THOMSON MICROELECTRONICS S.A. SGSTHOMSONMICROELECTRONIC
# 10: SGS THOMSON MICROELECTRONICS S.R.L. SGSTHOMSONMICROELECTRONIC
# 11: SGS THOMSON MICROELECTRONICS SA SGSTHOMSONMICROELECTRONIC
# 12: SGS THOMSON MICROELECTRONICS SRL SGSTHOMSONMICROELECTRONIC
# 13: SGS THOMSON MICROELECTRONICS, INC. SGSTHOMSONMICROELECTRONIC
# 14: SGS THOMSON MICROELECTRONICS, S.A. SGSTHOMSONMICROELECTRONIC
# 15: SGS- THOMSON MICROELECTRONICS, S.A. SGSTHOMSONMICROELECTRONIC
# 16: SGS THOMSON MICROELECTRONICS, S.R.L. SGSTHOMSONMICROELECTRONIC
# 17: SGS- THOMSON MICROELECTRONICS<BR>(PTE) LTD. SGSTHOMSONMICROELECTRONIC
# 18: SGS THOMSON-MICROELECTRONICS SA SGSTHOMSONMICROELECTRONIC
# 19: SGS-THOMSON MICROELECTRONIC S.A. SGSTHOMSONMICROELECTRONIC
# 20: SGS-THOMSON MICROELECTRONICS SGSTHOMSONMICROELECTRONIC
# 21: SGS-THOMSON MICROELECTRONICS GMBH SGSTHOMSONMICROELECTRONIC
# 22: SGS-THOMSON MICROELECTRONICS INC. SGSTHOMSONMICROELECTRONIC
# 23: SGS-THOMSON MICROELECTRONICS LIMITED SGSTHOMSONMICROELECTRONIC
# 24: SGS-THOMSON MICROELECTRONICS LTD. SGSTHOMSONMICROELECTRONIC
# 25: SGS-THOMSON MICROELECTRONICS PTE LTD SGSTHOMSONMICROELECTRONIC
# 26: SGS-THOMSON MICROELECTRONICS PTE LTD. SGSTHOMSONMICROELECTRONIC
# 27: SGS-THOMSON MICROELECTRONICS PTE. LIMITED SGSTHOMSONMICROELECTRONIC
# 28: SGS-THOMSON MICROELECTRONICS PTE. LTD. SGSTHOMSONMICROELECTRONIC
# 29: SGS-THOMSON MICROELECTRONICS S. R. L. SGSTHOMSONMICROELECTRONIC
# 30: SGS-THOMSON MICROELECTRONICS S.A SGSTHOMSONMICROELECTRONIC
# 31: SGS-THOMSON MICROELECTRONICS S.A. SGSTHOMSONMICROELECTRONIC
# 32: SGS-THOMSON MICROELECTRONICS S.P.A. SGSTHOMSONMICROELECTRONIC
# 33: SGS-THOMSON MICROELECTRONICS S.R. L. SGSTHOMSONMICROELECTRONIC
# 34: SGS-THOMSON MICROELECTRONICS S.R.L SGSTHOMSONMICROELECTRONIC
# 35: SGS-THOMSON MICROELECTRONICS S.R.L. SGSTHOMSONMICROELECTRONIC
# 36: SGS--THOMSON MICROELECTRONICS S.R.L. SGSTHOMSONMICROELECTRONIC
# 37: SGS-THOMSON MICROELECTRONICS SA SGSTHOMSONMICROELECTRONIC
# 38: SGS-THOMSON MICROELECTRONICS SPA SGSTHOMSONMICROELECTRONIC
# 39: SGS-THOMSON MICROELECTRONICS SRL SGSTHOMSONMICROELECTRONIC
# 40: SGS-THOMSON MICROELECTRONICS SRL. SGSTHOMSONMICROELECTRONIC
# 41: SGS-THOMSON MICROELECTRONICS, GMBH SGSTHOMSONMICROELECTRONIC
# 42: SGS-THOMSON MICROELECTRONICS, INC SGSTHOMSONMICROELECTRONIC
# 43: SGS-THOMSON MICROELECTRONICS, INC. SGSTHOMSONMICROELECTRONIC
# 44: SGS-THOMSON MICROELECTRONICS, LTD. SGSTHOMSONMICROELECTRONIC
# 45: SGS-THOMSON MICROELECTRONICS, PTE LTD. SGSTHOMSONMICROELECTRONIC
# 46: SGS-THOMSON MICROELECTRONICS, S.A. SGSTHOMSONMICROELECTRONIC
# 47: SGS-THOMSON MICROELECTRONICS, S.R.L. SGSTHOMSONMICROELECTRONIC
# 48: SGS-THOMSON MICROELECTRONICS, S.RL SGSTHOMSONMICROELECTRONIC
# 49: SGS-THOMSON MICROELECTRONICS, SA SGSTHOMSONMICROELECTRONIC
# 50: SGS-THOMSON MICROELECTRONICS, SA. SGSTHOMSONMICROELECTRONIC
# 51: SGS-THOMSON MICROELECTRONICS, SRL SGSTHOMSONMICROELECTRONIC
# 52: SGS-THOMSON MICROELECTRONICS,S.R.L. SGSTHOMSONMICROELECTRONIC
# x std_x
#+END_SRC
** References
Magerman, T., Looy, V., Bart, & Song, X. (2006). /Data Production Methods for Harmonized Patent Statistics: Patentee Name Standardization/ (SSRN Scholarly Paper No. ID 944470). Rochester, NY: Social Science Research Network. Retrieved from http://papers.ssrn.com/abstract=944470
Thoma, G., Torrisi, S., Gambardella, A., Guellec, D., Hall, B. H., & Harhoff, D. (2010). Harmonizing and combining large datasets - an application to firm-level patent and accounting data. /National Bureau of Economic Research Working Paper Series/, (15851). Retrieved from http://www.nber.org/papers/w15851 http://www.nber.org/papers/w15851.pdf
Wasi, N., & Flaaen, A. (2015). Record linkage using Stata: Preprocessing, linking, and reviewing utilities. The Stata Journal, 15(3), 672-697. Retrieved from https://ebp-projects.isr.umich.edu/NCRN/papers/wasi_flaaen_statarecordlinkageutilities.pdf
*** Dependencies
#+caption: Hard dependencies (=Depends= field in =DESCRIPTION= file)
#+name: ob-r-pkg-table-depends
| name | version | comment |
|-------+---------+-------------------------------------------|
| [[https://www.r-project.org/][R]] | 4.2.0 | minimum R version to enable native piping |
**** Required packages
#+caption: Required packages (=Imports= field in the =DESCRIPTION= file)
#+name: ob-r-pkg-table-imports
| name | version | comment |
|------------+---------+-----------------------------------------------------------|
| [[https://rdatatable.gitlab.io/data.table/][data.table]] | | fast data.frames, used as main input and output data type |
| [[https://stringi.gagolewski.com/][stringi]] | | fast string manipulations |
| [[https://xml2.r-lib.org/][xml2]] | | cleaning web syntax |
| [[https://mllg.github.io/checkmate/][checkmate]] | | function arguments checker, ensures stability |
#+TBLFM: $2='(org-sbe get-package-vesion (pkg $$1))
**** Suggested packages
#+caption: Suggested packages (=Suggests= field in the =DESCRIPTION= file)
#+name: ob-r-pkg-table-suggests
| name | version | comment |
|------------+---------+------------------------------------------------------------------------|
| [[https://github.com/markvanderloo/tinytest/blob/master/pkg/README.md][tinytest]] | | package development (unit testing) |
| [[https://cran.r-project.org/web/packages/fastmatch/index.html][fastmatch]] | | can speed things up |
| [[https://rstudio.github.io/htmltools/index.html][htmltools]] | | used for escaping html in procedures descriptions before visualization |
| [[http://rich-iannone.github.io/DiagrammeR/docs.html][DiagrammeR]] | | needed for visualizing procedures lists |
#+TBLFM: $2='(org-sbe get-package-vesion (pkg $$1))
**** Development dependencies and tools
These packages are used for developing and building ~nstandr~
#+caption: Useful packages for development
#+name: ob-r-pkg-table-suggests-dev
| name | version | comment |
|----------------+---------+-------------------------------|
| [[https://devtools.r-lib.org/][devtools]] | | builds the package |
| [[https://roxygen2.r-lib.org/][roxygen2]] | | makes docs |
| [[https://github.com/REditorSupport/languageserver][languageserver]] | | provides some IDE consistency |
| [[https://usethis.r-lib.org/][usethis]] | | repo utils |
| [[https://moodymudskipper.github.io/boomer][boomer]] | | can be used for debugging |
#+TBLFM: $2='(org-sbe get-package-vesion (pkg $$1))
* Citing
#+begin_src latex
@misc{Vlasov2022,
author = {Vlasov, Stanislav A.},
title = {nstandr - an R package that makes standardized organizational names.},
year = {2020},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/stasvlasov/nstandr}},
commit = {10293d9d5f0687b5b7dc784d8a75942b7175f4a1}
}
#+end_src
#+BEGIN_SRC yaml :tangle info.yml :noweb yes
type: software
ref: nstandr
lang: R
title: <<ob-r-pkg-utils-get-macro-by-keyword(key = "title")>>
author: <<ob-r-pkg-utils-get-macro-by-keyword(key = "author")>>
date: <<ob-r-pkg-utils-get-macro-by-keyword(key = "date")>>
version: <<ob-r-pkg-utils-get-macro-by-keyword(key = "r_package_version")>>
url: <<ob-r-pkg-utils-get-macro-by-keyword(key = "r_package_url")>>
publisher: GitHub
commit:
implements:
- ref: magerman2006_std_names
- ref: dm0082 # stata package
- ref: nber_pdp_stdname
#+END_SRC
* Development
For this package development I use Emacs' org-mode that allows literate programming. Literate programming in this case provides ways to do the following:
- Use tables of internal data for standardization in human readable format and convert it to .rda when deploying package
- Write function definition and tests next to other but tangle it from .org file to different directories and .r files on deployment
** Naming Convention
- All functions should have a verb
- Variables should not have a verb
- Everything that is available to user should start with either nstandr (data) or standardize (functions)
- Internal functions and variables does not have to have this prefix
- Second part of the name is a source if the function or a data is taken from this source (e.g. standardize.magerman.remove.common.words)
- The rest is the name of the function or a variable as precise and informative as possible
- Data for string substitutions, removals or pattern recognition should be named as "patterns" (e.g., nstandr.magerman.patterns.special.characters)
* Code
** Package documentation
:PROPERTIES:
:ID: org:g5wa69d1ffi0
:END:
#+BEGIN_SRC R :tangle R/nstandr.r :noweb yes
#' @details
#' <<r-pkg-description>>
#' @keywords internal
"_PACKAGE"
#+END_SRC
** get_dots (get from external package)
:PROPERTIES:
:ID: org:6lik3kf0kcj0
:END:
[[https://github.com/stasvlasov/dots][GitHub - stasvlasov/dots: An alternative way to interact with "..." dots arguments (aka ellipses).]]
#+BEGIN_SRC R :tangle R/0_get_dots.r :noweb yes
<<ob-r-pkg-utils-get-external-src(org-file="~/org/research/dots/dots.src.org",src-block="get_dots")>>
#+END_SRC
** basic procedures
*** standardize_toupper
:PROPERTIES:
:ID: org:xys0f8s0lei0
:END:
#+BEGIN_SRC R :tangle R/nstandr.r
##' @eval attr(standardize_toupper, "description")
##'
##' @param x data
##'
##' @inheritDotParams standardize_options
##'
##' @return updated data (as data.table)
##' @export
standardize_toupper <- function(x, ...) {
get_target(x) |>
toupper() |>
inset_target(x)
}
attr(standardize_toupper, "description") <-
"Uppercases vector of interest in the object (table)"
#+END_SRC
#+BEGIN_SRC R :tangle inst/tinytest/test_standardize_toupper.r
expect_equal(data.table(name = c("MÄKARÖNI ETÖ FKÜSNÖ Ltd"
, "MSLab CÖ. <a href=lsdldf> <br> <\\a>"
, "MSLab Co."
, "MSLaeb Comp."
, "MSLab Comp."
, "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ")
, foo = "lalala" ) |>
standardize_toupper(col = 2, name = "bar")
, structure(list(name = c("MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ"
), foo = c("LALALA", "LALALA", "LALALA", "LALALA", "LALALA",
"LALALA")), row.names = c(NA, -6L), class = c("data.table", "data.frame"
)))
#+END_SRC
*** standardize_remove_brackets
:PROPERTIES:
:ID: org:9ew0f8s0lei0
:END:
#+BEGIN_SRC R :tangle R/nstandr.r
##' @eval attr(standardize_remove_brackets, "@title")
##' @param x object (table)
##' @inheritDotParams standardize_options
##' @return updated object
##'
##' @export
standardize_remove_brackets <- function(x, ...) {
get_target(x) |>
stringi::stri_replace_all_regex("<[^<>]*>|\\([^()]*\\)|\\{[^{}]*\\}|\\[[^\\[\\]]*\\]", "") |>
inset_target(x)
}
attr(standardize_remove_brackets, "@title") <- "Removes brackets and content in brackets"
#+END_SRC
#+BEGIN_SRC R :tangle inst/tinytest/test_standardize_remove_brackets.r
## remove.brackets breaks the encoding (so it is better to apply decoding first)
expect_equal(standardize_remove_brackets("fa\xE7ile (lalala) lkj (sdfs) AAA [sdf]")
, "fa�ile lkj AAA ")
expect_equal(standardize_remove_brackets("fa7ile (lalala) lkj (sdfs) AAA [sdf]")
, "fa7ile lkj AAA ")
#+END_SRC
*** standardize_remove_quotes
:PROPERTIES:
:ID: org:4vz0f8s0lei0
:END:
#+BEGIN_SRC R :tangle R/nstandr.r
##' Removes double quotes
##'
##' @param x an object
##' @inheritDotParams standardize_options
##' @return updated object
##' @export
standardize_remove_quotes <- function(x, ...) {
get_target(x) |>
stringi::stri_replace_all_regex("\"", "") |>
inset_target(x)
}
#+END_SRC
*** standardize_dehtmlize
:PROPERTIES:
:ID: org:4tffib50bci0
:END:
#+BEGIN_SRC R :tangle R/nstandr.r
#' Converts HTML characters to UTF-8
#'
#' The method is about 1/3 faster than htmlParse but it is still quite slow
#' @param x object (table)
#' @param as_single_string If set then collapse characters in the main column of the `x` (i.e., `x.col`) as to a single string. It will increase performance (at least for relatively short tables). Default is FALSE
#' @param as_single_string_sep delimiter for collapsed strings to uncollapse it later. Default is "#_|".
#' @param use_read_xml If set the it will parse XML. Default is FALSE which means it parses HTML
#' @inheritDotParams standardize_options
#' @return updated object
#' @references http://stackoverflow.com/questions/5060076
#'
#' @export
standardize_dehtmlize <- function(x
, as_single_string = FALSE
, as_single_string_sep = "#_|"
, use_read_xml = FALSE
, ...) {
x_vector <- get_target(x)
if(as_single_string) {
x_vector <- paste0(x_vector, collapse = as_single_string_sep)
x_vector <- paste0("<x>", x_vector, "</x>")
x_vector <-
(if(use_read_xml) {
xml2::read_xml(x_vector)
} else {
xml2::read_html(x_vector)
}) |> xml2::xml_text()
strsplit(x_vector, as_single_string_sep, fixed = TRUE)[[1]]
} else {
sapply(x_vector, \(str) {
str <- paste0("<x>", str, "</x>")
(if(use_read_xml) {
xml2::read_xml(str)
} else {
xml2::read_html(str)
}) |> xml2::xml_text()
}, USE.NAMES = FALSE)
} |> inset_target(x)
}
#+END_SRC
#+BEGIN_SRC R :tangle inst/tinytest/test_standardize_dehtmlize.r
expect_equal(c("abcd", "& ' >", "&", "€ <") |>
standardize_dehtmlize()
, c("abcd", "& ' >", "&", "€ <"))
#+END_SRC
*** standardize_detect_enc
:PROPERTIES:
:ID: org:e2bfib50bci0
:END:
#+BEGIN_SRC R :tangle R/nstandr.r
#' Detects string encoding
#' @param x object
#' @param output_codes_col_name Same as in [detect_patterns()]
#' @param return_only_codes Same as in [detect_patterns()]
#' @param ...
#' @inheritDotParams standardize_options
#' @return updated object
#'
#' @export
standardize_detect_enc <- function(x
, output_codes_col_name = "{col_name_}encoding"
, return_only_codes = FALSE
, ...) {
available_enc_list <- iconvlist()
x_vector <- get_target(x) |>
stringi::stri_enc_detect() |>
lapply(function(enc) {
enc <- toupper(enc[["Encoding"]])
first_ok_enc <- which(enc %in% available_enc_list)[1]
if(length(first_ok_enc) == 0) ""
else enc[[first_ok_enc]]
}) |> unlist()
if(return_only_codes) {
x_vector
} else {
inset_target(x_vector
, x
, output_placement = "omit"
, output_copy_col_name = output_codes_col_name
, append_output_copy = TRUE)
}
}
#+END_SRC
#+BEGIN_SRC R :tangle inst/tinytest/test_standardize_detect_enc.r
## This fails depending on platform (Windows) on R release.
## expect_equal(data.table::data.table(data = c("FAÇILE"
## , "fa\xE7ile"
## , "c\u00b5c\u00b5ber")
## , coffee = "Yes, please!") |>
## standardize_detect_enc(return_only_codes = TRUE)
## , c("WINDOWS-1254", "ISO-8859-9", "UTF-8"))
## ## TODO col naming
## expect_equal(c("FAÇILE"
## , "fa\xE7ile"
## , "c\u00b5c\u00b5ber") |>
## standardize_detect_enc()
## , structure(list(x = c("FAÇILE", "fa\xe7ile", "cµcµber"), x_encoding = c("WINDOWS-1254",
## "ISO-8859-9", "UTF-8")), row.names = c(NA, -3L), class = c("data.table",
## "data.frame")))
## expect_equal(data.table::data.table(data = c("FAÇILE"
## , "fa\xE7ile"
## , "c\u00b5c\u00b5ber")
## , coffee = "Yes, please!") |>
## standardize_detect_enc(output_codes_col_name = "{col_name}_lala")
## , structure(list(data = c("FAÇILE", "fa\xe7ile", "cµcµber"),
## coffee = c("Yes, please!", "Yes, please!", "Yes, please!"
## ), data_lala = c("WINDOWS-1254", "ISO-8859-9", "UTF-8")), row.names = c(NA,
## -3L), class = c("data.table", "data.frame")))
#+END_SRC
*** standardize_toascii
:PROPERTIES:
:ID: org:mzn0tpb0wei0
:END:
#+BEGIN_SRC R :tangle R/nstandr.r
#' Translates non-ascii symbols to its ascii equivalent
#'
#' @param x String to translate
#' @param detect_encoding Detect encoding of individual elements (slower). Allows to work with mixed encodings.
#' @inheritDotParams standardize_options
#'
#' @export
standardize_toascii <- function(x
, detect_encoding = FALSE
, ...) {
str <- get_target(x)
utf <- nstandr_patterns_ascii$utf |> paste(collapse = "")
ascii <- nstandr_patterns_ascii$ascii |> paste(collapse = "")
(if(detect_encoding) {
mapply(
\(name, enc) chartr(utf, ascii, iconv(name, from = enc, to = "UTF-8", sub = ""))
, str
, standardize_detect_enc(str, return_only_codes = TRUE)
, SIMPLIFY = FALSE, USE.NAMES = FALSE) |>
unlist() |>
iconv(to = "ASCII", sub = "")
} else {
## stringi::stri_enc_toascii(str)
chartr(utf, ascii, enc2utf8(str)) |>
iconv(to = "ASCII", sub = "")
}) |> inset_target(x)
}
#+END_SRC
#+BEGIN_SRC R :tangle inst/tinytest/test_standardize_toascii.r
## This fails depending on platform (Windows) on R release.
## expect_equal( c("FAÇILE"
## , "fa\xE7ile"
## , "c\u00b5c\u00b5ber") |>
## data.table::data.table("coffee") |>
## standardize_toascii(detect_encoding = TRUE)
## , structure(list(V1 = c("FAAILE", "facile", "cucuber"), V2 = c("coffee",
## "coffee", "coffee")), row.names = c(NA, -3L), class = c("data.table",
## "data.frame")))
#+END_SRC
**** ASCII equivalents table
A table for converting ~ŠŒŽšœžŸ¥µÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýÿ~ string to ~SOZsozYYuAAAAAAACEEEEIIIIDNOOOOOOUUUUYsaaaaaaaceeeeiiiionoooooouuuuyy~
#+NAME: ob-r-pkg-data-nstandr-patterns-ascii
| utf | ascii |
|-----+-------|
| Š | S |
| Œ | O |
| Ž | Z |
| š | s |
| œ | o |
| ž | z |
| Ÿ | Y |
| ¥ | Y |
| µ | u |
| À | A |
| Á | A |
| Â | A |
| Ã | A |
| Ä | A |
| Å | A |
| Æ | A |
| Ç | C |
| È | E |
| É | E |
| Ê | E |
| Ë | E |
| Ì | I |
| Í | I |
| Î | I |
| Ï | I |
| Ð | D |
| Ñ | N |
| Ò | O |
| Ó | O |
| Ô | O |
| Õ | O |
| Ö | O |
| Ø | O |
| Ù | U |
| Ú | U |
| Û | U |
| Ü | U |
| Ý | Y |
| ß | s |
| à | a |
| á | a |
| â | a |
| ã | a |
| ä | a |
| å | a |
| æ | a |
| ç | c |
| è | e |
| é | e |
| ê | e |
| ë | e |
| ì | i |
| í | i |
| î | i |
| ï | i |
| ð | o |
| ñ | n |
| ò | o |
| ó | o |
| ô | o |
| õ | o |
| ö | o |
| ø | o |
| ù | u |
| ú | u |
| û | u |
| ü | u |
| ý | y |
| ÿ | y |
*** standardize_squish_spaces
:PROPERTIES:
:ID: org:dlp0f8s0lei0
:END:
#+BEGIN_SRC R :tangle R/standardize_squish_spaces.r
#' Removes redundant whitespases
#' @param x table or vector
#'
#' @param wrap_in_spaces If set then adds leaing and ending spaces. Default is FALSE.
#'
#' @inheritDotParams standardize_options
#'
#' @return updated table or vector
#' @export
standardize_squish_spaces <- function(x, wrap_in_spaces = FALSE, ...) {
get_target(x) |>
stringi::stri_replace_all_regex("\\s+", " ") |>
stringi::stri_trim_both() |> (
\(y) if(wrap_in_spaces) paste0(" ", y, " ") else y
)() |>
inset_target(x)
}
#+END_SRC
#+BEGIN_SRC R :tangle inst/tinytest/test_standardize_squish_spaces.r
expect_equal(standardize_squish_spaces(" String with trailing, middle, and leading white space\t"
, wrap_in_spaces = TRUE)
, " String with trailing, middle, and leading white space ")
expect_equal(standardize_squish_spaces("\n\nString with excess, trailing and leading white space\n\n"
, wrap_in_spaces = FALSE)
, "String with excess, trailing and leading white space")
#+END_SRC
** utils
*** standardize_make_procedures_list
:PROPERTIES:
:ID: org:i762gum0fqi0
:END:
#+name: standardize_make_procedures_list
#+BEGIN_SRC R :tangle R/0_standardize_make_procedures_list.r
##' Makes list of procedures calls from table.
##'
##' Table should have at least two columns - messages and fuctions
##' calls. Each function call should be a string of the following
##' format "'function.name', arg1 = val1, arg2 = val2" (same as
##' arguments for `do.call` function).
##'
##' @param procedures_table Table to use
##' @param message_field name of the column with messages that will be
##' displayed when each call is executed
##' @param function_call_field name of the column where function
##' (standardization procedures) calls are listed.
##' @param no_field name of the column where the number of procedure
##' is specified. Also this field indicates if the row in the
##' table is just a comment in which case it will be removed if
##' `remove_comments` is set (which is set by default)
##' @param remove_comments Whether to remove comments.
##' @param sort_by_no_field Whether to sort the list by col named
##' `no_field`
##' @param comments Values (character string) in the first col that
##' makes entire row as commented out
##' @return List of named function calls. Names are messages.
##'
standardize_make_procedures_list <- function(procedures_table
, message_field = "message"
, function_call_field = "function.call"
, no_field = "no"
, remove_comments = TRUE
, sort_by_no_field = TRUE
, comments = c("#", "-", "")) {
## procedures_table <- defactor(procedures_table)
if(remove_comments) {
procedures_table <-
procedures_table[
!(procedures_table[[no_field]] %in% comments)
, ]
}
if(sort_by_no_field) {
procedures_table <-
procedures_table[
procedures_table[[no_field]] |>
as.numeric() |>
order()
, ]
}
procedures <-
procedures_table[[function_call_field]] |> (
\(y) paste0("list(", y, ")")
)() |>
lapply(\(str) eval(parse(text = str))) |>
lapply(\(lst) if(length(lst) == 1) unlist(lst) else lst)
names(procedures) <- procedures_table[[message_field]]
return(procedures)
}
#+END_SRC
#+BEGIN_SRC R :tangle inst/tinytest/test_standardize_make_procedures_list.r
standardize_make_procedures_list <- nstandr:::standardize_make_procedures_list
expect_equal(data.frame(no = c(3,2,"-", "")
, message = c("hello", "world", "man", "dfsdf")
, function.call = c("'c', 1, b=3", "'sum', 8,8,9", "'version'", "")) |>
standardize_make_procedures_list()
, list(world = list("sum", 8, 8, 9), hello = list("c", 1, b = 3)))
#+END_SRC
*** standardize_x_split
:PROPERTIES:
:ID: org:ije1f8s0lei0
:END:
#+BEGIN_SRC R :tangle R/nstandr.r
##' Splits the object (table) in chunks by rows
##'
##' Convenient to apply some function to the table in chunks, e.g., if you want to add display of progress.
##'
##' @param x object or table
##' @param by number of rows to split by
##' @param len length of the table (nrow). If it is NULL then use x_length(x)
##'
##' @return List of (sub)tables
standardize_x_split <- function(x, by, len = NULL) {
if(is.null(len)) len <- x_length(x)
split(x, rep(seq(1, len %/% by +1)
, each = by
, length.out = len))
}
#+END_SRC
#+BEGIN_SRC R :tangle inst/tinytest/test_standardize_x_split.r
standardize_x_split <- nstandr:::standardize_x_split
expect_equal(data.table(name = c("MÄKARÖNI ETÖ FKÜSNÖ Ltd"
, "MSLab CÖ. <a href=lsdldf> <br> <\\a>"
, "MSLab Co."
, "MSLaeb Comp."
, "MSLab Comp."
, "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ") |>
rep(50)
, foo = "lalala" ) |>
standardize_x_split(10) |>
sapply(class)
, structure(c("data.table", "data.frame", "data.table", "data.frame",
"data.table", "data.frame", "data.table", "data.frame", "data.table",
"data.frame", "data.table", "data.frame", "data.table", "data.frame",
"data.table", "data.frame", "data.table", "data.frame", "data.table",
"data.frame", "data.table", "data.frame", "data.table", "data.frame",
"data.table", "data.frame", "data.table", "data.frame", "data.table",
"data.frame", "data.table", "data.frame", "data.table", "data.frame",
"data.table", "data.frame", "data.table", "data.frame", "data.table",
"data.frame", "data.table", "data.frame", "data.table", "data.frame",
"data.table", "data.frame", "data.table", "data.frame", "data.table",
"data.frame", "data.table", "data.frame", "data.table", "data.frame",
"data.table", "data.frame", "data.table", "data.frame", "data.table",
"data.frame"), .Dim = c(2L, 30L), .Dimnames = list(NULL, c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30"))))
expect_equal(c("MÄKARÖNI ETÖ FKÜSNÖ Ltd"
, "MSLab CÖ. <a href=lsdldf> <br> <\\a>"
, "MSLab Co."
, "MSLaeb Comp."
, "MSLab Comp."
, "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ") |>
rep(50) |>
standardize_x_split(10)
, list(`1` = c("MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp."), `2` = c("MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>"
), `3` = c("MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ"
), `4` = c("MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp."), `5` = c("MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>"
), `6` = c("MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ"
), `7` = c("MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp."), `8` = c("MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>"
), `9` = c("MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ"
), `10` = c("MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp."), `11` = c("MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>"
), `12` = c("MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ"
), `13` = c("MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp."), `14` = c("MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>"
), `15` = c("MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ"
), `16` = c("MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp."), `17` = c("MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>"
), `18` = c("MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ"
), `19` = c("MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp."), `20` = c("MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>"
), `21` = c("MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ"
), `22` = c("MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp."), `23` = c("MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>"
), `24` = c("MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ"
), `25` = c("MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp."), `26` = c("MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>"
), `27` = c("MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ"
), `28` = c("MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp."), `29` = c("MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>"
), `30` = c("MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ",
"MÄKARÖNI ETÖ FKÜSNÖ Ltd", "MSLab CÖ. <a href=lsdldf> <br> <\\a>",
"MSLab Co.", "MSLaeb Comp.", "MSLab Comp.", "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝŸ"
)))
#+END_SRC
*** unlist_if_possible
:PROPERTIES:
:ID: org:3ya1f8s0lei0
:END:
#+BEGIN_SRC R :tangle R/0_unlist_if_possible.r
##' If column in the `x` table is list unlist it if possible
##' @param x object
##' @param replace_zero_length_with Default is replace NULLs with NA_character_ because vector of just NA is a logical class
##' @param remove_empty_values remove NA, "", etc. from list elements. (see [standardize_omit_empty()])
##' @return updated object
##' @export
unlist_if_possible <- function(x
, replace_zero_length_with = NA_character_
, remove_empty_values = TRUE) {
if(is.list(x)) {
if(remove_empty_values) {
x <- lapply(x, standardize_omit_empty)
}
len <- sapply(x, length)
if(all(len == 1)) {
unlist(x, recursive = FALSE, use.names = FALSE)
} else if(all(len %in% 0:1)) {
x[len == 0] <- replace_zero_length_with
unlist(x, recursive = FALSE, use.names = FALSE)
} else {
return(x)
}
} else {
## assume that x is atomic
return(x)
}
}
#+END_SRC
#+BEGIN_SRC R :tangle inst/tinytest/test_unlist_if_possible.r
expect_equal(c(1,2,3,4) |> unlist_if_possible()