-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathwordsmith.rb
1023 lines (897 loc) · 36.8 KB
/
wordsmith.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env ruby
#Encoding: UTF-8
require 'ostruct'
require 'optparse'
require 'csv'
require 'yaml'
# Emit the startup banner: program name with version, the author credits and
# a trailing blank line. The version string is also stored in @version.
def title
  @version = "v2.1.1"
  banner = [
    "wordsmith #{@version}",
    "Written by: Sanjiv Kawa (@kawabungah) & Tom Porter (@porterhau5)",
    ""
  ]
  puts banner
end
# Cross-platform way of finding an executable in the $PATH.
#
# cmd - bare command name to look up (e.g. "cewl")
#
# Every PATH directory is tried in order. When PATHEXT is set (Windows) each
# of its extensions is appended in turn; otherwise the bare name is used.
#
# Returns the full path of the first match that is executable and not a
# directory, or nil when nothing matches or PATH is not set at all.
def which(cmd)
  exts = ENV['PATHEXT'] ? ENV['PATHEXT'].split(';') : ['']
  # ENV['PATH'] is nil when the variable is unset; treat that as "no directories"
  ENV.fetch('PATH', '').split(File::PATH_SEPARATOR).each do |path|
    exts.each do |ext|
      exe = File.join(path, "#{cmd}#{ext}")
      return exe if File.executable?(exe) && !File.directory?(exe)
    end
  end
  nil
end
# -E flag: print the long-form usage guide (input naming, attributes, basic
# usage, regions, output formatting, web scraping) and terminate the program.
def examples
  # one entry per printed line; entries starting with "\n" are preceded by a blank line
  guide = [
    "Input names:",
    "------------",
    "Valid inputs for wordsmith (using -I option) are based on nodes located in the \"data\" directory.",
    "The top-level nodes are countries labeled by their 3-letter ISO Country Code:",
    " data/gbr : Great Britain",
    " data/usa : United States",
    " data/deu : Germany",
    " etc.",
    "\nSome countries are divided into states, provinces, counties, or municipalities. These child nodes",
    "are nested beneath the parent:",
    " data/can/on : Ontario, Canada",
    " data/usa/nc/raleigh : Raleigh, NC, USA",
    " data/gbr/eng/sx/east_sussex : East Sussex, Sussex, England, Great Britain",
    "\nInputs for wordsmith use these node paths, but with a hyphen (-) delimiter:",
    " ruby wordsmith.rb -I gbr [options]",
    " ruby wordsmith.rb -I can-on [options]",
    " ruby wordsmith.rb -I usa-nc-raleigh [options]",
    "\nIf you prefer to not dig through the \"data\" directory looking for potential inputs or attributes,",
    "use -C to show children nodes for a given input:",
    " ruby wordsmith.rb -I all -C (show all potential nodes)",
    " ruby wordsmith.rb -I usa -C (show children nodes of USA)",
    " ruby wordsmith.rb -I gbr-eng -C (show children nodes of England)",
    "\nAlternatively, inputs can also be user-defined through the use of the \"regions.csv\" file. Wordsmith",
    "ships with a few regions already defined, such as:",
    " Continents : africa, asia, europe, etc. ",
    " US regions : southeast, newengland, greatlakes, etc.",
    " Unions/Assoc. : eu, asean, nafta, etc.",
    "\nThese region aliases can be found by inspecting \"regions.csv\" or by using the -R option:",
    " ruby wordsmith.rb -R",
    "\nBy supplying a number (N) to the -I option, inputs will instead be the N most populous countries.",
    "The below example will use the 10 most populous countries as input:",
    " ruby wordsmith.rb -I 10 [options]",
    "",
    "Attributes:",
    "-----------",
    "Each node may have one or more attributes, such as cities, roads, colleges, etc. Wordsmith will recurse",
    "every child node and grab data for the specified attribute. For example, the following syntax starts at",
    "the top-level \"usa\" node and recurses into every sub-directory looking for zip codes:",
    " ruby wordsmith.rb -I usa -z",
    "\nSome attributes are widely-used, such as roads (-r), cities (-c), and a handful of others. Use the -h",
    "option to see a listing of those attributes. Some attributes may be unique to an area and don't have a",
    "dedicated option. These can still be grabbed by using the -b option. The -b option will look for all",
    "txt files that are not one of attributes with a dedicated option. For example, if someone generated data",
    "for all of the lakes in Minnesota and placed it in \"data/usa/mn/lakes.txt\", this can be grabbed using:",
    " ruby wordsmith.rb -I usa-mn -b",
    "\nExtending wordsmith to incorporate new data is as simple as creating a \".txt\" file in the proper data",
    "directory. If you have data that you'd think would benefit other users, please connect with us on GitHub.",
    "",
    "Basic usage:",
    "------------",
    "Show all children nodes and attributes for Great Britain",
    " ruby wordsmith.rb -I gbr -C",
    "\nGrab all of the most popular names for USA",
    " ruby wordsmith.rb -I usa -N",
    "\nGrab all of the zip codes for California",
    " ruby wordsmith.rb -I usa-ca -z",
    "\nGrab all of the sports teams for Charlotte, NC, USA",
    " ruby wordsmith.rb -I usa-nc-charlotte -t -m",
    "\nGrab all of the landmarks for California, Montana, and Florida",
    " ruby wordsmith.rb -I usa-ca,usa-mt,usa-fl -l",
    "\nGrab all of the cities for the 25 most populous countries",
    " ruby wordsmith.rb -I 25 -c",
    "",
    "\nUsing regions:",
    "--------------",
    "Show regions defined in regions.csv",
    " ruby wordsmith.rb -R",
    "\nGrab all of the cities for the European Union",
    " ruby wordsmith.rb -I eu -c",
    "\nGrab all of the roads for New England (U.S.)",
    " ruby wordsmith.rb -I newengland -r",
    "",
    "\nOutput formatting:",
    "------------------",
    "Grab all colleges for California, mangle the output, convert to lowercase",
    " ruby wordsmith.rb -I usa-ca -f -m -j",
    "\nGrab all roads for England with a minimum character length of 8",
    " ruby wordsmith.rb -I gbr-eng -r -k 8",
    "\nGrab everything for Italy, write to file named italy.txt",
    " ruby wordsmith.rb -I ita -a -o italy.txt",
    "\nCreate a mega wordlist containing all countries with all options, quiet output, write to file named all.txt",
    " ruby wordsmith.rb -I all -a -m -q -o all.txt",
    "",
    "\nWeb scraping:",
    "-------------",
    "Run CeWL against https://www.popped.io, mangle the output",
    " ruby wordsmith.rb -d https://www.popped.io -m",
    "\nRun CeWL against list of URLs contained in urls.txt, write to file out.txt",
    " ruby wordsmith.rb -i urls.txt -m -o out.txt"
  ]
  puts guide
  exit
end
# Tree view of the data directory: for every input node print the node, each
# nested sub-directory, and the attribute (.txt file) names found at each
# level. Terminates the program after all inputs have been printed.
#
# inputs - array of data directory paths such as "./data/usa"
def show_children(inputs)
  # change "./data/abc/de/fegh" to "abc-de-fegh"
  to_label = lambda do |path|
    "#{path.sub(/^.\/data\//, '').gsub(/\//,'-').chomp("-")}"
  end
  # append " : " plus every .txt basename (extension dropped, each preceded
  # by a space) to the label; labels without files are returned untouched
  with_attributes = lambda do |label, txt_files|
    if txt_files.empty?
      label
    else
      txt_files.inject("#{label} : ") do |line, f|
        line + " " + f.split("/").last.split(".").first
      end
    end
  end

  puts "Format:"
  puts "boundary-name : attribute1 attribute2 attribute3 etc."
  inputs.each do |input|
    subdirs = Dir.glob("#{input}/**/*/").sort
    top_files = Dir.glob("#{input}/*.txt").sort
    # nothing to show for a node with neither children nor attributes
    next if subdirs.empty? && top_files.empty?

    puts ""
    root = to_label.call(input)
    puts "#{with_attributes.call(root, top_files)}\n"
    subdirs.each do |subdir|
      label = to_label.call(subdir)
      # nesting level relative to the input node drives the "| " indentation
      depth = label.count("-") - root.count("-")
      branch = "| " * (depth - 1) + "|-- " + label
      puts with_attributes.call(branch, Dir.glob("#{subdir}*.txt").sort)
    end
  end
  exit
end
# List every region defined in ./data/regions.csv (alias, description and
# members), print a hint on where to edit them, then terminate the program.
def show_regions
  rows = CSV.read('./data/regions.csv')
  rows.each do |row|
    # only well-formed, non-comment rows (exactly three cells) are listed
    next if row.length != 3 || row[0].start_with?('#')
    puts "Alias: #{row[0]}", "Description: #{row[1]}", "Members: #{row[2]}", ""
  end
  puts "[*] Regions can be modified by editing data/regions.csv"
  exit
end
@boundaries = []
# Check whether the user supplied input is a region alias and collect the
# concrete boundaries it stands for in @boundaries.
#
# regions  - rows parsed from regions.csv; usable rows are [alias, description, members]
# boundary - one user-supplied input (region alias or node name such as "usa-nc")
# seen     - lower-cased aliases already being expanded further up the call
#            chain; callers outside this method leave it at its default
#
# When boundary matches an alias (case-insensitively) every whitespace
# separated member is resolved recursively, so regions may be built from
# other regions. Anything that is not an alias is pushed onto @boundaries
# unchanged. An alias that refers back to itself is reported on stderr and
# skipped instead of recursing without end.
def is_region(regions, boundary, seen = [])
  found = false
  regions.each do |region|
    # skip comments and improperly-formatted lines in CSV
    next unless region.length == 3 && region[0] && !region[0].start_with?('#')
    # only rows whose alias is the provided input are of interest
    next unless boundary.casecmp(region[0]) == 0
    found = true
    if seen.include?(boundary.downcase)
      warn "[!] Region '#{boundary}' refers back to itself in regions.csv, skipping"
      next
    end
    # parse out locations from last CSV row element
    region[2].to_s.split.each do |member|
      is_region(regions, member, seen + [boundary.downcase])
    end
  end
  @boundaries.push(boundary) unless found
end
# Verify each provided input is legitimate and translate it into a data path.
#
# options - parsed command-line options; options.input holds the -I values
#
# Every input is first run through is_region (which fills @boundaries), then
# each resulting boundary such as "usa-nc" is walked node by node beneath
# ./data. The program aborts as soon as a node directory is missing.
#
# Returns the sorted, de-duplicated list of directories, e.g. "./data/usa/nc".
def validate_boundaries(options)
  regions = CSV.read('data/regions.csv')
  # boundaries is for elements to check in data/ path
  boundaries = []
  options.input.each do |boundary|
    is_region(regions, boundary)
    boundaries = @boundaries
  end

  # inputs is return array
  inputs = []
  # verify each boundary's data path can be found
  boundaries.each do |boundary|
    nodes = boundary.split("-")
    dir_path = "./data"
    nodes.each_with_index do |node, position|
      dir_path = "#{dir_path}/#{node.downcase()}"
      unless Dir.exist?(dir_path)
        puts "[!] Exiting - input \'#{boundary}\' not found!"
        abort
      end
      # only the deepest directory of the chain is an input
      inputs.push(dir_path) if position == nodes.length - 1
    end
  end
  return inputs.sort.uniq
end
# Run CeWL against the specified URL and feed the words it finds through
# post_processor.
#
# options - parsed command-line options; options.url is the target and
#           options.quiet suppresses printing of the word list
#
# Returns nothing useful. When the cewl executable cannot be located a hint
# is printed and the URL is skipped.
def scrapeSingle(options)
  # check for CeWL executable
  cewlpath = which("cewl")
  if cewlpath.nil?
    puts "[!] cewl executable not found in path! Skipping URL."
    puts "[*] See README for CeWL installation instructions."
    return
  end
  url = options.url
  puts "[+] Running CeWL against: #{url}"
  # argv-style popen: no shell is involved, so characters such as '&', ';'
  # or '?' in the URL reach CeWL verbatim instead of being interpreted
  cewl = IO.popen([cewlpath, url.to_s], &:read)
  # convert to array, remove first two entries (CeWL banner and blank line)
  # sort and uniq
  result = cewl.split("\n").drop(2).sort.uniq
  if result.empty?
    puts "[!] #{url} seems to be incorrect. Please check it in a web browser and try again."
    puts "[*] Total unique words that CeWL grabbed from #{url} is: 0"
  else
    cewlpp = post_processor(nil, options, result)
    unless options.quiet
      puts cewlpp
      puts ""
    end
    puts "[*] Total unique words that CeWL grabbed from #{url} is: #{cewlpp.length}"
  end
end
# Run CeWL against every URL listed in a file (one URL per line) and feed the
# combined words through post_processor.
#
# options - parsed command-line options; options.multi is the path of the
#           URL list and options.quiet suppresses printing of the word list
#
# Blank lines and surrounding whitespace in the list are ignored. Returns
# nothing useful. When the cewl executable or the list file is missing a
# message is printed and the URLs are skipped.
def scrapeMultiple(options)
  # check for CeWL executable
  cewlpath = which("cewl")
  if cewlpath.nil?
    puts "[!] cewl executable not found in path! Skipping URLs."
    puts "[*] See README for CeWL installation instructions."
    return
  end
  input = options.multi
  unless File.exist?(input)
    puts " "
    puts "[!] File: #{input} does not exist! Skipping URLs."
    return
  end
  urls = File.readlines(input).map(&:strip).reject(&:empty?)
  allCewls = []
  urls.each_with_index do |url, index|
    puts "[+] Running CeWL against: #{url} (#{index + 1}/#{urls.length})"
    # argv-style popen: no shell is involved, so characters such as '&', ';'
    # or '?' in the URL reach CeWL verbatim instead of being interpreted
    cewl = IO.popen([cewlpath, url], &:read)
    if cewl.include? "Unable to connect to the site"
      puts "-- Unable to connect to the site"
    else
      # convert to array, remove first two entries (CeWL banner and blank line)
      cewlarr = cewl.split("\n").drop(2).sort.uniq
      allCewls.concat(cewlarr)
      puts "[*] Total words that CeWL grabbed from #{url} is: #{cewlarr.length}"
    end
  end
  # sort and uniq across all URLs
  result = allCewls.sort.uniq
  if result.empty?
    puts "[!] All URLs seems to be incorrect. Please check them in a web browser and try again."
    puts "[*] Total unique words that CeWL grabbed from all URLS is: 0"
  else
    allCewlspp = post_processor(nil, options, result)
    unless options.quiet
      puts allCewlspp
      puts ""
    end
    puts "[*] Total unique words that CeWL grabbed from all URLs is: #{allCewlspp.length}"
  end
end
# all data eventually gets shoved into this function where all post-processing occurs
#
# dir_path - data directory the words came from (e.g. "./data/usa/nc"), or
#            nil for callers that have no directory (CeWL output, religious texts)
# options  - parsed command-line options (see the flags read below)
# array    - raw words/lines to process
#
# Processing order:
#   1. chomp every entry; with options.split also add each space-separated word
#   2. add variants with non-alphanumerics removed (options.specials) and
#      with spaces removed (options.spaces)
#   3. add area-code / zip-code / user-wordlist prefixed and suffixed forms
#   4. filter by options.minlength / options.maxlength / options.default
#   5. lower-case everything (options.lower)
#   6. unless options.usergenerate is set, de-duplicate and sort
#
# Returns the processed array; the same words are also appended to @finalArr.
def post_processor(dir_path, options, array)
  # words to be manipulated
  input_words = array.map { |word| word.to_s.chomp }
  # split words by spaces before manipulating
  array.each { |entry| input_words.concat(entry.split) } if options.split

  # staged words: the pre-manipulated words plus any requested variants
  staged = input_words.dup
  # add words with special characters removed
  staged.concat(input_words.map { |word| word.gsub(/[^0-9A-Za-z]/, '') }) if options.specials
  # add words with spaces removed
  staged.concat(input_words.map { |word| word.gsub(/[ ]/, '') }) if options.spaces

  # everything staged is part of the output; affixed forms are added on top
  final = staged.dup

  # prepend/append area codes to all words
  if options.prependphone || options.appendphone
    pp_code_files(dir_path, "areacodes.txt").each do |f|
      codes = pp_read_lines(f)
      final.concat(pp_affix(staged, codes, true)) if options.prependphone
      final.concat(pp_affix(staged, codes, false)) if options.appendphone
    end
  end
  # prepend/append zip codes to all words
  if options.prependzip || options.appendzip
    pp_code_files(dir_path, "zipcodes.txt").each do |f|
      codes = pp_read_lines(f)
      final.concat(pp_affix(staged, codes, true)) if options.prependzip
      final.concat(pp_affix(staged, codes, false)) if options.appendzip
    end
  end
  # prepend user-supplied wordlist
  if options.prependwordlist
    final.concat(pp_affix(staged, pp_read_lines(options.prependwordlist), true))
  end
  # append user-supplied wordlist
  if options.appendwordlist
    final.concat(pp_affix(staged, pp_read_lines(options.appendwordlist), false))
  end

  # remove words with less than options.minlength characters
  final.delete_if { |word| word.length < options.minlength } unless options.minlength.nil?
  # remove words with more than options.maxlength characters
  final.delete_if { |word| word.length > options.maxlength } unless options.maxlength.nil?
  # remove words that don't have at least 3 out of 4 cases (number, special, upper, lower)
  final.delete_if { |word| not_complex(word) } if options.default
  final.map!(&:downcase) if options.lower

  # sort and uniq, except for generated usernames which keep their order
  ret = options.usergenerate ? final : final.uniq.sort
  # tolerate @finalArr not having been initialised yet
  @finalArr = (@finalArr || []) + ret
  ret
end

# Find every file called `basename` beneath dir_path. Without a dir_path
# (religion/language/CeWL words) the USA data directory is used for now.
def pp_code_files(dir_path, basename)
  root = dir_path.nil? ? "./data/usa" : dir_path
  Dir.glob("#{root}/**/#{basename}")
end

# Read a list file and return its lines with the "\n" separators removed.
# The block form closes the handle even when reading fails.
def pp_read_lines(path)
  File.open(path, "rb", :encoding => "ISO-8859-1:UTF-8") { |fh| fh.read }.split("\n")
end

# Combine every word with every affix, word-major; the affix goes in front
# when `prepend` is true and behind otherwise.
def pp_affix(words, affixes, prepend)
  words.flat_map do |word|
    affixes.map { |affix| prepend ? affix + word : word + affix }
  end
end
# Windows default complexity check: a word needs characters from at least 3
# of the 4 classes below. Returns true when the word does NOT meet that bar.
def not_complex(word)
  character_classes = [
    /\d/,          # numerics
    /[a-z]/,       # lowers
    /[A-Z]/,       # uppers
    /[^a-zA-Z\d]/  # specials
  ]
  character_classes.count { |pattern| word =~ pattern } < 3
end
# easy way to read files line-by-line into an array
#
# f - path of the file to read
#
# Returns the file's contents lower-cased and split on "\n".
def file_to_arr(f)
  # block form closes the handle even if reading raises
  # possibly change encoding to ISO-8859-1:UTF-8?
  contents = File.open(f, "rb", :encoding => "UTF-8") { |file| file.read }
  contents.downcase.split("\n")
end
# out to file
#
# Sorts and de-duplicates @finalArr in place, drops empty entries, writes the
# remaining words to `file` (one per line) and reports how many were written.
def output(file)
  @finalArr.sort!
  @finalArr.uniq!
  @finalArr = @finalArr.reject(&:empty?)
  File.open(file, "w") do |out|
    @finalArr.each { |entry| out.puts(entry) }
  end
  puts "", "[*] #{@finalArr.length} words written to: #{Dir.pwd}/#{file}"
end
# the cia.txt files are structured, this breaks them down into a list of unique words
#
# contents - raw text of a cia.txt file
#
# Returns a single string holding the unique, sorted words joined by "\n".
# Punctuation listed in symbols_arr is removed from every word, and words
# that contain "hrefrankorder" after stripping are dropped.
def format_cia_demographics(contents)
  symbols_arr = ["\,","$",";",".","!","(",")","*","%","@","^","&",":","\'","\"","/","\\","|","[","]","=","\”","\“","?","<a","+"]
  word_array = contents.split(" ").map do |word|
    # strip one symbol at a time, in list order, so that a removal which
    # exposes a later entry (e.g. "<,a" -> "<a") is still cleaned up
    symbols_arr.each { |symbol| word.gsub!(symbol, "") }
    word
  end
  word_array.delete_if { |string| string.include?("hrefrankorder") }
  word_array.sort!
  word_array.uniq!
  word_array.join("\n")
end
# Build usernames from the first/last name lists of a data directory and pass
# them through post_processor.
#
# dir_path - data directory expected to hold fnames.txt and/or lnames.txt
# options  - parsed command-line options; the username format flags (filn,
#            fnln, fnli, lnfi, lnfn, fidln, fndln) pick the layouts,
#            genfirstname selects full first names over initials, namedepth
#            limits how many names are read (0 reads all), truncate and
#            maxusers trim the result, quiet suppresses the listing
#
# Returns nil. Nothing is generated when a required name file is missing.
def generate_users(dir_path, options)
  alphabet = ('a'..'z').to_a
  # enum the first names if we're using an option that includes full first names, otherwise use the alphabet
  if options.genfirstname
    fnames = gu_load_names("#{dir_path}/fnames.txt", options.namedepth)
    return if fnames.nil?
  else
    fnames = alphabet
  end
  # enum the last names if we're using an option that includes full last names, otherwise use the alphabet
  if options.fnli
    lnames = alphabet
  else
    lnames = gu_load_names("#{dir_path}/lnames.txt", options.namedepth)
    return if lnames.nil?
  end

  data = []
  # first last
  if options.filn || options.fnln || options.fnli
    fnames.each { |fname| lnames.each { |lname| data.push("#{fname}#{lname}") } }
  end
  # last first
  if options.lnfi || options.lnfn
    fnames.each { |fname| lnames.each { |lname| data.push("#{lname}#{fname}") } }
  end
  # first dot last
  if options.fidln || options.fndln
    fnames.each { |fname| lnames.each { |lname| data.push("#{fname}.#{lname}") } }
  end

  if not data.empty?
    # truncate words if specified
    data.map! { |word| word[0...options.truncate] } if options.truncate
    # only keep maxusers number of usernames
    data = data[0...options.maxusers] if options.maxusers
    datapp = post_processor(dir_path, options, data)
    if not options.quiet
      puts ""
      puts datapp
    end
    puts "usernames in #{dir_path}: #{datapp.length}"
  end
end

# Load names from `path` once, newline-stripped: the first `depth` lines, or
# every line when depth is 0. Returns nil when the file does not exist.
# Materialising the list avoids re-reading the file for every combination.
def gu_load_names(path, depth)
  return nil unless File.exist?(path)
  lines = File.foreach(path)
  lines = lines.first(depth) unless depth == 0
  lines.map(&:chomp)
end
# most of the work is done here, search a directory for flat files, bring them into memory
#
# dir_path - data directory to search recursively
# options  - parsed command-line options (options.quiet suppresses the listing)
# type     - attribute name, i.e. the basename of the .txt files to read; the
#            special value "other" reads every .txt file that is NOT one of
#            the attributes with a dedicated option
#
# The collected lines are handed to post_processor and a per-directory count
# is printed. Nothing is printed when no data was found. Returns nil.
def get_attribute(dir_path, options, type)
  if type == "other"
    attrs = ["areacodes","cia","cities","colleges","counties","fnames","landmarks","lands","lnames","places","roads","sports","structures","waters","zipcodes"]
    dedicated = attrs.map { |attribute| "#{attribute}.txt" }
    # compare whole file names: a substring test would also discard files
    # such as "lakelands.txt" just because they end in "lands.txt"
    all_files = Dir.glob("#{dir_path}/**/*.txt").reject { |f| dedicated.include?(File.basename(f)) }
  else
    all_files = Dir.glob("#{dir_path}/**/#{type}.txt")
  end
  data = []
  all_files.each do |f|
    # block form closes the handle even if reading raises
    raw = File.open(f, "rb", :encoding => "ISO-8859-1:UTF-8") { |file| file.read }
    # cia.txt is structured text: put each word of that file on a new line
    #
    # fnames.txt used to hold "sex,name,count" lines (m,Bob,1239823092) with
    # per-sex filtering via options.ffnames / options.mfnames; that parsing
    # was too slow, so the files are plain name lists again and are read as-is
    contents = File.basename(f) == "cia.txt" ? format_cia_demographics(raw) : raw
    data.concat(contents.split("\n"))
  end
  if not data.empty?
    datapp = post_processor(dir_path, options, data)
    if not options.quiet
      puts ""
      puts datapp
    end
    puts "#{type} in #{dir_path}: #{datapp.length}"
  end
end
# Look up the two most popular religions in the country's yaml file and
# remember them (lower-cased) in @religionArr.
#
# dir_path - data directory; its third "/"-separated component is taken as
#            the country code, e.g. "./data/usa/nc" -> "usa"
# options  - unused here
#
# Does nothing when ./data/<country>/<country>.yaml does not exist.
def find_religion(dir_path, options)
  country = dir_path.split("/")[2]
  yaml_file = "./data/" + country + "/" + country + ".yaml"
  return unless File.exist?(yaml_file)

  settings = YAML.load_file(yaml_file)["config"]
  primary = settings["religion_1"]
  secondary = settings["religion_2"]
  @religionArr.push(primary.downcase) unless primary.nil?
  @religionArr.push(secondary.downcase) unless secondary.nil?
end
# Import the religious texts that belong to the religions collected in
# @religionArr and run them through post_processor.
#
# dir_path - unused here; post_processor is called without a directory
# options  - parsed command-line options (options.quiet suppresses the listing)
#
# Each country.yaml names the two most popular religions, but many "child"
# religions derive from one "root" religion (presbyterians, evangelicals and
# orthodox mostly reference the KJV or DR, for example). The *-religions.conf
# files are lookup tables: a religion listed there maps to that family's
# texts, which keeps the mapping easy to extend.
def get_religion(dir_path, options)
  bible_religions = file_to_arr("./data/religion/bible-religions.conf")
  quran_religions = file_to_arr("./data/religion/quran-religions.conf")
  # include both the kjv and dr (plus NIV and book/verse index) for any
  # catholic and christian based religion
  bible_texts = [
    "./data/religion/king-james-bible-parsed.txt",
    "./data/religion/douay-rheims-parsed.txt",
    "./data/religion/new-international-version-bible-parsed.txt",
    "./data/religion/king-james-bible-book-verse.txt"
  ]
  data = []
  @religionArr.each do |religion|
    next if religion.nil?
    if bible_religions.include?(religion)
      bible_texts.each { |text| data += file_to_arr(text) }
    end
    # include the quran for islam/muslim based religion
    if quran_religions.include?(religion)
      data += file_to_arr("./data/religion/quran-parsed-eng.txt")
    end
  end
  unless data.empty?
    datapp = post_processor(nil, options, data)
    unless options.quiet
      puts ""
      puts datapp
    end
    puts "religions: #{datapp.length}"
  end
end
# Look up the two most popular languages in the country's yaml file and
# remember them (lower-cased) in @languageArr.
#
# dir_path - data directory; its third "/"-separated component is taken as
#            the country code, e.g. "./data/usa/nc" -> "usa"
# options  - unused here
#
# Does nothing when ./data/<country>/<country>.yaml does not exist.
def find_language(dir_path, options)
  country = dir_path.split("/")[2]
  yaml_file = "./data/" + country + "/" + country + ".yaml"
  return unless File.exist?(yaml_file)

  settings = YAML.load_file(yaml_file)["config"]
  primary = settings["language_1"]
  secondary = settings["language_2"]
  @languageArr.push(primary.downcase) unless primary.nil?
  @languageArr.push(secondary.downcase) unless secondary.nil?
end
# Import the dictionaries for the languages collected in @languageArr and run
# them through post_processor.
#
# dir_path - data directory, forwarded to post_processor
# options  - parsed command-line options (options.quiet suppresses the listing)
def get_language(dir_path, options)
  # names of the available dictionaries: path/language.txt -> language
  languages = Dir.glob("./data/languages/*.txt").map do |path|
    path.split("/")[3].split(".")[0].downcase
  end
  data = []
  @languageArr.each do |language|
    # special dictionary for cantonese and mandarin
    if %w[mandarin cantonese chinese].include?(language)
      data += file_to_arr("./data/languages/cedict.txt")
    end
    # include the language
    if !language.nil? && languages.include?(language)
      data += file_to_arr("./data/languages/#{language}.txt")
    end
  end
  unless data.empty?
    datapp = post_processor(dir_path, options, data)
    unless options.quiet
      puts ""
      puts datapp
    end
    puts "languages: #{datapp.length}"
  end
end
# Run every data grab the user asked for against one data directory.
#
# options  - parsed command-line options; options.all switches on every
#            attribute as well as the religion and language lookups
# dir_path - data directory to process
def cycle(options, dir_path)
  # [option flag, attribute file name], in the order they are processed
  # (per-sex first names via ffnames/mfnames may return later; for now
  # only the plain fnames flag is honoured)
  attribute_flags = [
    [:cia, "cia"],
    [:cities, "cities"],
    [:colleges, "colleges"],
    [:counties, "counties"],
    [:landmarks, "landmarks"],
    [:lands, "lands"],
    [:fnames, "fnames"],
    [:lnames, "lnames"],
    [:other, "other"],
    [:phone, "areacodes"],
    [:places, "places"],
    [:roads, "roads"],
    [:sports, "sports"],
    [:structures, "structures"],
    [:waters, "waters"],
    [:zip, "zipcodes"]
  ]
  attribute_flags.each do |flag, attribute|
    get_attribute(dir_path, options, attribute) if options.all || options.public_send(flag)
  end
  find_religion(dir_path, options) if options.all || options.religion
  find_language(dir_path, options) if options.all || options.language
  generate_users(dir_path, options) if options.usergenerate
end
# abcdefghijklmnopqr tuvwxyz
# s <-- available options
# A C EFG I LM R WXYZ <-- available options
# ABCDEFGHIJKLMNOPQRSTUVWXYZ
def main()
options = OpenStruct.new
ARGV << '-h' if ARGV.empty?
OptionParser.new do |opt|
opt.banner = "Usage: ruby wordsmith.rb [options]"
opt.on('Main Arguments:')
opt.on('-I', '--input <input>', Array, 'Comma-delimited list of inputs, see -E for examples and detailed usage') { |o| options.input = o }
opt.on('Input Options:')
opt.on('-a', '--all', 'Grab all options') { |o| options.all = o }
opt.on('-b', '--other', 'Grab other miscellaneous attributes') { |o| options.other = o }
opt.on('-e', '--cia', 'Grab demographics compiled by the CIA') { |o| options.cia = o }
opt.on('-c', '--cities', 'Grab all city names') { |o| options.cities = o }
opt.on('-f', '--colleges', 'Grab all college sports') { |o| options.colleges = o }
opt.on('-l', '--landmarks', 'Grab all landmarks') { |o| options.landmarks = o }
opt.on('-v', '--language', 'Grab the most popular language(s)') { |o| options.language = o }
opt.on('-N', '--all-names', 'Grab all first names and last names') { |o| options.anames = o }
opt.on('-G', '--first-names', 'Grab all first names') { |o| options.fnames = o }
opt.on('-L', '--last-names', 'Grab all last names') { |o| options.lnames = o }
# saving for later usage
#opt.on('-F', '--female-fnames', 'Grab all female first names') { |o| options.ffnames = o }
#opt.on('-M', '--male-fnames', 'Grab all male first names') { |o| options.mfnames = o }
opt.on('-p', '--phone', 'Grab all area codes') { |o| options.phone = o }
opt.on('-r', '--roads', 'Grab all road names') { |o| options.roads = o }
opt.on('-g', '--religion', 'Grab the most popular religious text(s)') { |o| options.religion = o }
opt.on('-t', '--teams', 'Grab all major sports teams') { |o| options.sports = o }
opt.on('-u', '--counties', 'Grab all counties') { |o| options.counties = o }
opt.on('-z', '--zip', 'Grab all zip codes') { |o| options.zip = o }
opt.on('--lands', 'Grab all land features') { |o| options.lands = o }
opt.on('--places', 'Grab all populated places') { |o| options.places = o }
opt.on('--structures', 'Grab all structures/buildings') { |o| options.structures = o }
opt.on('--waters', 'Grab all water/island features') { |o| options.waters = o }
opt.on('Username Generation Options:')
opt.on('--filn', 'FirstInitialLastName (bsmith)') { |o| options.filn = o }
opt.on('--fnln', 'FirstNameLastName (bobsmith)') { |o| options.fnln = o }
opt.on('--fnli', 'FirstNameLastInitial (bobs)') { |o| options.fnli = o }
opt.on('--lnfi', 'LastNameFirstInitial (smithb)') { |o| options.lnfi = o }
opt.on('--lnfn', 'LastNameFirstName (smithbob)') { |o| options.lnfn = o }
opt.on('--fidln', 'FirstInitial.LastName (b.smith)') { |o| options.fidln = o }
opt.on('--fndln', 'FirstName.LastName (bob.smith)') { |o| options.fndln = o }
# saving for later usage?
#opt.on('--lndfi', 'LastName.FirstInitial (smith.b)') { |o| options.lndfi = o }
#opt.on('--lndfn', 'LastName.FirstName (smith.bob)') { |o| options.lndfn = o }
opt.on('--truncate LEN', Integer, 'Truncate username at LEN number of characters (bobsmi)') { |o| options.truncate = o }
opt.on('--max-users LEN', Integer, 'Max number of usernames to generate') { |o| options.maxusers = o }
opt.on('--name-depth LEN', Integer, 'Num of first/last names to iterate over (default:100, 0 will get all)') { |o| options.namedepth = o }
# options for adding a suffix or prefix to usernames?
opt.on('Web Scrape Options:')
opt.on('-d', '--domain DOMAIN', 'Set a URL for a web application that you want CeWL to scrape') { |o| options.url = o }
opt.on('-i', '--infile FILE', 'Supply a file containing multiple URLs that you want CeWL to scrape') { |o| options.multi = o }
opt.on('Output Options:')
opt.on('-o', '--output FILE', 'The filename for writing output') { |o| options.out = o }
opt.on('-q', '--quiet', 'Don\'t show words generated, use with -o option') { |o| options.quiet = o }
opt.on('-k', '--min-length LEN', Integer, 'Minimum length of word to include') { |o| options.minlength = o }
opt.on('-n', '--max-length LEN', Integer, 'Maximum length of word to include') { |o| options.maxlength = o }
opt.on('-D', '--complexity', 'Words must meet Windows default complexity (8 char min, 3/4 cases)') { |o| options.default = o }
opt.on('-j', '--lowercase', 'Convert all words to lowercase') { |o| options.lower = o }
opt.on('-w', '--specials', 'Add words with special characters removed') { |o| options.specials = o }
opt.on('-x', '--spaces', 'Add words with spaces removed') { |o| options.spaces = o }
opt.on('-y', '--split', 'Split words by space and add') { |o| options.split = o }
opt.on('-m', '--mangle', 'Add all permutations (-w, -x, -y)') { |o| options.mangle = o }
opt.on('-P', '--prepend-phones', 'Prepend state area codes to each generated word') { |o| options.prependphone = o }
opt.on('-A', '--append-phones', 'Append state area codes to each generated word') { |o| options.appendphone = o }
opt.on('-X', '--prepend-zips', 'Prepend zip codes to each generated word') { |o| options.prependzip = o }
opt.on('-Z', '--append-zips', 'Append zip codes to each generated word') { |o| options.appendzip = o }
opt.on('-W', '--prepend-wordlist FILE', 'Prepend words in FILE to each generated word') { |o| options.prependwordlist = o }
opt.on('-Y', '--append-wordlist FILE', 'Append words in FILE to each generated word') { |o| options.appendwordlist = o }
opt.on('Info Options:')
opt.on('-C', '--show-child-nodes', 'Show all possible child nodes for each input') { |o| options.showchildren = o }
opt.on('-E', '--examples', 'Show some usage examples and detailed explanations about using wordsmith') { |o| options.examples = o }
opt.on('-R', '--show-regions', 'Show regions mapping') { |o| options.showregions = o }
end.parse!
if options.examples then examples() end
if options.showregions then show_regions() end
if options.prependwordlist
if File.exist?(options.prependwordlist) == false
puts "#{options.prependwordlist} does not exist! Exiting."
exit 1
end
end
if options.appendwordlist
if File.exist?(options.appendwordlist) == false
puts "#{options.appendwordlist} does not exist! Exiting."
exit 1
end
end
# turn on first names and last names for all names
if options.all or options.anames
options.fnames = true
options.lnames = true
end
# turn on all manipulation switches for mangle
if options.mangle
options.spaces = true
options.split = true
options.specials = true
end
# if any of these flags is set, then we'll generate usernames
if options.filn or options.fnln or options.fnli or options.lnfi or options.lnfn or options.fidln or options.fndln
options.usergenerate = true
end
# check options for ones that use a full first name
if options.fnln or options.lnfn or options.fndln or options.fnli
options.genfirstname = true
end
# complexity mode defaults the minimum length to 8 unless a minimum length was given explicitly
if options.default and not options.minlength
options.minlength = 8
end
# set default for name depth, meaning we'll use 100 first names and 100 last names during generation
if not options.namedepth then options.namedepth = 100 end
@finalArr = []
@religionArr = []
@languageArr = []
# boundaries and most populous countries
if not options.input.nil?
# if input is a number, then use most populous country data
if options.input.length == 1 and options.input[0].scan(/\D/).empty?
if File.exist?('./data/most-populous-countries.csv') == false
puts "./data/most-populous-countries.dat does not exist! Exiting."
exit 1
else
populouscountries = CSV.read('./data/most-populous-countries.csv').first(options.input[0].to_i)
puts "Grabbing data from the #{options.input[0]} most populous countries:"
options.input.clear
# we cleared out options.input array, replace with countries
populouscountries.each do |country|
puts "#{country[0]}"
options.input.push(country[0])
end
end
end
inputs = validate_boundaries(options)
if options.showchildren then show_children(inputs) end
inputs.each do |i|
cycle(options, i)
end
# do religions and languages as a whole at the end
if not @religionArr.empty?
@religionArr.sort!.uniq!
get_religion(nil, options)
end
if not @languageArr.empty?
@languageArr.sort!.uniq!
get_language(nil, options)
end
end
# web scraping
if not options.url.nil? then scrapeSingle(options) end
if not options.multi.nil? then scrapeMultiple(options) end
# output
if not options.out.nil? then output(options.out) end
end
def checkFiles()
dataCheck = "./data/usa/az/roads.txt"
regionsCheck = "./data/regions.csv"
if File.exist?(dataCheck) == false || File.exist?(regionsCheck) == false
archiveCheck = "data.tar.xz"
if File.exist?(archiveCheck) == false
puts "[!] data/regions.csv and data.tar.xz not detected! Try downloading data.tar.xz:"
puts " $ wget https://github.com/skahwah/wordsmith/releases/download/#{@version}/data.tar.xz"
puts ""
puts "[!] If data.tar.xz is downloaded, then try changing to the wordsmith directory."
abort
elsif File.exist?(dataCheck) == false
puts "[*] Hello new wordsmither!"
printf "[*] This script will remove the data/ directory in the current working directory. Enter 'y' to continue: "
prompt = STDIN.gets.chomp
if prompt != 'y'
puts "[!] Aborted!"
abort
end