-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathsplit_sample.py
executable file
·122 lines (108 loc) · 3.4 KB
/
split_sample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os, sys
import argparse
import random
class MainApplication(object):
    """Split the data points of a sample file into subsamples, one file each.

    ``args`` is the parsed command-line namespace.  The methods below read
    its ``infile``, ``unit``, ``num``, ``size``, ``continuous`` and
    ``prefix`` attributes.
    """

    def __init__(self, args):
        """Remember *args* and start with an empty list of data points."""
        self.args = args
        # Per-instance list.  A class-level ``data = []`` would be one list
        # shared by all instances, so a second instance would start out with
        # the first one's data points.
        self.data = []

    def read_data(self):
        """Append the data points found in ``self.args.infile`` to ``self.data``.

        With unit ``"line"`` every input line is one data point.  With unit
        ``"block"`` a data point is the text of consecutive lines up to and
        including the next blank (whitespace-only) line.  Any other unit
        value reads nothing.
        """
        if self.args.unit == "line":
            self.data.extend(self.args.infile)
        elif self.args.unit == "block":
            block = []
            for line in self.args.infile:
                block.append(line)
                if not line.strip():
                    self.data.append("".join(block))
                    block = []
            # Input that does not end with a blank line still has a last
            # block pending; keep it instead of silently dropping it.
            if block:
                self.data.append("".join(block))

    def get_random_subsamples(self):
        """Yield ``self.args.num`` subsamples, each a list of data points.

        Each subsample holds ``self.args.size`` data points when that option
        is set, otherwise ``len(self.data) // num``; in the latter case the
        remainder of the division is left out of every subsample.  If the
        subsamples cannot fit side by side, a warning is printed and their
        start offsets are spread so that they overlap.  Unless
        ``self.args.continuous`` is true, ``self.data`` is shuffled in place
        before slicing.

        Raises:
            ValueError: if ``num`` is smaller than 1 or ``size`` is negative.
        """
        # adapted from: http://codereview.stackexchange.com/questions/4872/pythonic-split-list-into-n-random-chunks-of-roughly-equal-size
        n = self.args.num
        if n < 1:
            raise ValueError("number of subsamples must be at least 1")
        if self.args.size is not None and self.args.size < 0:
            raise ValueError("subsample size must not be negative")

        total = len(self.data)
        # Floor division: sizes and offsets are used for slicing, so they
        # must stay integers on Python 3 as well as Python 2.
        size = self.args.size if self.args.size else total // n
        if size > total:
            # A subsample cannot hold more data points than exist.  Clamping
            # also keeps the overlap step below from going negative.
            sys.stderr.write(
                "Warning: requested size exceeds the number of data points\n"
            )
            sys.stderr.flush()
            size = total
        if size == 0:
            sys.stderr.write("Warning: subsamples will be empty\n")
            sys.stderr.flush()

        if n * size > total:
            sys.stderr.write("Warning: samples will overlap\n")
            sys.stderr.flush()
            # n >= 2 here: with n == 1 the clamp above gives n * size <= total.
            step = (total - size) // (n - 1)
        else:
            step = size

        if not self.args.continuous:
            random.shuffle(self.data)

        # Computing offsets as i * step (instead of range(0, n * step, step))
        # still produces n subsamples when step is 0.
        for i in range(n):
            start = i * step
            yield self.data[start : start + size]

    def write_subsamples(self):
        """Write each subsample to ``"<prefix>_<k>"``, with k counting from 1."""
        for index, subsample in enumerate(self.get_random_subsamples(), 1):
            with open("%s_%i" % (self.args.prefix, index), "w") as f:
                f.write("".join(subsample))

    def run(self):
        """Read the input, then write the subsample files."""
        self.read_data()
        self.write_subsamples()
if __name__ == "__main__":
    # Command-line interface: an optional input file and a required output
    # prefix, followed by the sampling options.
    parser = argparse.ArgumentParser(
        description=(
            "Randomly splits up a sample file into subsamples of equal length "
            "to be used for k-fold cross-validation."
        ),
        epilog="",
    )

    # Positional arguments.  INPUT may be omitted, in which case the
    # default (standard input) is used.
    parser.add_argument(
        "infile",
        nargs="?",
        default=sys.stdin,
        type=argparse.FileType("r"),
        metavar="INPUT",
        help="Input file (default: STDIN)",
    )
    parser.add_argument(
        "prefix",
        help="Prefix for generated subsample files",
        metavar="PREFIX",
    )

    # Options.
    parser.add_argument(
        "-c",
        "--continuous",
        action="store_true",
        help="Make continuous samples, don't shuffle",
    )
    # NOTE(review): nothing in the code visible in this file reads
    # args.encoding -- confirm whether it is meant to be applied when the
    # input and output files are opened.
    parser.add_argument(
        "-e",
        "--encoding",
        default="utf-8",
        metavar="ENC",
        help="Encoding of the input file (default: %(default)s)",
    )
    parser.add_argument(
        "-n",
        "--num",
        default=10,
        type=int,
        metavar="N",
        help="Number of subsamples to create (default: %(default)d)",
    )
    parser.add_argument(
        "-s",
        "--size",
        type=int,
        metavar="N",
        help="Force subsamples to be of a specific size",
    )
    parser.add_argument(
        "-u",
        "--unit",
        default="line",
        choices=("line", "block"),
        help=(
            "Unit of data points in the sample file. "
            '"line" uses lines as data points; '
            '"block" uses blocks of lines separated by a blank line. '
            "(default: %(default)s)"
        ),
    )

    args = parser.parse_args()

    # Run the application.  A SystemExit raised while it runs is reported
    # on standard error and not re-raised.
    try:
        application = MainApplication(args)
        application.run()
    except SystemExit:
        sys.stderr.write("\nThere were errors.\n")
        sys.stderr.flush()