forked from b-k/apophenia
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapop_sort.m4.c
232 lines (202 loc) · 10.1 KB
/
apop_sort.m4.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
/** \file apop_sort.c
Copyright (c) 2013 by Ben Klemens. Licensed under the GPLv2; see COPYING. */
#include "apop_internal.h"
#include <stdbool.h>
static double find_smallest_larger_than(apop_data const *sort_order, double *x){
//the next column in the sort order is the one that is not NAN, greater than x, but smaller than all other candidate values.
double candidate_col=-100, candidate_val = INFINITY, v;
if (sort_order->vector && !isnan(v=gsl_vector_get(sort_order->vector, 0))
&& v > *x && v < candidate_val){
candidate_val = v;
candidate_col = -1;
}
if (sort_order->matrix)
for (int i=0; i< sort_order->matrix->size2; i++)
if (!isnan(v=gsl_matrix_get(sort_order->matrix, 0, i))
&& v > *x && v < candidate_val){
candidate_val = v;
candidate_col = i;
}
if (*sort_order->textsize)
for (int i=0; i< sort_order->textsize[1]; i++)
if (apop_opts.nan_string && strcmp(sort_order->text[0][i], apop_opts.nan_string)
&& (v=atof(sort_order->text[0][i])) > *x && v < candidate_val){
candidate_val = v;
candidate_col = i+0.5;
}
if (sort_order->weights && !isnan(v=gsl_vector_get(sort_order->weights, 0))
&& v > *x && v < candidate_val){
candidate_val = v;
candidate_col = -2;
}
if (sort_order->names->rowct)
if (apop_opts.nan_string && strcmp(*sort_order->names->row, apop_opts.nan_string)
&& (v=atof(*sort_order->names->row)) > *x && v < candidate_val){
candidate_val = v;
candidate_col = 0.2;
}
*x = candidate_val;
return candidate_col;
}
static void generate_sort_order(apop_data const *data, apop_data const *sort_order, int cols_to_sort_ct, double *so){
/* the internal rule is that the vector is -1, the weights vector is -2, the names are
* 0.2, and the text cols are the column+0.5. How's that for arbitrary. */
if (sort_order) {
double x = -INFINITY;
for (int i=0; i< cols_to_sort_ct-1; i++)
so[i] = find_smallest_larger_than(sort_order, &x);
} else {
int ctr=0;
if (data->vector) so[ctr++] = -1;
if (data->matrix) for(int i=0; i<data->matrix->size2; i++) so[ctr++] = i;
if (*data->textsize) for(int i=0; i< data->textsize[1]; i++) so[ctr++] = i+0.5;
if (data->weights) so[ctr++] = -2;
if (data->names->rowct) so[ctr++] = 0.2;
}
so[cols_to_sort_ct-1] = -100;
}
#include <gsl/gsl_sort_vector.h>
static int find_min_unsorted(size_t *sorted, size_t height, size_t min){
while (min<height)
if (!sorted[min]) return min;
else min++;
return -1;
}
static threadlocal apop_data *d; //stdlib qsort doesn't have a hook where we can put these.
static threadlocal int offset;
static int compare_strings(const void *a, const void *b) {
const size_t *da = (const size_t *) a;
const size_t *db = (const size_t *) b;
return offset==-1
? strcasecmp(d->names->row[*da], d->names->row[*db])
: strcasecmp(d->text[*da][offset], d->text[*db][offset]);
}
static void rearrange(apop_data *data, size_t height, size_t *perm){
size_t i, start=0;
size_t *sorted = calloc(height, sizeof(size_t));
while (1){
i =
start = find_min_unsorted(sorted, height, start);
if (i==-1) break;
apop_data *first_row_storage = apop_data_copy(Apop_r(data, start));
sorted[start]++;
while (perm[i]!=start){
//copy from perm[i] to i
apop_data_memcpy(Apop_r(data,i), Apop_r(data, perm[i]));
sorted[perm[i]]++;
i = perm[i];
}
apop_data_memcpy(Apop_r(data, i), first_row_storage);
apop_data_free(first_row_storage);
}
free(sorted);
}
/** Sort an \ref apop_data set on an arbitrary sequence of columns.
The \c sort_order set is a one-row data set that should look like the data set being
sorted. The easiest way to generate it is to use \ref Apop_r to pull one row of the
table, then copy and fill it. For each column you want used in the sort, assign a ranking giving whether the column should be sorted first, second, .... Columns you don't want used in the sorting should be set to \c NAN. Ties are broken by the earlier element in the default order (see below).
E.g., to sort by the last column of a five-column matrix first, then the next-to-last column, then the next-to-next-to-last, then by the first text column, then by the second text column:
\code
apop_data *sort_order = apop_data_copy(Apop_r(data, 0));
sort_order->vector = NULL; //so it will be skipped.
Apop_data_fill(sort_order, NAN, NAN, 3, 2, 1);
apop_text_set(sort_order, 0, 0, "4");
apop_text_set(sort_order, 0, 1, "5");
apop_data_sort(data, sort_order);
\endcode
To determine which columns are sorted at which step, I use only comparisons, not the actual numeric values. For example, (1, 2, 3) and (-1.32, 0, 27) work identically. For text, I use \c atof to convert the your text to a number, as in the example above that set text values of \c "4" and \c "5". A blank string, NaN numeric value, or NULL element in the \ref apop_data set means that column will not be sorted.
\li Strings are sorted case-insensitively, using \c strcasecmp. [exercise for the reader: modify the source to use Glib's locale-correct string sorting.]
\li The setup generates a lexicographic sort using the columns you specify. If you would like a different sort order, such as Euclidian distance to the origin, you can generate a new column expressing your preferred metric, and then sorting on that. See the example below.
\param data The data set to be sorted. If \c NULL, this function is a no-op that returns \c NULL.
\param sort_order An \ref apop_data set describing the order in which columns are used for sorting, as above. If \c NULL, then sort by the vector, then each matrix column, then text, then weights, then row names.
\param inplace If 'n', make a copy, else sort in place. (default: 'y').
\param asc If 'a', ascending; if 'd', descending. This is applied to all columns; column-by-column application is to do. (default: 'a').
\param col_order For internal use only. In your call, it should be \c NULL; you can leave this off your function call entirely and the \ref designated syntax will takes care of it for you.
\return A pointer to the sorted data set. If <tt>inplace=='y'</tt> (the default), then this is the same as the input set.
A few examples:
\include "sort_example.c"
\li This function uses the \ref designated syntax for inputs.
*/
APOP_VAR_HEAD apop_data *apop_data_sort(apop_data *data, apop_data *sort_order, char asc, char inplace, double *col_order){
apop_data * apop_varad_var(data, NULL);
Apop_stopif(!data, return NULL, 1, "You gave me NULL data to sort. Returning NULL");
apop_data * apop_varad_var(sort_order, NULL);
char apop_varad_var(inplace, 'y');
char apop_varad_var(asc, 'a');
double * apop_varad_var(col_order, NULL);
APOP_VAR_ENDHEAD
if (!data) return NULL;
apop_data *out = inplace=='n' ? apop_data_copy(data) : data;
apop_data *xx = sort_order ? sort_order : out;
Get_vmsizes(xx); //firstcol, msize2
int cols_to_sort_ct = msize2 - firstcol +1 + !!(xx->weights) + xx->textsize[1] + !!xx->names->rowct;
double so[cols_to_sort_ct];
if (!col_order){
generate_sort_order(out, sort_order, cols_to_sort_ct, so);
col_order = so;
}
bool is_text = ((int)*col_order != *col_order);
bool is_name = (*col_order == 0.2);
gsl_vector_view c;
gsl_vector *cc = NULL;
if (!is_text && *col_order>=0){
c = gsl_matrix_column(out->matrix, *col_order);
cc = &c.vector;
}
gsl_vector *thiscol = cc ? cc
: (*col_order==-2) ? out->weights
: (*col_order==-1) ? out->vector
: NULL;
size_t height = thiscol ? thiscol->size
: is_name ? out->names->rowct
: *out->textsize;
gsl_permutation *p = gsl_permutation_alloc(height);
if (!is_text) gsl_sort_vector_index (p, thiscol);
else {
gsl_permutation_init(p);
d = out;
offset = is_name ? -1 : *col_order-0.5;
qsort(p->data, height, sizeof(size_t), compare_strings);
}
size_t *perm = p->data;
if (asc=='d' || asc=='D') //reverse the perm matrix.
for (size_t j=0; j< height/2; j++){
double t = perm[j];
perm[j] = perm[height-1-j];
perm[height-1-j] = t;
}
rearrange(out, height, perm);
gsl_permutation_free(p);
if (col_order[1] == -100) return out;
/*Second pass:
find blocks where all are of the same value.
After you pass a block of size > 1 row where all vals in this col are identical,
sort that block, using the rest of the sort order. */
int bottom=0;
if (!is_text){
double last_val = gsl_vector_get(thiscol, 0);
for (int i=1; i< height+1; i++){
double this_val=0;
if ((i==height || (this_val=gsl_vector_get(thiscol, i)) != last_val)
&& bottom != i-1){
apop_data_sort_base(Apop_rs(out, bottom, i-bottom), sort_order, 'a', 'y', col_order+1);
}
if (last_val != this_val) bottom = i;
last_val = this_val;
}
} else {
char *last_val = strdup(is_name ? out->names->row[0] : out->text[0][(int)(*col_order-0.5)]);
for (int i=1; i< height+1; i++){
char *this_val = i==height ? NULL : is_name ? out->names->row[i] : out->text[i][(int)(*col_order-0.5)];
if ((i==height || strcasecmp(this_val, last_val))
&& bottom != i-1){
apop_data_sort_base(Apop_rs(out, bottom, i-bottom), sort_order, 'a', 'y', col_order+1);
}
if (this_val && strcmp(last_val, this_val)) bottom = i;
free(last_val);
last_val = this_val ? strdup(this_val) : NULL;
}
free(last_val);
}
return out;
}