Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
dr-who committed Feb 12, 2024
1 parent 1ec5bd5 commit c0cc299
Show file tree
Hide file tree
Showing 5 changed files with 295 additions and 1 deletion.
5 changes: 4 additions & 1 deletion spit/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ include (CTest)

#enable_testing()

# C compiler flags: debug info (-g) plus a strict warning set, with -O2 and SSE4.2 code generation.
set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -Wshadow -Wextra -Wall -pedantic -Wstrict-prototypes -O2 -msse4.2" )
# The same debug/warning flags without -O2 or -msse4.2.
set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -Wshadow -Wextra -Wall -pedantic -Wstrict-prototypes " )
# Disabled override that would select clang from a hard-coded path.
#SET (CMAKE_C_COMPILER "/usr/bin/clang")


Expand All @@ -35,6 +35,9 @@ add_library(spitlib STATIC positions.c devices.c utils.c diskStats.c logSpeed.c
# spit: executable built from spit.c (spit-version.h is listed with it) and
# linked against the spitlib static library plus m, aio, pthread and numa.
add_executable(spit spit.c spit-version.h)
target_link_libraries(spit spitlib m aio pthread numa)

# dataset: stand-alone executable built from the dataset and numList sources;
# it is linked only against the maths library (m).
add_executable(dataset datasetMain.c dataset.c dataset.h numList.c numList.h)
# PRIVATE: an executable has no consumers, and the keyword form avoids the
# legacy keyword-less signature of target_link_libraries.
target_link_libraries(dataset PRIVATE m)

# modcat: executable built from the single source modcat.c; no libraries are
# linked to it here (the link line below is disabled).
add_executable(modcat modcat.c)
#target_link_libraries()

Expand Down
2 changes: 2 additions & 0 deletions spit/advertise-mc.c
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ void *advertiseMC(void *arg) {
if (now - last > 60) {
// every 60s scan
blockDevicesScan(bd);
free(adv_ip);
adv_ip = interfaceIPNonWifi(n); // pickup a changing main IP
}


Expand Down
214 changes: 214 additions & 0 deletions spit/dataset.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
#include <assert.h>
#include <malloc.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "dataset.h"


// Allocate a new, empty dataset.
//
// Returns a zero-filled datasetType (0 columns, 0 rows). Allocation failure
// is caught by assert. The result is released with datasetFree().
datasetType *datasetInit(void) {
  // calloc takes (element count, element size); the two were transposed before
  datasetType *res = calloc(1, sizeof(datasetType));
  assert(res);
  return res;
}

// Release a dataset and everything it owns, then set the caller's pointer
// to NULL.
//
// din: address of the dataset pointer. A NULL din, or a *din that is already
//      NULL, is accepted and does nothing (in the spirit of free(NULL)).
//
// Frees: the label string of every cell in a LABEL column, every row array,
// the row table, every column name, the name and category arrays, and the
// dataset structure itself.
void datasetFree(datasetType **din) {
  if (din == NULL || *din == NULL) {
    return;
  }
  datasetType *d = *din;

  for (size_t i = 0; i < d->rows; i++) {
    attrType *row = d->data[i];
    if (row == NULL) {
      continue; // datasetAddRow() is public and may have been handed a NULL row
    }
    for (size_t j = 0; j < d->columns; j++) {
      // only LABEL cells hold a heap string; other cells use v.number
      if (d->category[j] == LABEL) {
        free(row[j].v.label);
      }
    }
    free(row);
  }
  free(d->data);

  for (size_t j = 0; j < d->columns; j++) {
    free(d->name[j]);
  }
  free(d->category);
  free(d->name);

  free(d);
  *din = NULL;
}

// Grow the per-column arrays (name and category) so that they can hold one
// more column. d->columns is not changed and the new slot is not initialised:
// the datasetAddColumn*() functions fill index d->columns and then increment it.
void datasetAddColumn(datasetType *d) {
  const size_t wanted = d->columns + 1;

  d->name = realloc(d->name, wanted * sizeof(char*));
  assert(d->name);

  d->category = realloc(d->category, wanted * sizeof(enum thetype));
  assert(d->category);
}


// Append a LABEL column called `name`. The name is copied with strdup().
void datasetAddColumnLabel(datasetType *d, char *name) {
  datasetAddColumn(d); // make room for one more name/category entry

  const size_t slot = d->columns;
  d->category[slot] = LABEL;
  d->name[slot] = strdup(name);
  d->columns = slot + 1;
}

// Append a NUMBER column called `name`. The name is copied with strdup().
void datasetAddColumnNumber(datasetType *d, char *name) {
  datasetAddColumn(d); // make room for one more name/category entry

  const size_t slot = d->columns;
  d->category[slot] = NUMBER;
  d->name[slot] = strdup(name);
  d->columns = slot + 1;
}

// Append a PREDICTION column called `name` (copied with strdup()).
//
// For every prediction column that already exists an informational message
// is written to stderr; the new column is added regardless.
void datasetAddColumnPrediction(datasetType *d, char *name) {
  for (size_t col = 0; col < d->columns; col++) {
    if (d->category[col] != PREDICTION) {
      continue;
    }
    // already has a prediction
    fprintf(stderr,"*info* there is already a prediction column: %s\n", d->name[col]);
  }

  datasetAddColumn(d); // make room for one more name/category entry

  const size_t slot = d->columns;
  d->category[slot] = PREDICTION;
  d->name[slot] = strdup(name);
  d->columns = slot + 1;
}


// Append `row` as the last row of the dataset.
//
// The pointer itself is stored (the cells are not copied); datasetFree()
// later frees it, so the dataset takes ownership of the array.
void datasetAddRow(datasetType *d, attrType *row) {
  const size_t slot = d->rows;

  d->data = realloc(d->data, (slot + 1) * sizeof(attrType*));
  assert (d->data);

  d->data[slot] = row;
  d->rows = slot + 1;
}

// Parse a header line and add one column per field.
//
// Fields are separated by any run of space, tab, comma, CR or LF. Each field
// has the form <type>:<name>, where the first character selects the column
// type: 'l' label, 'n' number, 'p' prediction. A field with any other first
// character, or without a ':', is reported on stderr and skipped.
//
// Example:
// datasetInitHeader(d, "l:country n:age\t n:amount \tn:items p:class\t\tp:spend2\n");
void datasetInitHeader(datasetType *d, char *headline) {
  static const char delims[] = " \t,\r\n";

  char *dup = strdup(headline); // strtok() modifies its input, so parse a copy
  assert(dup);

  for (char *tok = strtok(dup, delims); tok != NULL; tok = strtok(NULL, delims)) {
    char *p = strchr(tok, ':');
    if (p == NULL) {
      // no "<type>:" prefix: p+1 would not be a valid string to copy
      fprintf(stderr,"not sure what to do with '%s'\n", tok);
      continue;
    }

    if (tok[0] == 'l') {
      datasetAddColumnLabel(d, p+1);
    } else if (tok[0] == 'n') {
      datasetAddColumnNumber(d, p+1);
    } else if (tok[0] == 'p') {
      datasetAddColumnPrediction(d, p+1);
    } else {
      fprintf(stderr,"not sure what to do with '%s'\n", tok);
    }
  }
  free(dup);
}


// Parse one line of data and append it to the dataset as a new row.
//
// Fields are separated by any run of space, tab, comma, CR or LF (the same
// set the header parser uses, so CRLF input no longer leaves a '\r' glued to
// the last field). Field k belongs to column k:
//   - LABEL column: the text is copied; "?" means missing (NULL).
//   - any other column: the text is converted with atof(); "?" means missing (NAN).
// Columns for which the line has no field are left missing (NULL / NAN)
// rather than 0.0. A line with more fields than columns trips the assert.
void datasetAddDataLine(datasetType *d, char *line) {
  static const char delims[] = " \t,\r\n";

  char *dup = strdup(line); // strtok() modifies its input, so parse a copy
  assert(dup);

  attrType *row = calloc(d->columns, sizeof(attrType)); assert(row);

  // Start with every cell marked missing. Label cells are already NULL from
  // calloc; numeric cells need an explicit NAN.
  for (size_t c = 0; c < d->columns; c++) {
    if (d->category[c] != LABEL) {
      row[c].v.number = NAN;
    }
  }

  size_t column = 0;
  for (char *tok = strtok(dup, delims); tok != NULL; tok = strtok(NULL, delims)) {
    assert(column < d->columns);

    const int missing = (strcmp(tok, "?") == 0);
    if (!missing) {
      if (d->category[column] == LABEL) {
        row[column].v.label = strdup(tok);
        assert(row[column].v.label);
      } else {
        row[column].v.number = atof(tok);
      }
    }
    column++;
  }

  datasetAddRow(d, row);
  free(dup);
}

// Add every newline-terminated line of `string` to the dataset as a row.
//
// A header must have been set first (d->columns > 0); otherwise a message is
// written to stderr and nothing is added. Each line, without its '\n', is
// handed to datasetAddDataLine(). Text after the last '\n' is not processed.
//
// Example:
// datasetAddDataString(d, "usa 32 1 3 50\n");
void datasetAddDataString(datasetType *d, char *string) {
  if (d->columns == 0) {
    fprintf(stderr, "A header is required first\n");
    return;
  }

  char *pos = string, *nl = NULL;
  while ((nl = strchr(pos, '\n')) != NULL) {
    const size_t len = (size_t)(nl - pos);

    // Copy exactly this line. memcpy avoids snprintf("%s") measuring the
    // whole remaining buffer for every line.
    char *row = calloc(len + 1, 1);
    assert(row);
    memcpy(row, pos, len);

    datasetAddDataLine(d, row); // will parse and create
    free(row);

    pos = nl + 1;
  }
}



// Write the dataset to stdout as a JSON-style object.
//
// First one member per column ("<name>": "<type>") is printed, then a "data"
// array holding one object per row. Missing values (NULL label, NAN number)
// are written as the string "?". Names and labels are printed verbatim.
void datasetDumpJSON(datasetType *d) {
  const size_t ncols = d->columns;
  const size_t nrows = d->rows;

  printf("{ \n");

  // column descriptions
  for (size_t c = 0; c < ncols; c++) {
    printf("\t");
    printf("\"%s\": ", d->name[c]);
    switch (d->category[c]) {
      case LABEL:
        printf("\"%s\"", "Label");
        break;
      case PREDICTION:
        printf("\"%s\"", "Prediction");
        break;
      case NUMBER:
        printf("\"%s\"", "Number");
        break;
      default:
        break; // other categories print no type string
    }
    // every description is followed by a comma: the "data" member comes next
    printf(",");
    printf("\n");
  }

  // rows
  printf("\t\"data\": [\n");
  for (size_t r = 0; r < nrows; r++) {
    const attrType *cells = d->data[r];

    printf("{\n");
    for (size_t c = 0; c < ncols; c++) {
      printf("\t\"%s\": ", d->name[c]);
      if (d->category[c] == LABEL) {
        const char *text = cells[c].v.label;
        printf("\"%s\"", text ? text : "?");
      } else if (isnan(cells[c].v.number)) {
        printf("\"?\"");
      } else {
        printf("%lf", cells[c].v.number);
      }
      if (c + 1 < ncols) { // separator between members, none after the last
        printf(",");
      }
      printf("\n");
    }
    printf("}");
    if (r + 1 < nrows) { // separator between rows, none after the last
      printf(",");
    }
    printf("\n");
  }
  printf("]\n");

  printf("}\n");
}


#include "numList.h"

// Print a one-line description of every column to stdout and, for each
// column that is not a LABEL, a summary of its values.
//
// For a non-LABEL column all non-NAN values are added to a temporary
// numListType, which is then passed to nlSummary() and released.
void datasetStats(datasetType *d) {
  for (size_t i = 0; i < d->columns; i++) {
    // %zu is the conversion for size_t; the enum is cast to match %d
    printf("[%zu] %s (%d)\n", i, d->name[i], (int)d->category[i]);
    if (d->category[i] == LABEL) {
      continue; // label cells hold strings, nothing numeric to summarise
    }

    numListType nl;
    nlInit(&nl, 100000); // NOTE(review): meaning of 100000 is defined by numList -- confirm
    for (size_t j = 0; j < d->rows; j++) {
      const double value = d->data[j][i].v.number;
      if (!isnan(value)) { // NAN marks a missing value
        nlAdd(&nl, value);
      }
    }
    nlSummary(&nl);
    nlFree(&nl);
  }
}
48 changes: 48 additions & 0 deletions spit/dataset.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#ifndef __DATASET_H
#define __DATASET_H

#include <unistd.h>

// Category of a dataset column (one entry per column in datasetType.category).
// NOTE(review): dataset.c only ever assigns LABEL, NUMBER and PREDICTION to a
// column; the intended handling of ENUM, ZNUMBER and DERIVED is not visible
// in this code -- confirm before relying on them.
enum thetype {
LABEL, // unique like name or IP; cells use attrType.v.label (a string)
ENUM, // string class
NUMBER, // cells use attrType.v.number
ZNUMBER,
DERIVED,
PREDICTION // dataset.c stores these cells in v.number, like NUMBER
};

// One cell of a dataset row.
typedef struct {
// Cell value. Which member is valid depends on the column's category:
// dataset.c uses `label` for LABEL columns and `number` for every other one.
union {
char *label; // NULL means missing
double number; // NAN means missing
} v;
// NOTE(review): mean and sd are not read or written by dataset.c; presumably
// per-value statistics for later use -- confirm.
double mean;
double sd;
} attrType;

// A table of rows and typed, named columns.
typedef struct {
size_t columns; // number of columns; length of name[] and category[]
char **name; // column names, each a separately allocated string
enum thetype *category; // category of each column

size_t rows; // number of rows; length of data[]
attrType **data; // data[row] is an array of `columns` cells
} datasetType;

// Allocate a new, empty dataset (no columns, no rows).
datasetType * datasetInit(void);

// Append a column of the given category; `name` is copied.
// datasetAddColumnPrediction also prints an informational message to stderr
// for each prediction column that already exists.
void datasetAddColumnLabel(datasetType *d, char *name);
void datasetAddColumnNumber(datasetType *d, char *name);
void datasetAddColumnPrediction(datasetType *d, char *name);
// Free the dataset and everything it holds, then set *din to NULL.
void datasetFree(datasetType **din);

// Print the column descriptions and all rows to stdout in a JSON-style layout.
void datasetDumpJSON(datasetType *d);
// Append `row` (an array of d->columns cells). The pointer is stored, not
// copied, and is freed by datasetFree().
void datasetAddRow(datasetType *d, attrType *row);

// Add columns from a header line of <type>:<name> fields, where the type
// character is 'l' (label), 'n' (number) or 'p' (prediction).
void datasetInitHeader(datasetType *d, char *headerline);
// Add one row per newline-terminated line of `string`; "?" marks a missing
// value. A header must have been set first.
void datasetAddDataString(datasetType *d, char *string);

// Print each column's index, name and category, plus a numeric summary for
// every column that is not a LABEL.
void datasetStats(datasetType *d);

#endif
27 changes: 27 additions & 0 deletions spit/datasetMain.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#include <assert.h>
#include <malloc.h>
#include <string.h>

#include "dataset.h"

// Small demonstration driver: builds a five-column dataset from literal
// header and data strings, dumps it as JSON, prints per-column statistics
// and frees it again. Command-line arguments are ignored.
int main(int argc, char *argv[]) {
  (void)argv;
  (void)argc;

  // input text, including a CRLF line, missing values ("?") and a
  // comma-separated line
  char header[] = "l:country n:age\t n:amount \tn:items p:class\t\n";
  char rows[] = "? 1 3 4 5\r\n3 4 2 3 ?\nbungy 3 4 5 sisd\nbongo,3,45,5,99\nok 4 5 6 98\n";

  datasetType *d = datasetInit();

  datasetInitHeader(d, header);
  datasetAddDataString(d, rows);

  datasetDumpJSON(d);
  datasetStats(d);

  // datasetFree() must also reset our pointer
  datasetFree(&d);
  assert(d==NULL);

  return 0;
}

0 comments on commit c0cc299

Please sign in to comment.