Skip to content

Commit

Permalink
Merge pull request #193 from friskluft/POL3837_polus_fpattern_support
Browse files Browse the repository at this point in the history
POL3837 Polus-style file patterns in 3D
  • Loading branch information
sameeul authored Jan 23, 2024
2 parents effc33c + 75acac2 commit 0ace25f
Show file tree
Hide file tree
Showing 6 changed files with 143 additions and 54 deletions.
2 changes: 1 addition & 1 deletion src/nyx/environment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ bool Environment::parse_cmdline(int argc, char** argv)
case 3:
if (check_3d_file_pattern(rawFilePattern) == false)
{
std::cerr << "Error: invalid 3D file pattern '" << rawFilePattern << "' \n";
std::cerr << "Error: invalid 3D file pattern " << rawFilePattern << " : " << this->file_pattern_3D.get_ermsg() << '\n';
return false;
}
break;
Expand Down
2 changes: 1 addition & 1 deletion src/nyx/environment_basic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ bool BasicEnvironment::check_2d_file_pattern(const std::string& pat)

bool BasicEnvironment::check_3d_file_pattern(const std::string& pat)
{
return file_pattern_3D.set_pattern (pat);
return file_pattern_3D.set_filepattern (pat);
}

std::string BasicEnvironment::get_file_pattern()
Expand Down
3 changes: 2 additions & 1 deletion src/nyx/python/new_bindings_py.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,8 +263,9 @@ py::tuple featurize_directory_3D_imp(
theEnvironment.set_dim(3);

// Check and cache the file pattern
std::string ermsg;
if (!theEnvironment.check_3d_file_pattern(file_pattern))
throw std::invalid_argument("Invalid file pattern " + file_pattern);
throw std::invalid_argument("Invalid file pattern " + file_pattern + " : " + theEnvironment.file_pattern_3D.get_ermsg());

// No need to set the raw file pattern separately for 3D
// theEnvironment.set_file_pattern(file_pattern);
Expand Down
166 changes: 122 additions & 44 deletions src/nyx/strpat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,63 @@ StringPattern::StringPattern()
{
}

// Example of a valid pat: BRATS_{d+}_z{set d+}_t{d+}.ome.tif for BRATS_001_z004_t002.ome.tif
bool StringPattern::set_filepattern (const std::string & pat)
{
// parse a Polus-style filepattern
const std::string magicAnyStr = "mzmzmzmzmzmzmzmzmzmzmzm", // a string highly unlikely to happen to be a part of file name
magicAnyNum = "18446744073709551615" "000", // int value that will never occur (max 64-bit int \times 10^3)
magicStarNum = "18446744073709551615" "111"; // int value that will never occur (max 64-bit int \times 10^3 + 111)

// replace all {d+} with NUM
std::string repl1 = std::regex_replace (pat, std::regex("\\{d\\+\\}"), magicAnyNum);

// replace all {c+} with TEXT
std::string repl2 = std::regex_replace (repl1, std::regex("\\{c\\+\\}"), magicAnyStr);

// replace all {set d+} or its variant {set,d+} with =*
std::string repl3 = std::regex_replace (repl2, std::regex("\\{set d\\+\\}"), magicStarNum);
std::string repl4 = std::regex_replace (repl3, std::regex("\\{set,d\\+\\}"), magicStarNum);

// validate all the expressions in curly brackets
if (repl4.find("{") != std::string::npos || repl4.find("}") != std::string::npos)
{
ermsg_ = "illegal {Expression}. Only {d+}, {c+}, and {set d+} or {set,d+} are permitted";
return false;
}

// now lexify this file pattern into a raw pattern to produce a grammar
std::vector<std::string> tokCodes;
std::vector<std::string> tokVals;

bool ok = tokenize(
repl4,
tokCodes,
tokVals);

if (!ok)
return false;

std::string join;
for (int i = 0; i < tokCodes.size(); i++)
{
if (tokVals[i] == magicAnyStr || tokVals[i] == magicAnyNum)
join += tokCodes[i];
else
if(tokVals[i] == magicStarNum)
join += tokCodes[i] + "=*";
else
join += tokCodes[i] + "=" + tokVals[i];
join += " ";
}

ok = set_raw_pattern(join);

return ok;
}

// initialize the file pattern object with a string
bool StringPattern::set_pattern(const std::string& s)
bool StringPattern::set_raw_pattern (const std::string& s)
{
// Cache the pattern string no matter if it's correct or not
cached_pattern_string = s;
Expand All @@ -15,7 +70,6 @@ bool StringPattern::set_pattern(const std::string& s)
if (!filepatt_to_grammar(s, grammar_, ermsg_))
{
good_ = false;
ermsg_ = "tokenize error";
return false;
}

Expand All @@ -37,6 +91,18 @@ std::string StringPattern::get_ermsg() const
return ermsg_;
}

// Returns the context (qualifier) of a grammar term: everything that follows
// the first '=' in 'term', for example "*" for "NUM=*" or "z" for "TEXT=z".
// Returns an empty string if the term contains no '='.
std::string StringPattern::get_term_context (const std::string & term)
{
	const size_t eqPos = term.find("=");

	// a bare term such as "NUM" has no context
	if (eqPos == std::string::npos)
		return "";

	// the context runs from just after '=' to the end of the term
	return term.substr(eqPos + 1);
}

// returns true if a string matches the pattern
bool StringPattern::match (const std::string& s, std::map<std::string, std::vector<std::string>> & imgDirs, std::string & external_ermsg) const
{
Expand All @@ -53,8 +119,12 @@ bool StringPattern::match (const std::string& s, std::map<std::string, std::vect
return false;
}

// check if 's' matches the grammar in the number of tokens
if (tokCodes.size() != grammar_.size())
return false;

// check the file name string versus a grammar of 3D layout A
std::vector<std::string> state;
std::string aggrValue;
std::string mapKey;

// check grammar
Expand All @@ -67,7 +137,7 @@ bool StringPattern::match (const std::string& s, std::map<std::string, std::vect
std::string pureTerm = term,
termContext;

// if we have a token with a qualifying constant, tear the token off
// if we have a token with a qualifying constant, tear the token apart into a pure term and its context
size_t idxFound = term.find("=");

bool haveEq = false;
Expand All @@ -81,7 +151,7 @@ bool StringPattern::match (const std::string& s, std::map<std::string, std::vect
// grammar check
if (tokCodes[i] != pureTerm)
{
external_ermsg = "after " + mapKey + " expecting " + pureTerm + " while actual is " + tokCodes[i] + " (" + tokVals[i] + "), so skipping file " + s;
external_ermsg = "skipping " + mapKey + tokCodes[i] + " not ending " + pureTerm;
return false;
}

Expand All @@ -92,55 +162,49 @@ bool StringPattern::match (const std::string& s, std::map<std::string, std::vect
}
else
{
if (pureTerm == "TEXT")
// token not matching the qualifier ?
if (termContext != "*" && termContext != tokVals[i])
{
external_ermsg = "skipping " + mapKey + termContext + " not ending " + tokVals[i];
return false;
}

// ok, matching

if (pureTerm == t_TEXT)
{
mapKey += tokVals[i];
state.push_back(termContext); // for example "z"
//---state.push_back(termContext); // for example "z"
continue;
}
if (pureTerm == "NUM")
if (pureTerm == t_SEP)
{
mapKey += (termContext == "*" ? termContext : tokVals[i]);
state.push_back(tokVals[i]); // for example "457"
mapKey += tokVals[i];
continue;
}
if (pureTerm == "#")
if (pureTerm == t_NUM)
{
// merge the rest of the input string with the mapping key and quit traversing the grammar
for (int j = i; j < tokCodes.size(); j++)
mapKey += tokCodes[i];
break; // quit the grammar check
mapKey += (termContext == "*" ? termContext : tokVals[i]);
aggrValue = tokVals[i]; // for example "0457" in "z0457"

continue;
}

}
} //- grammar walk

// if we are at this point, syntax is OK. Now update filename's association with a value mined from a set term
// no match with an aggregator?
if (state.size() == 0)
// make an aggregation action using an aggregator name-value(s) tuple cached in state
auto imdir = imgDirs.find(mapKey);
if (imdir == imgDirs.end())
{
external_ermsg = "expecting " + s + " to contain an aggregator";
return false;
std::vector zValues { aggrValue };
imgDirs[mapKey] = zValues;
}
// incomplete aggregator match?
if (state.size() == 1)
{
external_ermsg = "incomplete aggregator in " + s;
return false;
}
// make an aggregation action using an aggregator name-value(s) tuple cached in state
if (state[0] == "z")
else
{
auto imdir = imgDirs.find(mapKey);
if (imdir == imgDirs.end())
{
std::vector zValues{ state[1] };
imgDirs[mapKey] = zValues;
}
else
{
std::vector<std::string>& zValues = imdir->second;
zValues.push_back(state[1]);
}
std::vector<std::string>& zValues = imdir->second;
zValues.push_back (aggrValue);
}

return true;
Expand All @@ -157,9 +221,10 @@ bool StringPattern::tokenize (
// use std::vector instead, we need to have it in this order
std::vector<std::pair<std::string, std::string>> v
{
{ "[0-9]+" , "NUM" } ,
{ "[a-z]+|[A-Z]+" , "TEXT" },
{ "~|`|!|@|#|\\$|%|\\^|&|\\(|\\)|_|-|\\+|=|\\{|\\}|\\[|]|'|;|,|\\.", "SEP" }
{ "[0-9]+" , t_NUM } ,
{ "[a-z]+|[A-Z]+" , t_TEXT },
{ "~|`|!|@|#|\\$|%|\\^|&|\\(|\\)|_|-|\\+|=|\\{|\\}|\\[|]|'|;|,|\\.", t_SEP },
{ "\\*", t_STAR }
};

std::string reg;
Expand Down Expand Up @@ -199,26 +264,39 @@ bool StringPattern::tokenize (
// Converts a raw pattern into a grammar.
//
// filePatt  - space-separated terms, each starting with TEXT, NUM, or SEP and
//             optionally followed by "=<context>" (for example "TEXT=z NUM=* SEP=.")
// grammar   - [out] cleared first, then filled with the terms in their order of
//             appearance; on an illegal term it keeps the terms accepted so far
// errMsg    - [out] set only on failure
//
// Returns true if every term is legal and exactly one term is an aggregator
// (has the context "*"); otherwise returns false and sets errMsg.
//
// The pattern is split directly on the std::string, so there is no temporary
// C buffer to release on the early-return paths and no dependence on the
// shared internal state of std::strtok.
bool StringPattern::filepatt_to_grammar(const std::string& filePatt, std::vector<std::string>& grammar, std::string& errMsg)
{
	grammar.clear();

	const char delimiter = ' ';
	int n_aggrs = 0;

	// runs of delimiters are skipped, so empty terms are never produced
	size_t begin = filePatt.find_first_not_of(delimiter);
	while (begin != std::string::npos)
	{
		const size_t end = filePatt.find(delimiter, begin);
		const std::string strToken = filePatt.substr(begin, end == std::string::npos ? std::string::npos : end - begin);

		// check 1: illegal terms (a legal term starts with one of the terminals)
		if (strToken.find(t_TEXT) != 0 && strToken.find(t_NUM) != 0 && strToken.find(t_SEP) != 0)
		{
			errMsg = "error: " + strToken + " needs to be TEXT, NUM, or SEP";
			return false;
		}

		// check 2 (counting part): a term with the context "*" is an aggregator
		if (get_term_context(strToken) == "*")
			n_aggrs++;

		// save
		grammar.push_back(strToken);

		begin = (end == std::string::npos) ? std::string::npos : filePatt.find_first_not_of(delimiter, end);
	}

	// check 2: unique aggregator
	if (n_aggrs != 1)
	{
		errMsg = "error: aggregator needs to be unique (actual count is " + std::to_string(n_aggrs) + ")";
		return false;
	}

	return true;
}

Expand Down
23 changes: 17 additions & 6 deletions src/nyx/strpat.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,21 @@ class StringPattern
public:
StringPattern();

// initialize the file pattern object with a string
bool set_pattern(const std::string& s);
// Initialize the instance using a Polus-style filepattern (example: BRATS_{d+}_z{set d+}_t{d+}.ome.tif)
// Error details are available via get_ermsg()
bool set_filepattern(const std::string & pat);

// returns whether the file pattern is initialized and usable
// Initialize the instance using an explicit definition (example: "TEXT=BRATS SEP=_ NUM SEP=_ TEXT=z NUM=* SEP=_ TEXT=t NUM SEP=. TEXT=ome SEP=. TEXT=tif")
// Error details are available via get_ermsg()
bool set_raw_pattern(const std::string & pat);

// Returns whether the file pattern is initialized and usable
bool good() const;

// returns the last error message
// Returns the last error message
std::string get_ermsg() const;

// returns true if a string matches the pattern
// Returns true if a string matches the pattern
bool match (const std::string& s, std::map<std::string, std::vector<std::string>>& imgDirs, std::string& external_ermsg) const;

std::string get_cached_pattern_string() const;
Expand All @@ -28,12 +33,18 @@ class StringPattern
std::string ermsg_;
std::vector<std::string> grammar_;

// if successful, sets tokCodes and tokVals
// If successful, sets tokCodes and tokVals
bool tokenize(
const std::string & s,
std::vector<std::string> & tokCodes,
std::vector<std::string> & tokVals) const;

bool filepatt_to_grammar (const std::string& filePatt, std::vector<std::string>& grammar, std::string& errMsg);

std::string get_term_context (const std::string& term);

private:
// Terminals
const char *t_TEXT = "TEXT", *t_NUM = "NUM", *t_SEP = "SEP", *t_STAR = "STAR";

};
1 change: 0 additions & 1 deletion tests/test_initialization.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

using Nyxus::allocateTrivialRoisBuffers;
using Nyxus::freeTrivialRoisBuffers;
using namespace Nyxus;

void test_initialization() {

Expand Down

0 comments on commit 0ace25f

Please sign in to comment.