Skip to content

Commit

Permalink
ORC-1012: Support specifying columns in orc-scan (#921)
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?

This PR adds a `-c` / `--columns` option to the orc-scan tool so that the columns to be read can be specified as a comma-separated list of column ids.

### Why are the changes needed?

Currently the orc-scan tool always scans all columns. Being able to specify columns is useful when profiling read performance on specific data types.

### How was this patch tested?

Manually tested with the new option.
Added tests in tools/test/TestFileScan.cc.

(cherry picked from commit 9dcd645)
Signed-off-by: Dongjoon Hyun <[email protected]>
  • Loading branch information
stiga-huang authored and dongjoon-hyun committed Dec 15, 2021
1 parent 5e4b3e9 commit 43b53ea
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 4 deletions.
25 changes: 21 additions & 4 deletions tools/src/FileScan.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,12 @@
#include <iostream>
#include <string>

void scanFile(std::ostream & out, const char* filename, uint64_t batchSize) {
void scanFile(std::ostream & out, const char* filename, uint64_t batchSize,
const orc::RowReaderOptions& rowReaderOpts) {
orc::ReaderOptions readerOpts;
std::unique_ptr<orc::Reader> reader =
orc::createReader(orc::readFile(filename), readerOpts);
std::unique_ptr<orc::RowReader> rowReader = reader->createRowReader();
std::unique_ptr<orc::RowReader> rowReader = reader->createRowReader(rowReaderOpts);
std::unique_ptr<orc::ColumnVectorBatch> batch =
rowReader->createRowBatch(batchSize);

Expand All @@ -48,14 +49,17 @@ int main(int argc, char* argv[]) {
static struct option longOptions[] = {
{"help", no_argument, ORC_NULLPTR, 'h'},
{"batch", required_argument, ORC_NULLPTR, 'b'},
{"columns", required_argument, ORC_NULLPTR, 'c'},
{ORC_NULLPTR, 0, ORC_NULLPTR, 0}
};
bool helpFlag = false;
uint64_t batchSize = 1024;
std::list<uint64_t> cols;
orc::RowReaderOptions rowReaderOptions;
int opt;
char *tail;
do {
opt = getopt_long(argc, argv, "hb:", longOptions, ORC_NULLPTR);
opt = getopt_long(argc, argv, "hb:c:", longOptions, ORC_NULLPTR);
switch (opt) {
case '?':
case 'h':
Expand All @@ -69,19 +73,32 @@ int main(int argc, char* argv[]) {
return 1;
}
break;
case 'c': {
char *col = std::strtok(optarg, ",");
while (col) {
cols.push_back(static_cast<uint64_t>(std::atoi(col)));
col = std::strtok(ORC_NULLPTR, ",");
}
if (!cols.empty()) {
rowReaderOptions.include(cols);
}
break;
}
default: break;
}
} while (opt != -1);
argc -= optind;
argv += optind;

if (argc < 1 || helpFlag) {
std::cerr << "Usage: orc-scan [-h] [--help]\n"
<< " [-c 1,2,...] [--columns=1,2,...]\n"
<< " [-b<size>] [--batch=<size>] <filename>\n";
return 1;
} else {
for(int i=0; i < argc; ++i) {
try {
scanFile(std::cout, argv[i], batchSize);
scanFile(std::cout, argv[i], batchSize, rowReaderOptions);
} catch (std::exception& ex) {
std::cerr << "Caught exception in " << argv[i]
<< ": " << ex.what() << "\n";
Expand Down
18 changes: 18 additions & 0 deletions tools/test/TestFileScan.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,22 @@ TEST (TestFileScan, testNominal) {
EXPECT_EQ("Rows: 32768\nBatches: 33\n", output);
EXPECT_EQ("", error);

EXPECT_EQ(0, runProgram({pgm, std::string("-c"), std::string("1,2,3,9"), file},
output, error));
EXPECT_EQ("Rows: 32768\nBatches: 33\n", output);
EXPECT_EQ("", error);

EXPECT_EQ(0, runProgram({pgm, std::string("-b"), std::string("256"), file},
output, error));
EXPECT_EQ("Rows: 32768\nBatches: 131\n", output);
EXPECT_EQ("", error);

EXPECT_EQ(0, runProgram({pgm, std::string("-b"), std::string("256"),
std::string("-c"), std::string("1,2,3"), file},
output, error));
EXPECT_EQ("Rows: 32768\nBatches: 131\n", output);
EXPECT_EQ("", error);

EXPECT_EQ(0, runProgram({pgm, std::string("-b256"), file}, output, error));
EXPECT_EQ("Rows: 32768\nBatches: 131\n", output);
EXPECT_EQ("", error);
Expand All @@ -52,6 +63,11 @@ TEST (TestFileScan, testNominal) {
output, error));
EXPECT_EQ("Rows: 32768\nBatches: 131\n", output);
EXPECT_EQ("", error);

EXPECT_EQ(0, runProgram({pgm, std::string("--batch=256"),
std::string("--columns=1,2,3"), file},output, error));
EXPECT_EQ("Rows: 32768\nBatches: 131\n", output);
EXPECT_EQ("", error);
}

/**
Expand Down Expand Up @@ -104,6 +120,7 @@ TEST (TestFileScan, testBadCommand) {
EXPECT_EQ("", output);
EXPECT_EQ("orc-scan: option requires an argument -- b\n"
"Usage: orc-scan [-h] [--help]\n"
" [-c 1,2,...] [--columns=1,2,...]\n"
" [-b<size>] [--batch=<size>] <filename>\n",
removeChars(stripPrefix(error, "orc-scan: "),"'`"));

Expand All @@ -122,6 +139,7 @@ TEST (TestFileScan, testBadCommand) {
EXPECT_EQ("", output);
EXPECT_EQ("orc-scan: option --batch requires an argument\n"
"Usage: orc-scan [-h] [--help]\n"
" [-c 1,2,...] [--columns=1,2,...]\n"
" [-b<size>] [--batch=<size>] <filename>\n",
removeChars(stripPrefix(error, "orc-scan: "), "'`"));

Expand Down

0 comments on commit 43b53ea

Please sign in to comment.