multi gpu, update ncnn
nihui committed Jul 11, 2020
1 parent 1a7c13e commit 04f93f6
Showing 6 changed files with 146 additions and 56 deletions.
184 changes: 145 additions & 39 deletions src/main.cpp
@@ -53,8 +53,40 @@ static wchar_t getopt(int argc, wchar_t* const argv[], const wchar_t* optstring)

    return opt;
}

static std::vector<int> parse_optarg_int_array(const wchar_t* optarg)
{
    std::vector<int> array;
    array.push_back(_wtoi(optarg));

    const wchar_t* p = wcschr(optarg, L',');
    while (p)
    {
        p++;
        array.push_back(_wtoi(p));
        p = wcschr(p, L',');
    }

    return array;
}
#else // _WIN32
#include <unistd.h> // getopt()

static std::vector<int> parse_optarg_int_array(const char* optarg)
{
    std::vector<int> array;
    array.push_back(atoi(optarg));

    const char* p = strchr(optarg, ',');
    while (p)
    {
        p++;
        array.push_back(atoi(p));
        p = strchr(p, ',');
    }

    return array;
}
#endif // _WIN32
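The helper above is worth a quick illustration (an editor's sketch, not part of the commit; the example inputs are hypothetical):

// parse_optarg_int_array splits a comma-separated option value into ints:
//   parse_optarg_int_array("2")     -> {2}
//   parse_optarg_int_array("0,1,2") -> {0, 1, 2}
// There is no validation: an empty field parses as 0 (e.g. "0,,2" -> {0, 0, 2}),
// so the range checks later in main() are the only guard against bad values.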

// ncnn
@@ -75,10 +75,10 @@ static void print_usage()
    fprintf(stderr, " -o output-path output image path (png/webp) or directory\n");
    fprintf(stderr, " -n noise-level denoise level (-1/0/1/2/3/4/5/6/7/8/9/10, default=3)\n");
    fprintf(stderr, " -s scale upscale ratio (2/3/4, default=2)\n");
    fprintf(stderr, " -t tile-size tile size (>=32/0=auto, default=0)\n");
    fprintf(stderr, " -t tile-size tile size (>=32/0=auto, default=0) can be 0,0,0 for multi-gpu\n");
    fprintf(stderr, " -m model-path srmd model path (default=models-srmd)\n");
    fprintf(stderr, " -g gpu-id gpu device to use (default=0)\n");
    fprintf(stderr, " -j load:proc:save thread count for load/proc/save (default=1:2:2)\n");
    fprintf(stderr, " -g gpu-id gpu device to use (default=auto) can be 0,1,2 for multi-gpu\n");
    fprintf(stderr, " -j load:proc:save thread count for load/proc/save (default=1:2:2) can be 1:2,2,2:2 for multi-gpu\n");
    fprintf(stderr, " -x enable tta mode\n");
    fprintf(stderr, " -f format output image format (png/webp, default=ext/png)\n");
}
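To make the new multi-GPU syntax concrete (the binary name and file names below are illustrative, not from this commit), an invocation could look like srmd-ncnn-vulkan -i input.jpg -o output.png -n 3 -s 2 -g 0,1,2 -j 2:2,2,2:2 -t 0,0,0. Here -g lists one Vulkan device per entry, -j keeps single load and save thread counts while giving each listed device its own proc thread count, and -t supplies one tile size per device (0 keeps the automatic choice).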
@@ -374,11 +406,11 @@ int main(int argc, char** argv)
    path_t outputpath;
    int noise = 3;
    int scale = 2;
    int tilesize = 0;
    std::vector<int> tilesize;
    path_t model = PATHSTR("models-srmd");
    int gpuid = 0;
    std::vector<int> gpuid;
    int jobs_load = 1;
    int jobs_proc = 2;
    std::vector<int> jobs_proc;
    int jobs_save = 2;
    int verbose = 0;
    int tta_mode = 0;
@@ -404,16 +436,17 @@ int main(int argc, char** argv)
            scale = _wtoi(optarg);
            break;
        case L't':
            tilesize = _wtoi(optarg);
            tilesize = parse_optarg_int_array(optarg);
            break;
        case L'm':
            model = optarg;
            break;
        case L'g':
            gpuid = _wtoi(optarg);
            gpuid = parse_optarg_int_array(optarg);
            break;
        case L'j':
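            // -j is now "load:proc[,proc,...]:save": the scan reads the load and save
            // counts, %*[^:] skips the middle field, and that field is re-parsed below
            // into a per-GPU list (the non-Windows branch further down does the same).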
            swscanf(optarg, L"%d:%d:%d", &jobs_load, &jobs_proc, &jobs_save);
            swscanf(optarg, L"%d:%*[^:]:%d", &jobs_load, &jobs_save);
            jobs_proc = parse_optarg_int_array(wcschr(optarg, L':') + 1);
            break;
        case L'f':
            format = optarg;
@@ -449,16 +482,17 @@ int main(int argc, char** argv)
            scale = atoi(optarg);
            break;
        case 't':
            tilesize = atoi(optarg);
            tilesize = parse_optarg_int_array(optarg);
            break;
        case 'm':
            model = optarg;
            break;
        case 'g':
            gpuid = atoi(optarg);
            gpuid = parse_optarg_int_array(optarg);
            break;
        case 'j':
            sscanf(optarg, "%d:%d:%d", &jobs_load, &jobs_proc, &jobs_save);
            sscanf(optarg, "%d:%*[^:]:%d", &jobs_load, &jobs_save);
            jobs_proc = parse_optarg_int_array(strchr(optarg, ':') + 1);
            break;
        case 'f':
            format = optarg;
@@ -489,18 +523,42 @@ int main(int argc, char** argv)
        return -1;
    }

    if (tilesize != 0 && tilesize < 32)
    if (tilesize.size() != (gpuid.empty() ? 1 : gpuid.size()) && !tilesize.empty())
    {
        fprintf(stderr, "invalid tilesize argument\n");
        return -1;
    }

    if (jobs_load < 1 || jobs_proc < 1 || jobs_save < 1)
    for (int i=0; i<(int)tilesize.size(); i++)
    {
        if (tilesize[i] != 0 && tilesize[i] < 32)
        {
            fprintf(stderr, "invalid tilesize argument\n");
            return -1;
        }
    }

    if (jobs_load < 1 || jobs_save < 1)
    {
        fprintf(stderr, "invalid thread count argument\n");
        return -1;
    }

    if (jobs_proc.size() != (gpuid.empty() ? 1 : gpuid.size()) && !jobs_proc.empty())
    {
        fprintf(stderr, "invalid jobs_proc thread count argument\n");
        return -1;
    }

    for (int i=0; i<(int)jobs_proc.size(); i++)
    {
        if (jobs_proc[i] < 1)
        {
            fprintf(stderr, "invalid jobs_proc thread count argument\n");
            return -1;
        }
    }

    if (!path_is_directory(outputpath))
    {
        // guess format from outputpath no matter what format argument specified
@@ -605,49 +663,82 @@ int main(int argc, char** argv)

    ncnn::create_gpu_instance();

    if (gpuid.empty())
    {
        gpuid.push_back(ncnn::get_default_gpu_index());
    }

    const int use_gpu_count = (int)gpuid.size();

    if (jobs_proc.empty())
    {
        jobs_proc.resize(use_gpu_count, 2);
    }

    if (tilesize.empty())
    {
        tilesize.resize(use_gpu_count, 0);
    }
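In other words, when -g is omitted the tool now asks ncnn for its default Vulkan device instead of assuming device 0 (hence "default=auto" in the usage text), and any per-GPU proc thread counts or tile sizes that were not supplied fall back to 2 and 0 (auto), one entry per selected device.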

    int cpu_count = std::max(1, ncnn::get_cpu_count());
    jobs_load = std::min(jobs_load, cpu_count);
    jobs_save = std::min(jobs_save, cpu_count);

    int gpu_count = ncnn::get_gpu_count();
    if (gpuid < 0 || gpuid >= gpu_count)
    for (int i=0; i<use_gpu_count; i++)
    {
        fprintf(stderr, "invalid gpu device\n");
        if (gpuid[i] < 0 || gpuid[i] >= gpu_count)
        {
            fprintf(stderr, "invalid gpu device\n");

        ncnn::destroy_gpu_instance();
        return -1;
            ncnn::destroy_gpu_instance();
            return -1;
        }
    }

    int gpu_queue_count = ncnn::get_gpu_info(gpuid).compute_queue_count;
    jobs_proc = std::min(jobs_proc, gpu_queue_count);
    int total_jobs_proc = 0;
    for (int i=0; i<use_gpu_count; i++)
    {
        int gpu_queue_count = ncnn::get_gpu_info(gpuid[i]).compute_queue_count;
        jobs_proc[i] = std::min(jobs_proc[i], gpu_queue_count);
        total_jobs_proc += jobs_proc[i];
    }

    if (tilesize == 0)
    for (int i=0; i<use_gpu_count; i++)
    {
        uint32_t heap_budget = ncnn::get_gpu_device(gpuid)->get_heap_budget();
        if (tilesize[i] != 0)
            continue;

        uint32_t heap_budget = ncnn::get_gpu_device(gpuid[i])->get_heap_budget();

        // more fine-grained tilesize policy here
        if (model.find(PATHSTR("models-srmd")) != path_t::npos)
        {
            if (heap_budget > 2600)
                tilesize = 400;
                tilesize[i] = 400;
            else if (heap_budget > 740)
                tilesize = 200;
                tilesize[i] = 200;
            else if (heap_budget > 250)
                tilesize = 100;
                tilesize[i] = 100;
            else
                tilesize = 32;
                tilesize[i] = 32;
        }
    }
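Read as a worked example (using the thresholds from the code above): a device whose reported heap budget is above 2600 gets a 400-pixel tile, above 740 gets 200, above 250 gets 100, and anything lower falls back to the minimum tile of 32; devices whose tile size was given explicitly on the command line are skipped by the continue above.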

    {
        SRMD srmd(gpuid, tta_mode);
        std::vector<SRMD*> srmd(use_gpu_count);

        srmd.load(parampath, modelpath);
        for (int i=0; i<use_gpu_count; i++)
        {
            srmd[i] = new SRMD(gpuid[i], tta_mode);

        srmd.noise = noise;
        srmd.scale = scale;
        srmd.tilesize = tilesize;
        srmd.prepadding = prepadding;
            srmd[i]->load(parampath, modelpath);

            srmd[i]->noise = noise;
            srmd[i]->scale = scale;
            srmd[i]->tilesize = tilesize[i];
            srmd[i]->prepadding = prepadding;
        }

        // main routine
        {
@@ -661,13 +752,22 @@ int main(int argc, char** argv)
            ncnn::Thread load_thread(load, (void*)&ltp);

            // srmd proc
            ProcThreadParams ptp;
            ptp.srmd = &srmd;
            std::vector<ProcThreadParams> ptp(use_gpu_count);
            for (int i=0; i<use_gpu_count; i++)
            {
                ptp[i].srmd = srmd[i];
            }

            std::vector<ncnn::Thread*> proc_threads(jobs_proc);
            for (int i=0; i<jobs_proc; i++)
            std::vector<ncnn::Thread*> proc_threads(total_jobs_proc);
            {
                proc_threads[i] = new ncnn::Thread(proc, (void*)&ptp);
                int total_jobs_proc_id = 0;
                for (int i=0; i<use_gpu_count; i++)
                {
                    for (int j=0; j<jobs_proc[i]; j++)
                    {
                        proc_threads[total_jobs_proc_id++] = new ncnn::Thread(proc, (void*)&ptp[i]);
                    }
                }
            }
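For instance (illustrative numbers), with two devices selected and jobs_proc clamped to {2, 3}, total_jobs_proc is 5: the first two proc threads are created with ptp[0] and so run on gpuid[0], the next three with ptp[1] on gpuid[1], and the same total is used further down to post the end tasks and to join and delete every proc thread.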

            // save image
@@ -686,12 +786,12 @@ int main(int argc, char** argv)
            Task end;
            end.id = -233;

            for (int i=0; i<jobs_proc; i++)
            for (int i=0; i<total_jobs_proc; i++)
            {
                toproc.put(end);
            }

            for (int i=0; i<jobs_proc; i++)
            for (int i=0; i<total_jobs_proc; i++)
            {
                proc_threads[i]->join();
                delete proc_threads[i];
@@ -708,6 +808,12 @@ int main(int argc, char** argv)
                delete save_threads[i];
            }
        }

        for (int i=0; i<use_gpu_count; i++)
        {
            delete srmd[i];
        }
        srmd.clear();
    }

    ncnn::destroy_gpu_instance();
2 changes: 1 addition & 1 deletion src/ncnn
Submodule ncnn updated 793 files
4 changes: 0 additions & 4 deletions src/srmd_postproc.comp
@@ -14,10 +14,6 @@

layout (constant_id = 0) const int bgr = 0;

layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) readonly buffer alpha_blob { sfp alpha_blob_data[]; };
#if NCNN_int8_storage
4 changes: 0 additions & 4 deletions src/srmd_postproc_tta.comp
@@ -14,10 +14,6 @@

layout (constant_id = 0) const int bgr = 0;

layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

layout (binding = 0) readonly buffer bottom_blob0 { sfp bottom_blob0_data[]; };
layout (binding = 1) readonly buffer bottom_blob1 { sfp bottom_blob1_data[]; };
layout (binding = 2) readonly buffer bottom_blob2 { sfp bottom_blob2_data[]; };
4 changes: 0 additions & 4 deletions src/srmd_preproc.comp
@@ -14,10 +14,6 @@

layout (constant_id = 0) const int bgr = 0;

layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_int8_storage
layout (binding = 0) readonly buffer bottom_blob { uint8_t bottom_blob_data[]; };
#else
4 changes: 0 additions & 4 deletions src/srmd_preproc_tta.comp
@@ -14,10 +14,6 @@

layout (constant_id = 0) const int bgr = 0;

layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_int8_storage
layout (binding = 0) readonly buffer bottom_blob { uint8_t bottom_blob_data[]; };
#else
