multi gpu, update ncnn
nihui committed Jul 11, 2020
1 parent 1a7c13e commit 04f93f6
Showing 6 changed files with 146 additions and 56 deletions.
184 changes: 145 additions & 39 deletions src/main.cpp
@@ -53,8 +53,40 @@ static wchar_t getopt(int argc, wchar_t* const argv[], const wchar_t* optstring)

    return opt;
}

static std::vector<int> parse_optarg_int_array(const wchar_t* optarg)
{
    std::vector<int> array;
    array.push_back(_wtoi(optarg));

    const wchar_t* p = wcschr(optarg, L',');
    while (p)
    {
        p++;
        array.push_back(_wtoi(p));
        p = wcschr(p, L',');
    }

    return array;
}
#else // _WIN32
#include <unistd.h> // getopt()

static std::vector<int> parse_optarg_int_array(const char* optarg)
{
    std::vector<int> array;
    array.push_back(atoi(optarg));

    const char* p = strchr(optarg, ',');
    while (p)
    {
        p++;
        array.push_back(atoi(p));
        p = strchr(p, ',');
    }

    return array;
}
#endif // _WIN32
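The helper above is worth a quick illustration (an editor's sketch, not part of the commit; the example inputs are hypothetical):

// parse_optarg_int_array splits a comma-separated option value into ints:
//   parse_optarg_int_array("2")     -> {2}
//   parse_optarg_int_array("0,1,2") -> {0, 1, 2}
// There is no validation: an empty field parses as 0 (e.g. "0,,2" -> {0, 0, 2}),
// so the range checks later in main() are the only guard against bad values.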

// ncnn
@@ -75,10 +75,10 @@ static void print_usage()
    fprintf(stderr, " -o output-path output image path (png/webp) or directory\n");
    fprintf(stderr, " -n noise-level denoise level (-1/0/1/2/3/4/5/6/7/8/9/10, default=3)\n");
    fprintf(stderr, " -s scale upscale ratio (2/3/4, default=2)\n");
    fprintf(stderr, " -t tile-size tile size (>=32/0=auto, default=0)\n");
    fprintf(stderr, " -t tile-size tile size (>=32/0=auto, default=0) can be 0,0,0 for multi-gpu\n");
    fprintf(stderr, " -m model-path srmd model path (default=models-srmd)\n");
    fprintf(stderr, " -g gpu-id gpu device to use (default=0)\n");
    fprintf(stderr, " -j load:proc:save thread count for load/proc/save (default=1:2:2)\n");
    fprintf(stderr, " -g gpu-id gpu device to use (default=auto) can be 0,1,2 for multi-gpu\n");
    fprintf(stderr, " -j load:proc:save thread count for load/proc/save (default=1:2:2) can be 1:2,2,2:2 for multi-gpu\n");
    fprintf(stderr, " -x enable tta mode\n");
    fprintf(stderr, " -f format output image format (png/webp, default=ext/png)\n");
}
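To make the new multi-GPU syntax concrete (the binary name and file names below are illustrative, not from this commit), an invocation could look like srmd-ncnn-vulkan -i input.jpg -o output.png -n 3 -s 2 -g 0,1,2 -j 2:2,2,2:2 -t 0,0,0. Here -g lists one Vulkan device per entry, -j keeps single load and save thread counts while giving each listed device its own proc thread count, and -t supplies one tile size per device (0 keeps the automatic choice).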
@@ -374,11 +406,11 @@ int main(int argc, char** argv)
    path_t outputpath;
    int noise = 3;
    int scale = 2;
    int tilesize = 0;
    std::vector<int> tilesize;
    path_t model = PATHSTR("models-srmd");
    int gpuid = 0;
    std::vector<int> gpuid;
    int jobs_load = 1;
    int jobs_proc = 2;
    std::vector<int> jobs_proc;
    int jobs_save = 2;
    int verbose = 0;
    int tta_mode = 0;
@@ -404,16 +436,17 @@ int main(int argc, char** argv)
            scale = _wtoi(optarg);
            break;
        case L't':
            tilesize = _wtoi(optarg);
            tilesize = parse_optarg_int_array(optarg);
            break;
        case L'm':
            model = optarg;
            break;
        case L'g':
            gpuid = _wtoi(optarg);
            gpuid = parse_optarg_int_array(optarg);
            break;
        case L'j':
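            // -j is now "load:proc[,proc,...]:save": the scan reads the load and save
            // counts, %*[^:] skips the middle field, and that field is re-parsed below
            // into a per-GPU list (the non-Windows branch further down does the same).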
            swscanf(optarg, L"%d:%d:%d", &jobs_load, &jobs_proc, &jobs_save);
            swscanf(optarg, L"%d:%*[^:]:%d", &jobs_load, &jobs_save);
            jobs_proc = parse_optarg_int_array(wcschr(optarg, L':') + 1);
            break;
        case L'f':
            format = optarg;
@@ -449,16 +482,17 @@ int main(int argc, char** argv)
            scale = atoi(optarg);
            break;
        case 't':
            tilesize = atoi(optarg);
            tilesize = parse_optarg_int_array(optarg);
            break;
        case 'm':
            model = optarg;
            break;
        case 'g':
            gpuid = atoi(optarg);
            gpuid = parse_optarg_int_array(optarg);
            break;
        case 'j':
            sscanf(optarg, "%d:%d:%d", &jobs_load, &jobs_proc, &jobs_save);
            sscanf(optarg, "%d:%*[^:]:%d", &jobs_load, &jobs_save);
            jobs_proc = parse_optarg_int_array(strchr(optarg, ':') + 1);
            break;
        case 'f':
            format = optarg;
@@ -489,18 +523,42 @@ int main(int argc, char** argv)
        return -1;
    }

    if (tilesize != 0 && tilesize < 32)
    if (tilesize.size() != (gpuid.empty() ? 1 : gpuid.size()) && !tilesize.empty())
    {
        fprintf(stderr, "invalid tilesize argument\n");
        return -1;
    }

    if (jobs_load < 1 || jobs_proc < 1 || jobs_save < 1)
    for (int i=0; i<(int)tilesize.size(); i++)
    {
        if (tilesize[i] != 0 && tilesize[i] < 32)
        {
            fprintf(stderr, "invalid tilesize argument\n");
            return -1;
        }
    }

    if (jobs_load < 1 || jobs_save < 1)
    {
        fprintf(stderr, "invalid thread count argument\n");
        return -1;
    }

    if (jobs_proc.size() != (gpuid.empty() ? 1 : gpuid.size()) && !jobs_proc.empty())
    {
        fprintf(stderr, "invalid jobs_proc thread count argument\n");
        return -1;
    }

    for (int i=0; i<(int)jobs_proc.size(); i++)
    {
        if (jobs_proc[i] < 1)
        {
            fprintf(stderr, "invalid jobs_proc thread count argument\n");
            return -1;
        }
    }

    if (!path_is_directory(outputpath))
    {
        // guess format from outputpath no matter what format argument specified
@@ -605,49 +663,82 @@ int main(int argc, char** argv)

    ncnn::create_gpu_instance();

    if (gpuid.empty())
    {
        gpuid.push_back(ncnn::get_default_gpu_index());
    }

    const int use_gpu_count = (int)gpuid.size();

    if (jobs_proc.empty())
    {
        jobs_proc.resize(use_gpu_count, 2);
    }

    if (tilesize.empty())
    {
        tilesize.resize(use_gpu_count, 0);
    }
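In other words, when -g is omitted the tool now asks ncnn for its default Vulkan device instead of assuming device 0 (hence "default=auto" in the usage text), and any per-GPU proc thread counts or tile sizes that were not supplied fall back to 2 and 0 (auto), one entry per selected device.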

    int cpu_count = std::max(1, ncnn::get_cpu_count());
    jobs_load = std::min(jobs_load, cpu_count);
    jobs_save = std::min(jobs_save, cpu_count);

    int gpu_count = ncnn::get_gpu_count();
    if (gpuid < 0 || gpuid >= gpu_count)
    for (int i=0; i<use_gpu_count; i++)
    {
        fprintf(stderr, "invalid gpu device\n");
        if (gpuid[i] < 0 || gpuid[i] >= gpu_count)
        {
            fprintf(stderr, "invalid gpu device\n");

        ncnn::destroy_gpu_instance();
        return -1;
            ncnn::destroy_gpu_instance();
            return -1;
        }
    }

    int gpu_queue_count = ncnn::get_gpu_info(gpuid).compute_queue_count;
    jobs_proc = std::min(jobs_proc, gpu_queue_count);
    int total_jobs_proc = 0;
    for (int i=0; i<use_gpu_count; i++)
    {
        int gpu_queue_count = ncnn::get_gpu_info(gpuid[i]).compute_queue_count;
        jobs_proc[i] = std::min(jobs_proc[i], gpu_queue_count);
        total_jobs_proc += jobs_proc[i];
    }

    if (tilesize == 0)
    for (int i=0; i<use_gpu_count; i++)
    {
        uint32_t heap_budget = ncnn::get_gpu_device(gpuid)->get_heap_budget();
        if (tilesize[i] != 0)
            continue;

        uint32_t heap_budget = ncnn::get_gpu_device(gpuid[i])->get_heap_budget();

        // more fine-grained tilesize policy here
        if (model.find(PATHSTR("models-srmd")) != path_t::npos)
        {
            if (heap_budget > 2600)
                tilesize = 400;
                tilesize[i] = 400;
            else if (heap_budget > 740)
                tilesize = 200;
                tilesize[i] = 200;
            else if (heap_budget > 250)
                tilesize = 100;
                tilesize[i] = 100;
            else
                tilesize = 32;
                tilesize[i] = 32;
        }
    }
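Read as a worked example (using the thresholds from the code above): a device whose reported heap budget is above 2600 gets a 400-pixel tile, above 740 gets 200, above 250 gets 100, and anything lower falls back to the minimum tile of 32; devices whose tile size was given explicitly on the command line are skipped by the continue above.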

    {
        SRMD srmd(gpuid, tta_mode);
        std::vector<SRMD*> srmd(use_gpu_count);

        srmd.load(parampath, modelpath);
        for (int i=0; i<use_gpu_count; i++)
        {
            srmd[i] = new SRMD(gpuid[i], tta_mode);

        srmd.noise = noise;
        srmd.scale = scale;
        srmd.tilesize = tilesize;
        srmd.prepadding = prepadding;
            srmd[i]->load(parampath, modelpath);

            srmd[i]->noise = noise;
            srmd[i]->scale = scale;
            srmd[i]->tilesize = tilesize[i];
            srmd[i]->prepadding = prepadding;
        }

        // main routine
        {
@@ -661,13 +752,22 @@ int main(int argc, char** argv)
            ncnn::Thread load_thread(load, (void*)&ltp);

            // srmd proc
            ProcThreadParams ptp;
            ptp.srmd = &srmd;
            std::vector<ProcThreadParams> ptp(use_gpu_count);
            for (int i=0; i<use_gpu_count; i++)
            {
                ptp[i].srmd = srmd[i];
            }

            std::vector<ncnn::Thread*> proc_threads(jobs_proc);
            for (int i=0; i<jobs_proc; i++)
            std::vector<ncnn::Thread*> proc_threads(total_jobs_proc);
            {
                proc_threads[i] = new ncnn::Thread(proc, (void*)&ptp);
                int total_jobs_proc_id = 0;
                for (int i=0; i<use_gpu_count; i++)
                {
                    for (int j=0; j<jobs_proc[i]; j++)
                    {
                        proc_threads[total_jobs_proc_id++] = new ncnn::Thread(proc, (void*)&ptp[i]);
                    }
                }
            }
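For instance (illustrative numbers), with two devices selected and jobs_proc clamped to {2, 3}, total_jobs_proc is 5: the first two proc threads are created with ptp[0] and so run on gpuid[0], the next three with ptp[1] on gpuid[1], and the same total is used further down to post the end tasks and to join and delete every proc thread.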

            // save image
@@ -686,12 +786,12 @@ int main(int argc, char** argv)
            Task end;
            end.id = -233;

            for (int i=0; i<jobs_proc; i++)
            for (int i=0; i<total_jobs_proc; i++)
            {
                toproc.put(end);
            }

            for (int i=0; i<jobs_proc; i++)
            for (int i=0; i<total_jobs_proc; i++)
            {
                proc_threads[i]->join();
                delete proc_threads[i];
@@ -708,6 +808,12 @@ int main(int argc, char** argv)
                delete save_threads[i];
            }
        }

        for (int i=0; i<use_gpu_count; i++)
        {
            delete srmd[i];
        }
        srmd.clear();
    }

    ncnn::destroy_gpu_instance();
2 changes: 1 addition & 1 deletion src/ncnn
Submodule ncnn updated 793 files
4 changes: 0 additions & 4 deletions src/srmd_postproc.comp
@@ -14,10 +14,6 @@

layout (constant_id = 0) const int bgr = 0;

layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) readonly buffer alpha_blob { sfp alpha_blob_data[]; };
#if NCNN_int8_storage
4 changes: 0 additions & 4 deletions src/srmd_postproc_tta.comp
@@ -14,10 +14,6 @@

layout (constant_id = 0) const int bgr = 0;

layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

layout (binding = 0) readonly buffer bottom_blob0 { sfp bottom_blob0_data[]; };
layout (binding = 1) readonly buffer bottom_blob1 { sfp bottom_blob1_data[]; };
layout (binding = 2) readonly buffer bottom_blob2 { sfp bottom_blob2_data[]; };
4 changes: 0 additions & 4 deletions src/srmd_preproc.comp
@@ -14,10 +14,6 @@

layout (constant_id = 0) const int bgr = 0;

layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_int8_storage
layout (binding = 0) readonly buffer bottom_blob { uint8_t bottom_blob_data[]; };
#else
4 changes: 0 additions & 4 deletions src/srmd_preproc_tta.comp
@@ -14,10 +14,6 @@

layout (constant_id = 0) const int bgr = 0;

layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

#if NCNN_int8_storage
layout (binding = 0) readonly buffer bottom_blob { uint8_t bottom_blob_data[]; };
#else
