Expand discussion on nesting tasks and limitations #77

Merged (7 commits) on Nov 17, 2023
9 changes: 6 additions & 3 deletions docs/make.jl
@@ -2,14 +2,16 @@ using DataFlowTasks
using Documenter
using Literate

draft = false

# generate examples
for example in ["cholesky", "blur-roberts", "lcs", "sort"]
println("\n*** Generating $example example")
@time begin
dir = joinpath(DataFlowTasks.PROJECT_ROOT, "docs", "src", "examples", example)
src = joinpath(dir, "$(example).jl")
Literate.markdown(src, dir)
Literate.notebook(src, dir)
draft || Literate.notebook(src, dir)
end
end

@@ -22,7 +24,7 @@ println("\n*** Generating README")
include(src)

# Generate notebook
Literate.notebook(src, pwd())
draft || Literate.notebook(src, pwd())

# Generate markdown
# -> fix image paths to link to github.io
@@ -71,14 +73,14 @@ makedocs(;
"Getting started" => "index.md",
"Debugging & Profiling" => "profiling.md",
"Examples" => [
"examples/examples.md",
"examples/cholesky/cholesky.md",
"examples/blur-roberts/blur-roberts.md",
"examples/lcs/lcs.md",
"examples/sort/sort.md",
# "examples/stencil/stencil.md",
# "examples/lu/lu.md",
# "examples/hmat/hmat.md",
"examples/hardware.md"
],
# "Comparaison with Dagger.jl" => "dagger.md",
# "Common Issues" => "issues.md",
@@ -87,6 +89,7 @@ makedocs(;
],
warnonly = on_CI ? false : Documenter.except(:linkcheck_remotes),
pagesonly = true,
draft,
)

deploydocs(;
38 changes: 0 additions & 38 deletions docs/src/examples/examples.md

This file was deleted.

24 changes: 24 additions & 0 deletions docs/src/examples/hardware.md
@@ -0,0 +1,24 @@
# [Hardware information](@id hardware-information)

The examples and benchmarks here were generated using:

## Version info

```@example
using InteractiveUtils
versioninfo()
```

## Topology

```@example
using Hwloc
topology_info()
```

## CPU

```@example
using CpuId
printstyled(cpuinfo())
```
13 changes: 10 additions & 3 deletions docs/src/index.md
@@ -257,9 +257,16 @@ taskgraph

Some current limitations are listed below:

- There is no way to specify priorities for a task.
- At present, there is no way to specify priorities for a task. We plan to
support such a feature in the near future; follow [this
issue](https://github.com/maltezfaria/DataFlowTasks.jl/issues/75) to track its
development.
- The main thread executes tasks, and is responsible for adding/removing nodes
from the DAG. This may hinder parallelism if the main thread is given a long
task, since processing of the DAG will halt until the main thread becomes
free again.
- ...
free again. This could be a problem if task execution times are *very
inhomogeneous*; a solution, not currently implemented, would be to allow
blocking the main thread from executing tasks.
- Nesting `DataFlowTask`s, although possible, can be tricky due to some
technical details of our current implementation. See [Nested Tasks](@ref
nested-tasks) for a more in-depth discussion.
170 changes: 169 additions & 1 deletion docs/src/troubleshooting.md
@@ -1,5 +1,34 @@
# Troubleshooting / known issues

## Suggested workflow

Writing parallel code is hard, especially when it involves shared memory. While
`DataFlowTasks` tries to alleviate some of the burden, it comes with its own
sharp corners. A suggested workflow for writing parallel code with
`DataFlowTasks` is the following:

1. Write the serial code, make sure it works. This may sound obvious, but it is
easy to get carried away with the parallelization and forget this crucial
step!
2. Add [`@dspawn`](@ref) to the chunks of your code that you want to run in
   parallel. If it works as expected, you are an awesome programmer! In that
   case, though, you would probably not be reading this section, so let's
   assume you have run into some issues.
3. Run [`DataFlowTasks.force_sequential`](@ref)`(true)` and try your code
   again. This makes `@dspawn` a no-op, so if your serial code worked, this
   version should work too.
4. Enable `@dspawn` again with `force_sequential(false)`, and run
   [`DataFlowTasks.force_linear_dag`](@ref)`(true)`. This ensures that
   `DataFlowTask`s are created and scheduled, but forces the underlying `DAG`
   to be linear (i.e. node `i` is always connected to node `i+1`). This is
   closer to the *real thing* than `force_sequential`, since closures are
   created and variables are captured in the task bodies. If you run into an
   issue here, consider reading the following section on [captured
   variables](@ref troubleshooting-captures).
5. Deactivate the linear DAG with `force_linear_dag(false)` and check that the
   results are correct. If they are not, the problem is likely related to
   incorrectly declared data dependencies in your `@dspawn` blocks. Look at
   the code, scratch your head, and continue debugging...

## [Tricky behavior of captured variables](@id troubleshooting-captures)

It is easy to forget that `DataFlowTasks.@dspawn`, like its siblings `@async` or
@@ -89,6 +118,145 @@ arr # expect all 4 after the round-trip
```

!!! note

The [Parallel Merge Sort example](@ref example-sort) shows a real-world
situation in which such issues could arise.

## [Nested task graph](@id nested-tasks)

It may sometimes be useful, or even necessary, to spawn a `DataFlowTask` inside
another. This, although possible, can be a bit tricky to get right. To
understand why that is the case, let us walk through a simple example:

```@example nested-tasks

using DataFlowTasks

function nested_tasks()
A,B = ones(10), ones(10)
@dspawn begin
sleep(0.1)
@RW A B
@dspawn begin
@RW(view(A,1:5)) .= 0
end label = "1a"
@dspawn begin
@RW(view(A,6:10)) .= 0
end label = "1b"
B .= 0
end label = "1"
res = @dspawn begin
(sum(@R(A)),sum(@R(B)))
end label = "2"
fetch(res)
end

```

If we were to disable `@dspawn` (make it a `no-op`) in the code above, the
sequential execution would proceed as follows:

1. `A` and `B` are initialized to ones.
2. After a small nap, `A[1:5]` is filled with `0` in block `1a`.
3. `A[6:10]` is filled with `0` in block `1b`.
4. A reduction of both `A` and `B` is performed in block `2`, yielding `(0.,0.)`.

The sequential code will therefore *always* yield `(0.,0.)`, and that could be
considered the *correct* answer as per a *sequential consistency* criterion. We
can check that this is actually the case by running
[`DataFlowTasks.force_sequential`](@ref)`(true)` before executing the code:

```@example nested-tasks
DataFlowTasks.force_sequential(true)
nested_tasks()
```

If you reactivate `DataFlowTasks` and re-run the code above a few times,
however, you will notice that summing `B` always gives `0`, but summing `A`
does not:

```@example nested-tasks
DataFlowTasks.force_sequential(false)
map(i -> nested_tasks(), 1:10)
```

The reason is that while we are guaranteed that task `2` will be created *after*
task `1`, we don't have much control over when tasks `1a` and `1b` are created
relative to task `2`. Because of that, while `2` will always wait on `1` before
running due to the data conflict, `2` could very well be spawned *before* `1a`
and/or `1b`, in which case it won't wait for them! The result of `sum(A)`,
therefore, is not deterministic in our program.

The problem is that if we allow several paths of execution to `spawn`
`DataFlowTask`s on the same task graph concurrently, the order in which these
tasks are added to the task graph is impossible to control. This makes the
*direction of dependency* between two tasks `ti` and `tj` with conflicting data
accesses undetermined: we will infer that `ti` depends on `tj` if `ti` is created
first, and that `tj` depends on `ti` if `tj` is created first.
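To see this rule in isolation, here is a minimal sketch (not taken from the
package's documentation) of two tasks with conflicting accesses, where the
creation order alone determines the direction of the inferred dependency:

```julia
using DataFlowTasks

A = ones(10)
# Created first: writes to `A`.
t1 = @dspawn @W(A) .= 2 label = "writer"
# Created second: reads `A`, so it is inferred to depend on the writer.
t2 = @dspawn sum(@R(A)) label = "reader"
fetch(t2) # the read waits on the write, so this yields 20.0
```

Had the two tasks been created in the opposite order, the dependency would
point the other way, and the reader could observe the original ones.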

One way to resolve this ambiguity in the example above is to modify the
function to avoid nested tasks. For this admittedly contrived example, we could have written
instead:

```@example nested-tasks
function linear_tasks()
A,B = ones(10), ones(10)
sleep(0.1)
@dspawn begin
@RW(view(A,1:5)) .= 0
end label = "1a"
@dspawn begin
@RW(view(A,6:10)) .= 0
end label = "1b"
@dspawn @W(B) .= 0 label = "1"
res = @dspawn begin
(sum(@R(A)),sum(@R(B)))
end label = "2"
fetch(res)
end
@show linear_tasks()
```

You can check that the code above will consistently yield `(0.,0.)` as a result.

When nesting tasks is unavoidable, or when the performance hit from *flattening
out* the nested algorithm is too large, a more advanced option is to create a
*separate task graph* for each level of nesting. That way we handle the
synchronization logic manually, and recover a predictable order in each task graph:

```@example nested-tasks
using DataFlowTasks: TaskGraph, with_taskgraph

function nested_taskgraphs()
A,B = ones(10), ones(10)
@dspawn begin
sleep(0.1)
@RW A B
tg = TaskGraph() # a new taskgraph
with_taskgraph(tg) do
@dspawn begin
@RW(view(A,1:5)) .= 0
end label = "1a"
@dspawn begin
@RW(view(A,6:10)) .= 0
end label = "1b"
end
wait(tg)
B .= 0
end label = "1"
res = @dspawn begin
(sum(@R(A)),sum(@R(B)))
end label = "2"
fetch(res)
end
nested_taskgraphs()
```

In this last solution, there are two task graphs: the *outer* one containing
tasks `1` and `2`, and an *inner* one, created by task `1`, which spawns tasks
`1a` and `1b`. The inner task graph is waited on by task `1`, so that task `2`
will only start after both `1a` and `1b` have completed. Note that because a new
task graph is created for `1a` and `1b`, they can never end up depending on task
`2`, a dependency which could otherwise create a deadlock!

In the future we may provide a more convenient syntax for creating nested
tasks; at present, the suggestion is to avoid them if possible.