Add file for parallel execution #290

Open: wants to merge 1 commit into base: master
87 changes: 58 additions & 29 deletions brian2cuda/device.py
@@ -78,11 +78,6 @@ def __init__(self):
self.include_dirs.remove('brianlib/randomkit')
self.library_dirs.remove('brianlib/randomkit')

# Add code line slots used in our benchmarks
# TODO: Add to brian2 and remove here
self.code_lines.update({'before_network_run': [],
'after_network_run': []})

### Attributes specific to CUDAStandaloneDevice:
# only true during first run call (relevant for synaptic pre/post ID deletion)
self.first_run = True
@@ -94,6 +89,8 @@ def __init__(self):
# list of pre/post ID arrays that are not needed in device memory
self.delete_synaptic_pre = {}
self.delete_synaptic_post = {}
# dictionary to store parallelization information
self.stream_info = {}
# The following nested dictionary collects all codeobjects that use random
# number generation (RNG).
self.codeobjects_with_rng = {
@@ -351,18 +348,6 @@ def code_object(self, owner, name, abstract_code, variables, template_name,
f"Using atomics in synaptic effect application of Synapses object "
f"{name}"
)
threads_expr = prefs.devices.cuda_standalone.threads_per_synapse_bundle
pathway_name = template_kwds['pathway'].name
replace_expr = {
'{mean}': f'{pathway_name}_bundle_size_mean',
'{std}': f'{pathway_name}_bundle_size_std',
'{min}': f'{pathway_name}_bundle_size_min',
'{max}': f'{pathway_name}_bundle_size_max',
}
for old, new in replace_expr.items():
threads_expr = threads_expr.replace(old, new)
template_kwds["threads_per_synapse_bundle"] = threads_expr
template_kwds["bundle_threads_warp_multiple"] = prefs.devices.cuda_standalone.bundle_threads_warp_multiple
if template_name in ["synapses_create_generator", "synapses_create_array"]:
if owner.multisynaptic_index is not None:
template_kwds["multisynaptic_idx_var"] = owner.variables[owner.multisynaptic_index]
@@ -376,6 +361,7 @@ def code_object(self, owner, name, abstract_code, variables, template_name,
template_kwds["sm_multiplier"] = prefs["devices.cuda_standalone.SM_multiplier"]
template_kwds["syn_launch_bounds"] = prefs["devices.cuda_standalone.syn_launch_bounds"]
template_kwds["calc_occupancy"] = prefs["devices.cuda_standalone.calc_occupancy"]
template_kwds["stream_info"] = self.stream_info
if template_name in ["threshold", "spikegenerator"]:
template_kwds["extra_threshold_kernel"] = prefs["devices.cuda_standalone.extra_threshold_kernel"]
codeobj = super(CUDAStandaloneDevice, self).code_object(owner, name, abstract_code, variables,
@@ -391,7 +377,7 @@ def check_openmp_compatible(self, nb_threads):
if nb_threads > 0:
raise NotImplementedError("Using OpenMP in a CUDA standalone project is not supported")

def generate_objects_source(self, writer, arange_arrays, synapses, static_array_specs, networks):
def generate_objects_source(self, writer, arange_arrays, synapses, static_array_specs, networks, stream_info):
sm_multiplier = prefs.devices.cuda_standalone.SM_multiplier
num_parallel_blocks = prefs.devices.cuda_standalone.parallel_blocks
curand_generator_type = prefs.devices.cuda_standalone.random_number_generator_type
@@ -410,6 +396,9 @@ def generate_objects_source(self, writer, arange_arrays, synapses, static_array_
for syn in synapses:
if syn.multisynaptic_index is not None:
multisyn_vars.append(syn.variables[syn.multisynaptic_index])
# get the number of streams in use (highest stream index assigned to a code object)
num_stream = max(Counter(stream_info).values())
arr_tmp = self.code_object_class().templater.objects(
None, None,
array_specs=self.arrays,
@@ -433,7 +422,8 @@ def generate_objects_source(self, writer, arange_arrays, synapses, static_array_
spikegenerator_eventspaces=self.spikegenerator_eventspaces,
multisynaptic_idx_vars=multisyn_vars,
profiled_codeobjects=self.profiled_codeobjects,
profile_statemonitor_copy_to_host=prefs.devices.cuda_standalone.profile_statemonitor_copy_to_host)
parallelize=True,
stream_size=num_stream)
# Reinsert deleted entries, in case we use self.arrays later? maybe unnecessary...
self.arrays.update(self.eventspace_arrays)
writer.write('objects.*', arr_tmp)
@@ -463,7 +453,8 @@ def generate_main_source(self, writer):
# For codeobjects run every tick, this happens in the init() of
# the random number buffer called at first clock cycle of the network
main_lines.append('random_number_buffer.ensure_enough_curand_states();')
main_lines.append(f'_run_{codeobj.name}();')
# run on the default stream (stream index 0)
main_lines.append(f'_run_{codeobj.name}(0);')
elif func == 'after_run_code_object':
codeobj, = args
main_lines.append(f'_after_run_{codeobj.name}();')
@@ -944,13 +935,13 @@ def _replace_constants_and_parameters(code):
sub = 't - lastupdate'
if sub in code:
code = code.replace(sub, f'float({sub})')
logger.debug(f"Replaced {sub} with float({sub}) in {codeobj.name}")
logger.debug(f"Replaced {sub} with float({sub}) in {codeobj}")
# replace double-precision floating-point literals with their
# single-precision version (e.g. `1.0` -> `1.0f`)
code = replace_floating_point_literals(code)
logger.debug(
f"Replaced floating point literals by single precision version "
f"(appending `f`) in {codeobj.name}."
f"(appending `f`) in {codeobj}."
)

writer.write('code_objects/'+codeobj.name+'.cu', code)
@@ -1004,10 +995,14 @@ def generate_network_source(self, writer):
maximum_run_time = self._maximum_run_time
if maximum_run_time is not None:
maximum_run_time = float(maximum_run_time)
num_stream = max(Counter(self.stream_info).values())
network_tmp = self.code_object_class().templater.network(None, None,
maximum_run_time=maximum_run_time,
eventspace_arrays=self.eventspace_arrays,
spikegenerator_eventspaces=self.spikegenerator_eventspaces)
spikegenerator_eventspaces=self.spikegenerator_eventspaces,
parallelize=True,
stream_info=self.stream_info,
num_stream=num_stream)
writer.write('network.*', network_tmp)
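
The network template that consumes `parallelize`, `stream_info` and `num_stream` is not part of this diff. Purely as an illustration of the intended pattern (the function and variable names below are invented, not taken from network.cu), a fixed pool of streams can be created once and later indexed by each object's stream number:

    #include <vector>
    #include <cuda_runtime.h>

    // Create one CUDA stream per stream index used by the code objects.
    std::vector<cudaStream_t> create_stream_pool(int num_stream)
    {
        std::vector<cudaStream_t> streams(num_stream);
        for (int i = 0; i < num_stream; i++)
            cudaStreamCreate(&streams[i]);
        return streams;
    }
    // A code object with stream index k would then be run as
    //     _run_some_codeobject(streams[k]);
    // and each stream is released with cudaStreamDestroy() after the run.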

def generate_synapses_classes_source(self, writer):
@@ -1328,7 +1323,7 @@ def build(self, directory='output',

self.generate_objects_source(self.writer, self.arange_arrays,
net_synapses, self.static_array_specs,
self.networks)
self.networks, self.stream_info)
self.generate_network_source(self.writer)
self.generate_synapses_classes_source(self.writer)
self.generate_run_source(self.writer)
@@ -1355,6 +1350,11 @@ def build(self, directory='output',
if "devices.cuda_standalone" in pref_name:
logger.info(f"\t{pref_name} = {prefs[pref_name]}")

logger.debug("Using the following brian preferences:")
for pref_name in prefs:
if pref_name not in prefs:
logger.debug(f"\t{pref_name} = {prefs[pref_name]}")

if compile:
self.compile_source(directory, cpp_compiler, debug, clean)
if run:
@@ -1395,6 +1395,25 @@ def network_run(self, net, duration, report=None, report_period=10*second,
self.clocks.update(net._clocks)
net.t_ = float(t_end)


# Build the dictionaries used for stream-based parallelisation:
# group the network objects by their scheduling slot (when, order) ...
streams_organization = defaultdict(list)
for obj in net.sorted_objects:
streams_organization[(obj.when, obj.order)].append(obj)

# ... and assign each object its own stream index
streams_details = defaultdict(list)
count = 1
for key in streams_organization:
for obj in streams_organization[key]:
streams_details[obj.name] = count
count += 1

self.stream_info = streams_details
self.stream_info['default'] = 0



# TODO: remove this horrible hack
for clock in self.clocks:
if clock.name=='clock':
@@ -1529,11 +1548,21 @@ def network_run(self, net, duration, report=None, report_period=10*second,

# create all random numbers needed for the next clock cycle
for clock in net._clocks:
run_lines.append(f'{net.name}.add(&{clock.name}, _run_random_number_buffer);')
run_lines.append(f'{net.name}.add(&{clock.name}, _run_random_number_buffer, {self.stream_info["default"]});')

all_clocks = set()
# TODO: record, for every code object, its position in the schedule
# TODO: create a new dict mapping (code object, position in list)
for clock, codeobj in code_objects:
run_lines.append(f'{net.name}.add(&{clock.name}, _run_{codeobj.name});')
# pass the assigned stream index as an additional argument;
# strip a trailing "_codeobject" suffix from the name before the lookup
name = codeobj.name
if "_codeobject" in codeobj.name:
name = codeobj.name[:-11]
if name in self.stream_info:
run_lines.append(f'{net.name}.add(&{clock.name}, _run_{codeobj.name}, {self.stream_info[name]});')
else:
run_lines.append(f'{net.name}.add(&{clock.name}, _run_{codeobj.name}, {self.stream_info["default"]});')
all_clocks.add(clock)
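
To make the effect of this loop concrete, the generated C++ network setup ends up containing one such add() call per code object. The lines below are only an illustration (object names and stream indices are invented, they are not taken from any real generated file):

    // Illustrative sketch of generated run lines (invented names):
    // the third argument is the stream index assigned to the code object,
    // 0 is the default stream for objects without an entry in stream_info.
    magicnetwork.add(&defaultclock, _run_neurongroup_stateupdater_codeobject, 1);
    magicnetwork.add(&defaultclock, _run_neurongroup_thresholder_codeobject, 2);
    magicnetwork.add(&defaultclock, _run_spikemonitor_codeobject, 0);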

# Under some rare circumstances (e.g. a NeuronGroup only defining a
@@ -1558,12 +1587,12 @@ def network_run(self, net, duration, report=None, report_period=10*second,
# release version? (TODO: add via insert_code mechanism)
run_lines.append('CUDA_SAFE_CALL(cudaProfilerStart());')

run_lines.extend(self.code_lines['before_network_run'])
run_lines.extend(self.code_lines['before_run'])
# run everything that is run on a clock
run_lines.append(
f'{net.name}.run({float(duration)!r}, {report_call}, {float(report_period)!r});'
)
run_lines.extend(self.code_lines['after_network_run'])
run_lines.extend(self.code_lines['after_run'])
# for multiple runs, the random number buffer needs to be reset
run_lines.append('random_number_buffer.run_finished();')
# nvprof stuff
@@ -1840,4 +1869,4 @@ def prepare_codeobj_code_for_rng(codeobj):

cuda_standalone_device = CUDAStandaloneDevice()

all_devices['cuda_standalone'] = cuda_standalone_device
all_devices['cuda_standalone'] = cuda_standalone_device
40 changes: 14 additions & 26 deletions brian2cuda/templates/common_group.cu
@@ -63,8 +63,8 @@ namespace {
(e.g. _host_rand used in _poisson), but we can't put support_code_lines lines
after block random_functions since random_functions can use functions defined in
support_code_lines (e.g. _rand) #}
randomNumber_t _host_rand(const int _vectorisation_idx);
randomNumber_t _host_randn(const int _vectorisation_idx);
double _host_rand(const int _vectorisation_idx);
double _host_randn(const int _vectorisation_idx);
int32_t _host_poisson(double _lambda, const int _vectorisation_idx);

///// block extra_device_helper /////
@@ -77,13 +77,13 @@ namespace {
{% block random_functions %}
// Implement dummy functions such that the host compiled code of binomial
// functions works. Hacky, hacky ...
randomNumber_t _host_rand(const int _vectorisation_idx)
double _host_rand(const int _vectorisation_idx)
{
printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__,
__LINE__);
exit(EXIT_FAILURE);
}
randomNumber_t _host_randn(const int _vectorisation_idx)
double _host_randn(const int _vectorisation_idx)
{
printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__,
__LINE__);
@@ -109,8 +109,6 @@ __launch_bounds__(1024, {{sm_multiplier}})
_run_kernel_{{codeobj_name}}(
int _N,
int THREADS_PER_BLOCK,
{% block extra_kernel_parameters %}
{% endblock %}
///// KERNEL_PARAMETERS /////
%KERNEL_PARAMETERS%
)
@@ -119,11 +117,8 @@ _run_kernel_{{codeobj_name}}(

int tid = threadIdx.x;
int bid = blockIdx.x;

{% block indices %}
int _idx = bid * THREADS_PER_BLOCK + tid;
int _vectorisation_idx = _idx;
{% endblock %}

///// KERNEL_CONSTANTS /////
%KERNEL_CONSTANTS%
@@ -136,10 +131,12 @@ _run_kernel_{{codeobj_name}}(
{% block additional_variables %}
{% endblock %}

if(_vectorisation_idx >= _N)
{% block num_thread_check %}
if(_idx >= _N)
{
return;
}
{% endblock %}

{% block kernel_maincode %}

@@ -158,7 +155,7 @@ _run_kernel_{{codeobj_name}}(
{% endblock kernel %}


void _run_{{codeobj_name}}()
void _run_{{codeobj_name}}(cudaStream_t stream)
{
using namespace brian;

@@ -182,9 +179,7 @@ void _run_{{codeobj_name}}()
{% endblock %}

{% block prepare_kernel %}
{% block static_kernel_dimensions %}
static int num_threads, num_blocks;
{% endblock %}
static size_t needed_shared_memory = 0;
static bool first_run = true;
if (first_run)
@@ -273,11 +268,9 @@ void _run_{{codeobj_name}}()
else
{
printf("INFO _run_kernel_{{codeobj_name}}\n"
{% block kernel_info_num_blocks_str %}
"\t%u blocks\n"
{% endblock %}
"\t%u threads\n"
"\t%i registers per thread\n"
"\t%i registers per block\n"
"\t%i bytes statically-allocated shared memory per block\n"
"\t%i bytes local memory per thread\n"
"\t%i bytes user-allocated constant memory\n"
Expand All @@ -286,10 +279,7 @@ void _run_{{codeobj_name}}()
{% else %}
"",
{% endif %}
{% block kernel_info_num_blocks_var %}
num_blocks,
{% endblock %}
num_threads, funcAttrib.numRegs,
num_blocks, num_threads, funcAttrib.numRegs,
funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes,
funcAttrib.constSizeBytes{% if calc_occupancy %}, occupancy{% endif %});
}
@@ -302,11 +292,9 @@ void _run_{{codeobj_name}}()
{% endblock %}

{% block kernel_call %}
_run_kernel_{{codeobj_name}}<<<num_blocks, num_threads>>>(
_run_kernel_{{codeobj_name}}<<<num_blocks, num_threads, 0, stream>>>(
_N,
num_threads,
{% block extra_host_parameters %}
{% endblock %}
///// HOST_PARAMETERS /////
%HOST_PARAMETERS%
);
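
Passing `stream` as the fourth launch-configuration argument is what lets kernels belonging to different code objects be issued on different CUDA streams. A minimal, self-contained CUDA sketch of that mechanism (independent of the generated code, all names invented):

    #include <cuda_runtime.h>

    // Toy kernel standing in for the work of one code object.
    __global__ void dummy_kernel(int n) { }

    int main()
    {
        cudaStream_t stream_a, stream_b;
        cudaStreamCreate(&stream_a);
        cudaStreamCreate(&stream_b);
        // Kernels launched on different streams may overlap on the device.
        // The third launch parameter (0) is the dynamic shared memory size.
        dummy_kernel<<<128, 1024, 0, stream_a>>>(1);
        dummy_kernel<<<128, 1024, 0, stream_b>>>(2);
        cudaDeviceSynchronize();     // wait for both streams to finish
        cudaStreamDestroy(stream_a);
        cudaStreamDestroy(stream_b);
        return 0;
    }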
@@ -338,7 +326,7 @@ void _run_{{codeobj_name}}()
#ifndef _INCLUDED_{{codeobj_name}}
#define _INCLUDED_{{codeobj_name}}

void _run_{{codeobj_name}}();
void _run_{{codeobj_name}}(cudaStream_t);

{% block extra_functions_h %}
{% endblock %}
@@ -374,12 +362,12 @@ void _after_run_{{codeobj_name}}()
}
{% endmacro %}


// {{codeobj_name}}
{% macro after_run_h_file() %}
#ifndef _INCLUDED_{{codeobj_name}}_after
#define _INCLUDED_{{codeobj_name}}_after

void _after_run_{{codeobj_name}}();

#endif
{% endmacro %}
{% endmacro %}
4 changes: 2 additions & 2 deletions brian2cuda/templates/makefile
@@ -7,7 +7,7 @@ OBJS := ${OBJS:.cpp=.o}
OBJS := ${OBJS:.c=.o}
NVCC = @{{ nvcc_path }} -ccbin $(CXX)
NVCCFLAGS = -I. -std=c++11 {{gpu_arch_flags}} {{nvcc_compiler_flags}} {{compiler_debug_flags}} -Xcompiler "{{cpp_compiler_flags}}"
LFLAGS = -lcurand -I. {{gpu_arch_flags}} {{cpp_linker_flags}} {{linker_debug_flags}}
LFLAGS = -lcurand -lcudart -I. {{gpu_arch_flags}} {{cpp_linker_flags}} {{linker_debug_flags}}

all: $(PROGRAM)

@@ -20,4 +20,4 @@ clean:
{{rm_cmd}}

%.o : %.cu
$(NVCC) $(NVCCFLAGS) -dc $< -o $@
$(NVCC) $(NVCCFLAGS) -dc $< -o $@