Add file for parallel execution #290

Open: wants to merge 1 commit into base: master
87 changes: 58 additions & 29 deletions brian2cuda/device.py
@@ -78,11 +78,6 @@ def __init__(self):
self.include_dirs.remove('brianlib/randomkit')
self.library_dirs.remove('brianlib/randomkit')

# Add code line slots used in our benchmarks
# TODO: Add to brian2 and remove here
self.code_lines.update({'before_network_run': [],
'after_network_run': []})

### Attributes specific to CUDAStandaloneDevice:
# only true during first run call (relevant for synaptic pre/post ID deletion)
self.first_run = True
@@ -94,6 +89,8 @@ def __init__(self):
# list of pre/post ID arrays that are not needed in device memory
self.delete_synaptic_pre = {}
self.delete_synaptic_post = {}
# dictionary to store parallelization information
self.stream_info = {}
# The following nested dictionary collects all codeobjects that use random
# number generation (RNG).
self.codeobjects_with_rng = {
@@ -351,18 +348,6 @@ def code_object(self, owner, name, abstract_code, variables, template_name,
f"Using atomics in synaptic effect application of Synapses object "
f"{name}"
)
threads_expr = prefs.devices.cuda_standalone.threads_per_synapse_bundle
pathway_name = template_kwds['pathway'].name
replace_expr = {
'{mean}': f'{pathway_name}_bundle_size_mean',
'{std}': f'{pathway_name}_bundle_size_std',
'{min}': f'{pathway_name}_bundle_size_min',
'{max}': f'{pathway_name}_bundle_size_max',
}
for old, new in replace_expr.items():
threads_expr = threads_expr.replace(old, new)
template_kwds["threads_per_synapse_bundle"] = threads_expr
template_kwds["bundle_threads_warp_multiple"] = prefs.devices.cuda_standalone.bundle_threads_warp_multiple
if template_name in ["synapses_create_generator", "synapses_create_array"]:
if owner.multisynaptic_index is not None:
template_kwds["multisynaptic_idx_var"] = owner.variables[owner.multisynaptic_index]
@@ -376,6 +361,7 @@ def code_object(self, owner, name, abstract_code, variables, template_name,
template_kwds["sm_multiplier"] = prefs["devices.cuda_standalone.SM_multiplier"]
template_kwds["syn_launch_bounds"] = prefs["devices.cuda_standalone.syn_launch_bounds"]
template_kwds["calc_occupancy"] = prefs["devices.cuda_standalone.calc_occupancy"]
template_kwds["stream_info"] = self.stream_info
if template_name in ["threshold", "spikegenerator"]:
template_kwds["extra_threshold_kernel"] = prefs["devices.cuda_standalone.extra_threshold_kernel"]
codeobj = super(CUDAStandaloneDevice, self).code_object(owner, name, abstract_code, variables,
@@ -391,7 +377,7 @@ def check_openmp_compatible(self, nb_threads):
if nb_threads > 0:
raise NotImplementedError("Using OpenMP in a CUDA standalone project is not supported")

def generate_objects_source(self, writer, arange_arrays, synapses, static_array_specs, networks):
def generate_objects_source(self, writer, arange_arrays, synapses, static_array_specs, networks, stream_info):
sm_multiplier = prefs.devices.cuda_standalone.SM_multiplier
num_parallel_blocks = prefs.devices.cuda_standalone.parallel_blocks
curand_generator_type = prefs.devices.cuda_standalone.random_number_generator_type
@@ -410,6 +396,9 @@ def generate_objects_source(self, writer, arange_arrays, synapses, static_array_
for syn in synapses:
if syn.multisynaptic_index is not None:
multisyn_vars.append(syn.variables[syn.multisynaptic_index])
# get the number of streams in use (highest stream index assigned to a code object)
num_stream = max(Counter(stream_info).values())
arr_tmp = self.code_object_class().templater.objects(
None, None,
array_specs=self.arrays,
@@ -433,7 +422,8 @@ def generate_objects_source(self, writer, arange_arrays, synapses, static_array_
spikegenerator_eventspaces=self.spikegenerator_eventspaces,
multisynaptic_idx_vars=multisyn_vars,
profiled_codeobjects=self.profiled_codeobjects,
profile_statemonitor_copy_to_host=prefs.devices.cuda_standalone.profile_statemonitor_copy_to_host)
parallelize=True,
stream_size=num_stream)
# Reinsert deleted entries, in case we use self.arrays later? maybe unnecessary...
self.arrays.update(self.eventspace_arrays)
writer.write('objects.*', arr_tmp)
@@ -463,7 +453,8 @@ def generate_main_source(self, writer):
# For codeobjects run every tick, this happens in the init() of
# the random number buffer called at first clock cycle of the network
main_lines.append('random_number_buffer.ensure_enough_curand_states();')
main_lines.append(f'_run_{codeobj.name}();')
# run on the default stream (stream index 0)
main_lines.append(f'_run_{codeobj.name}(0);')
elif func == 'after_run_code_object':
codeobj, = args
main_lines.append(f'_after_run_{codeobj.name}();')
@@ -944,13 +935,13 @@ def _replace_constants_and_parameters(code):
sub = 't - lastupdate'
if sub in code:
code = code.replace(sub, f'float({sub})')
logger.debug(f"Replaced {sub} with float({sub}) in {codeobj.name}")
logger.debug(f"Replaced {sub} with float({sub}) in {codeobj}")
# replace double-precision floating-point literals with their
# single-precision version (e.g. `1.0` -> `1.0f`)
code = replace_floating_point_literals(code)
logger.debug(
f"Replaced floating point literals by single precision version "
f"(appending `f`) in {codeobj.name}."
f"(appending `f`) in {codeobj}."
)

writer.write('code_objects/'+codeobj.name+'.cu', code)
@@ -1004,10 +995,14 @@ def generate_network_source(self, writer):
maximum_run_time = self._maximum_run_time
if maximum_run_time is not None:
maximum_run_time = float(maximum_run_time)
num_stream = max(Counter(self.stream_info).values())
network_tmp = self.code_object_class().templater.network(None, None,
maximum_run_time=maximum_run_time,
eventspace_arrays=self.eventspace_arrays,
spikegenerator_eventspaces=self.spikegenerator_eventspaces)
spikegenerator_eventspaces=self.spikegenerator_eventspaces,
parallelize=True,
stream_info=self.stream_info,
num_stream=num_stream)
writer.write('network.*', network_tmp)
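
The network template that consumes `parallelize`, `stream_info` and `num_stream` is not part of this diff. Purely as an illustration of the intended pattern (the function and variable names below are invented, not taken from network.cu), a fixed pool of streams can be created once and later indexed by each object's stream number:

    #include <vector>
    #include <cuda_runtime.h>

    // Create one CUDA stream per stream index used by the code objects.
    std::vector<cudaStream_t> create_stream_pool(int num_stream)
    {
        std::vector<cudaStream_t> streams(num_stream);
        for (int i = 0; i < num_stream; i++)
            cudaStreamCreate(&streams[i]);
        return streams;
    }
    // A code object with stream index k would then be run as
    //     _run_some_codeobject(streams[k]);
    // and each stream is released with cudaStreamDestroy() after the run.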

def generate_synapses_classes_source(self, writer):
@@ -1328,7 +1323,7 @@ def build(self, directory='output',

self.generate_objects_source(self.writer, self.arange_arrays,
net_synapses, self.static_array_specs,
self.networks)
self.networks, self.stream_info)
self.generate_network_source(self.writer)
self.generate_synapses_classes_source(self.writer)
self.generate_run_source(self.writer)
@@ -1355,6 +1350,11 @@ def build(self, directory='output',
if "devices.cuda_standalone" in pref_name:
logger.info(f"\t{pref_name} = {prefs[pref_name]}")

logger.debug("Using the following brian preferences:")
for pref_name in prefs:
if pref_name not in prefs:
logger.debug(f"\t{pref_name} = {prefs[pref_name]}")

if compile:
self.compile_source(directory, cpp_compiler, debug, clean)
if run:
@@ -1395,6 +1395,25 @@ def network_run(self, net, duration, report=None, report_period=10*second,
self.clocks.update(net._clocks)
net.t_ = float(t_end)


# Build the dictionaries used for stream-based parallelisation:
# group the network objects by their scheduling slot (when, order) ...
streams_organization = defaultdict(list)
for obj in net.sorted_objects:
streams_organization[(obj.when, obj.order)].append(obj)

# ... and assign each object its own stream index
streams_details = defaultdict(list)
count = 1
for key in streams_organization:
for obj in streams_organization[key]:
streams_details[obj.name] = count
count += 1

self.stream_info = streams_details
self.stream_info['default'] = 0



# TODO: remove this horrible hack
for clock in self.clocks:
if clock.name=='clock':
@@ -1529,11 +1548,21 @@ def network_run(self, net, duration, report=None, report_period=10*second,

# create all random numbers needed for the next clock cycle
for clock in net._clocks:
run_lines.append(f'{net.name}.add(&{clock.name}, _run_random_number_buffer);')
run_lines.append(f'{net.name}.add(&{clock.name}, _run_random_number_buffer, {self.stream_info["default"]});')

all_clocks = set()
# TODO: record, for every code object, its position in the schedule
# TODO: create a new dict mapping (code object, position in list)
for clock, codeobj in code_objects:
run_lines.append(f'{net.name}.add(&{clock.name}, _run_{codeobj.name});')
# pass the assigned stream index as an additional argument;
# strip a trailing "_codeobject" suffix from the name before the lookup
name = codeobj.name
if "_codeobject" in codeobj.name:
name = codeobj.name[:-11]
if name in self.stream_info:
run_lines.append(f'{net.name}.add(&{clock.name}, _run_{codeobj.name}, {self.stream_info[name]});')
else:
run_lines.append(f'{net.name}.add(&{clock.name}, _run_{codeobj.name}, {self.stream_info["default"]});')
all_clocks.add(clock)
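
To make the effect of this loop concrete, the generated C++ network setup ends up containing one such add() call per code object. The lines below are only an illustration (object names and stream indices are invented, they are not taken from any real generated file):

    // Illustrative sketch of generated run lines (invented names):
    // the third argument is the stream index assigned to the code object,
    // 0 is the default stream for objects without an entry in stream_info.
    magicnetwork.add(&defaultclock, _run_neurongroup_stateupdater_codeobject, 1);
    magicnetwork.add(&defaultclock, _run_neurongroup_thresholder_codeobject, 2);
    magicnetwork.add(&defaultclock, _run_spikemonitor_codeobject, 0);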

# Under some rare circumstances (e.g. a NeuronGroup only defining a
@@ -1558,12 +1587,12 @@ def network_run(self, net, duration, report=None, report_period=10*second,
# release version? (TODO: add via insert_code mechanism)
run_lines.append('CUDA_SAFE_CALL(cudaProfilerStart());')

run_lines.extend(self.code_lines['before_network_run'])
run_lines.extend(self.code_lines['before_run'])
# run everything that is run on a clock
run_lines.append(
f'{net.name}.run({float(duration)!r}, {report_call}, {float(report_period)!r});'
)
run_lines.extend(self.code_lines['after_network_run'])
run_lines.extend(self.code_lines['after_run'])
# for multiple runs, the random number buffer needs to be reset
run_lines.append('random_number_buffer.run_finished();')
# nvprof stuff
@@ -1840,4 +1869,4 @@ def prepare_codeobj_code_for_rng(codeobj):

cuda_standalone_device = CUDAStandaloneDevice()

all_devices['cuda_standalone'] = cuda_standalone_device
all_devices['cuda_standalone'] = cuda_standalone_device
40 changes: 14 additions & 26 deletions brian2cuda/templates/common_group.cu
@@ -63,8 +63,8 @@ namespace {
(e.g. _host_rand used in _poisson), but we can't put support_code_lines lines
after block random_functions since random_functions can use functions defined in
support_code_lines (e.g. _rand) #}
randomNumber_t _host_rand(const int _vectorisation_idx);
randomNumber_t _host_randn(const int _vectorisation_idx);
double _host_rand(const int _vectorisation_idx);
double _host_randn(const int _vectorisation_idx);
int32_t _host_poisson(double _lambda, const int _vectorisation_idx);

///// block extra_device_helper /////
@@ -77,13 +77,13 @@ namespace {
{% block random_functions %}
// Implement dummy functions such that the host compiled code of binomial
// functions works. Hacky, hacky ...
randomNumber_t _host_rand(const int _vectorisation_idx)
double _host_rand(const int _vectorisation_idx)
{
printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__,
__LINE__);
exit(EXIT_FAILURE);
}
randomNumber_t _host_randn(const int _vectorisation_idx)
double _host_randn(const int _vectorisation_idx)
{
printf("ERROR: Called dummy function `_host_rand` in %s:%d\n", __FILE__,
__LINE__);
@@ -109,8 +109,6 @@ __launch_bounds__(1024, {{sm_multiplier}})
_run_kernel_{{codeobj_name}}(
int _N,
int THREADS_PER_BLOCK,
{% block extra_kernel_parameters %}
{% endblock %}
///// KERNEL_PARAMETERS /////
%KERNEL_PARAMETERS%
)
@@ -119,11 +117,8 @@ _run_kernel_{{codeobj_name}}(

int tid = threadIdx.x;
int bid = blockIdx.x;

{% block indices %}
int _idx = bid * THREADS_PER_BLOCK + tid;
int _vectorisation_idx = _idx;
{% endblock %}

///// KERNEL_CONSTANTS /////
%KERNEL_CONSTANTS%
@@ -136,10 +131,12 @@ _run_kernel_{{codeobj_name}}(
{% block additional_variables %}
{% endblock %}

if(_vectorisation_idx >= _N)
{% block num_thread_check %}
if(_idx >= _N)
{
return;
}
{% endblock %}

{% block kernel_maincode %}

@@ -158,7 +155,7 @@ _run_kernel_{{codeobj_name}}(
{% endblock kernel %}


void _run_{{codeobj_name}}()
void _run_{{codeobj_name}}(cudaStream_t stream)
{
using namespace brian;

@@ -182,9 +179,7 @@ void _run_{{codeobj_name}}()
{% endblock %}

{% block prepare_kernel %}
{% block static_kernel_dimensions %}
static int num_threads, num_blocks;
{% endblock %}
static size_t needed_shared_memory = 0;
static bool first_run = true;
if (first_run)
@@ -273,11 +268,9 @@ void _run_{{codeobj_name}}()
else
{
printf("INFO _run_kernel_{{codeobj_name}}\n"
{% block kernel_info_num_blocks_str %}
"\t%u blocks\n"
{% endblock %}
"\t%u threads\n"
"\t%i registers per thread\n"
"\t%i registers per block\n"
"\t%i bytes statically-allocated shared memory per block\n"
"\t%i bytes local memory per thread\n"
"\t%i bytes user-allocated constant memory\n"
Expand All @@ -286,10 +279,7 @@ void _run_{{codeobj_name}}()
{% else %}
"",
{% endif %}
{% block kernel_info_num_blocks_var %}
num_blocks,
{% endblock %}
num_threads, funcAttrib.numRegs,
num_blocks, num_threads, funcAttrib.numRegs,
funcAttrib.sharedSizeBytes, funcAttrib.localSizeBytes,
funcAttrib.constSizeBytes{% if calc_occupancy %}, occupancy{% endif %});
}
@@ -302,11 +292,9 @@ void _run_{{codeobj_name}}()
{% endblock %}

{% block kernel_call %}
_run_kernel_{{codeobj_name}}<<<num_blocks, num_threads>>>(
_run_kernel_{{codeobj_name}}<<<num_blocks, num_threads, 0, stream>>>(
_N,
num_threads,
{% block extra_host_parameters %}
{% endblock %}
///// HOST_PARAMETERS /////
%HOST_PARAMETERS%
);
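
Passing `stream` as the fourth launch-configuration argument is what lets kernels belonging to different code objects be issued on different CUDA streams. A minimal, self-contained CUDA sketch of that mechanism (independent of the generated code, all names invented):

    #include <cuda_runtime.h>

    // Toy kernel standing in for the work of one code object.
    __global__ void dummy_kernel(int n) { }

    int main()
    {
        cudaStream_t stream_a, stream_b;
        cudaStreamCreate(&stream_a);
        cudaStreamCreate(&stream_b);
        // Kernels launched on different streams may overlap on the device.
        // The third launch parameter (0) is the dynamic shared memory size.
        dummy_kernel<<<128, 1024, 0, stream_a>>>(1);
        dummy_kernel<<<128, 1024, 0, stream_b>>>(2);
        cudaDeviceSynchronize();     // wait for both streams to finish
        cudaStreamDestroy(stream_a);
        cudaStreamDestroy(stream_b);
        return 0;
    }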
@@ -338,7 +326,7 @@ void _run_{{codeobj_name}}()
#ifndef _INCLUDED_{{codeobj_name}}
#define _INCLUDED_{{codeobj_name}}

void _run_{{codeobj_name}}();
void _run_{{codeobj_name}}(cudaStream_t);

{% block extra_functions_h %}
{% endblock %}
@@ -374,12 +362,12 @@ void _after_run_{{codeobj_name}}()
}
{% endmacro %}


// {{codeobj_name}}
{% macro after_run_h_file() %}
#ifndef _INCLUDED_{{codeobj_name}}_after
#define _INCLUDED_{{codeobj_name}}_after

void _after_run_{{codeobj_name}}();

#endif
{% endmacro %}
{% endmacro %}
4 changes: 2 additions & 2 deletions brian2cuda/templates/makefile
@@ -7,7 +7,7 @@ OBJS := ${OBJS:.cpp=.o}
OBJS := ${OBJS:.c=.o}
NVCC = @{{ nvcc_path }} -ccbin $(CXX)
NVCCFLAGS = -I. -std=c++11 {{gpu_arch_flags}} {{nvcc_compiler_flags}} {{compiler_debug_flags}} -Xcompiler "{{cpp_compiler_flags}}"
LFLAGS = -lcurand -I. {{gpu_arch_flags}} {{cpp_linker_flags}} {{linker_debug_flags}}
LFLAGS = -lcurand -lcudart -I. {{gpu_arch_flags}} {{cpp_linker_flags}} {{linker_debug_flags}}

all: $(PROGRAM)

@@ -20,4 +20,4 @@ clean:
{{rm_cmd}}

%.o : %.cu
$(NVCC) $(NVCCFLAGS) -dc $< -o $@
$(NVCC) $(NVCCFLAGS) -dc $< -o $@