# Source code for the rateML CUDA model driver (extracted from rendered
# documentation; original module path presumably tvb.rateML.run.model_driver
# -- TODO confirm against the tvb-library sources).

from __future__ import print_function

import argparse
import itertools
import logging
import os.path
import pickle
import sys
import time
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import tqdm

from tvb.simulator.lab import *
from tvb.basic.logger.builder import get_logger
# NOTE(review): the package path was lost in extraction ("from import
# regularRun"); reconstructed from the tvb-library layout -- TODO confirm.
from tvb.rateML.run.regular_run import regularRun

# pycuda is optional: the driver degrades to a warning when no CUDA stack
# is available (the original 'try:' line was lost in extraction).
try:
    import pycuda.autoinit
    import pycuda.driver as drv
    from pycuda.compiler import SourceModule
    import pycuda.gpuarray as gpuarray
except ImportError:
    logging.warning('pycuda not available, rateML driver not usable.')

here = os.path.dirname(os.path.abspath(__file__))

class Driver_Setup:
    """Collect CLI arguments, TVB connectivity and the parameter-sweep grid.

    Instances hold everything ``Driver_Execute`` needs to launch the CUDA
    kernels: connectivity weights/lengths, integration step, sweep
    parameters and derived buffer sizes.
    """

    def __init__(self):
        self.args = self.parse_args()

        self.logger = get_logger('tvb.rateML')
        self.logger.setLevel(level='INFO' if self.args.verbose else 'WARNING')

        self.checkargbounds()

        self.dt = self.args.delta_time
        self.connectivity = self.tvb_connectivity(self.args.n_regions)
        self.weights = self.connectivity.weights
        self.lengths = self.connectivity.tract_lengths
        self.tavg_period = 1.0
        # Setting TR for bold: 1.8 s repetition time expressed in dt steps.
        self.TR = (1.8 / self.dt)
        self.n_inner_steps = int(self.tavg_period / self.dt)

        self.params = self.setup_params(
            self.args.n_sweep_arg0,
            self.args.n_sweep_arg1,
        )

        # bufferlength is based on the minimum of the first swept parameter
        # (speed for many tvb models)
        self.n_work_items, self.n_params = self.params.shape
        self.buf_len_ = ((self.lengths / self.args.speeds_min / self.dt).astype('i').max() + 1)
        # use next power of 2 for the delay buffer length
        self.buf_len = 2 ** np.argwhere(2 ** np.r_[:30] > self.buf_len_)[0][0]

        self.states = self.args.states
        self.exposures = self.args.exposures

        if self.args.gpu_info:
            self.logger.setLevel(level='INFO')
            self.gpu_device_info()
            exit(1)

        self.logdata()
[docs] def logdata(self):'dt %f', self.dt)'n_nodes %d', self.args.n_regions)'weights.shape %s', self.weights.shape)'lengths.shape %s', self.lengths.shape)'tavg period %s', self.tavg_period)'n_inner_steps %s', self.n_inner_steps)'params shape %s', self.params.shape)'nstep %d', self.args.n_time)'n_inner_steps %f', self.n_inner_steps) #'single connectome, %d x %d parameter space', self.args.n_sweep_arg0, self.args.n_sweep_arg1)'real buf_len %d, using power of 2 %d', self.buf_len_, self.buf_len)'number of states %d', self.states)'model %s', self.args.model)'real buf_len %d, using power of 2 %d', self.buf_len_, self.buf_len)'memory for states array on GPU %d MiB', (self.buf_len * self.n_work_items * self.states * self.args.n_regions * 4) / 1024 ** 2)
[docs] def checkargbounds(self): try: assert self.args.n_sweep_arg0 > 0, "Min value for [N_SWEEP_ARG0] is 1" assert self.args.n_time > 0, "Minimum number for [-n N_TIME] is 1" assert self.args.n_regions > 0, "Min value for [-tvbn n_regions] for default data set is 68" assert self.args.blockszx > 0 and self.args.blockszx <= 32, "Bounds for [-bx BLOCKSZX] are 0 < value <= 32" assert self.args.blockszy > 0 and self.args.blockszy <= 32, "Bounds for [-by BLOCKSZY] are 0 < value <= 32" assert self.args.delta_time > 0.0, "Min value for [-dt delta_time] is > 0.0, default is 0.1" assert self.args.speeds_min > 0.0, "Min value for [-sm speeds_min] is > 0.0, default is 3e-3" assert self.args.exposures > 0, "Min value for [-x exposures] is 1" assert self.args.states > 0, "Min value for [-s states] is 1" except AssertionError as e: self.logger.error('%s', e) raise
[docs] def tvb_connectivity(self, tvbnodes): white_matter = connectivity.Connectivity.from_file(source_file="connectivity_"+str(tvbnodes)+".zip") # white_matter = connectivity.Connectivity.from_file(source_file="") white_matter.configure() return white_matter
[docs] def parse_args(self): # {{{ parser = argparse.ArgumentParser(description='Run parameter sweep.') # for every parameter that needs to be swept, the size can be set parser.add_argument('-s0', '--n_sweep_arg0', default=4, help='num grid points for 1st parameter', type=int) parser.add_argument('-s1', '--n_sweep_arg1', default=4, help='num grid points for 2st parameter', type=int) parser.add_argument('-n', '--n_time', default=400, help='number of time steps to do', type=int) parser.add_argument('-v', '--verbose', default=False, help='increase logging verbosity', action='store_true') parser.add_argument('-m', '--model', default='montbrio_heun', help="neural mass model to be used during the simulation") parser.add_argument('-s', '--states', default=2, type=int, help="number of states for model") parser.add_argument('-x', '--exposures', default=1, type=int, help="number of exposures for model") parser.add_argument('-l', '--lineinfo', default=False, help='generate line-number information for device code.', action='store_true') parser.add_argument('-bx', '--blockszx', default=8, type=int, help="gpu block size x") parser.add_argument('-by', '--blockszy', default=8, type=int, help="gpu block size y") parser.add_argument('-val', '--validate', default=False, help="enable validation with refmodels", action='store_true') parser.add_argument('-r', '--n_regions', default="68", type=int, help="number of tvb nodes") parser.add_argument('-p', '--plot_data', type=int, help="plot res data for selected state") parser.add_argument('-w', '--write_data', default=False, help="write output data to file: 'tavg_data", action='store_true') parser.add_argument('-g', '--gpu_info', default=False, help="show gpu info", action='store_true') parser.add_argument('-dt', '--delta_time', default=0.1, type=float, help="dt for simulation") parser.add_argument('-sm', '--speeds_min', default=1, type=float, help="min speed for temporal buffer") args = parser.parse_args() return args
[docs] def setup_params(self, n0, n1, ): ''' This code generates the parameters ranges that need to be set ''' sweeparam0 = np.linspace(.5, 1.5, n0) sweeparam1 = np.linspace(1, 10, n1) params = itertools.product( sweeparam0, sweeparam1, ) params = np.array([vals for vals in params], np.float32) return params
[docs] def gpu_device_info(self): ''' Get GPU device information TODO use this information to give user GPU setting suggestions ''' dev = drv.Device(0) print('\n')'GPU = %s','TOTAL AVAIL MEMORY: %d MiB', dev.total_memory()/1024/1024) # get device information att = {'MAX_THREADS_PER_BLOCK': [], 'MAX_BLOCK_DIM_X': [], 'MAX_BLOCK_DIM_Y': [], 'MAX_BLOCK_DIM_Z': [], 'MAX_GRID_DIM_X': [], 'MAX_GRID_DIM_Y': [], 'MAX_GRID_DIM_Z': [], 'TOTAL_CONSTANT_MEMORY': [], 'WARP_SIZE': [], # 'MAX_PITCH': [], 'CLOCK_RATE': [], 'TEXTURE_ALIGNMENT': [], # 'GPU_OVERLAP': [], 'MULTIPROCESSOR_COUNT': [], 'SHARED_MEMORY_PER_BLOCK': [], 'MAX_SHARED_MEMORY_PER_BLOCK': [], 'REGISTERS_PER_BLOCK': [], 'MAX_REGISTERS_PER_BLOCK': []} for key in att: getstring = 'drv.device_attribute.' + key # att[key].append(eval(getstring)) + ': %s', dev.get_attribute(eval(getstring)))
class Driver_Execute(Driver_Setup):
    """Run the CUDA kernels for a configured ``Driver_Setup``."""

    def __init__(self, ds):
        # Copy the already-computed configuration instead of calling the
        # Driver_Setup initializer again (which would re-parse argv).
        self.args = ds.args
        self.set_CUDAmodel_dir()
        self.weights, self.lengths, self.params = ds.weights, ds.lengths, ds.params
        self.buf_len, self.states, self.n_work_items = ds.buf_len, ds.states, ds.n_work_items
        self.n_inner_steps, self.n_params, self.dt = ds.n_inner_steps, ds.n_params, ds.dt
        self.exposures, self.logger = ds.exposures, ds.logger
        self.TR = ds.TR
[docs] def set_CUDAmodel_dir(self): self.args.filename = os.path.join((os.path.dirname(os.path.abspath(__file__))), "../generatedModels", self.args.model.lower() + '.c')
[docs] def set_CUDA_ref_model_dir(self): self.args.filename = os.path.join((os.path.dirname(os.path.abspath(__file__))), "../generatedModels/cuda_refs", self.args.model.lower() + '.c')
[docs] def compare_with_ref(self, tavg0): self.args.model = self.args.model + 'ref' self.set_CUDA_ref_model_dir() tavg1 = self.run_simulation() # compare output to check if same as template # comparison = (tavg0.ravel() == tavg1.ravel()) #'Templated version is similar to original %d:', comparison.all())'Corr.coef. of model with %s is: %f', self.args.model, np.corrcoef(tavg0.ravel(), tavg1.ravel())[0, 1]) return np.corrcoef(tavg0.ravel(), tavg1.ravel())[0, 1]
[docs] def make_kernel(self, source_file, warp_size, args, lineinfo=False, nh='nh'): try: with open(source_file, 'r') as fd: source = source = source.replace('M_PI_F', '%ff' % (np.pi, )) # TODO fix for whole words only. Only uppercase is permitted # source = source.replace('pi', '%ff' % (np.pi, )) # source = source.replace('inf', 'INFINITY') opts = ['--ptxas-options=-v', '-maxrregcount=32'] if lineinfo: opts.append('-lineinfo') opts.append('-DWARP_SIZE=%d' % (warp_size, )) opts.append('-DNH=%s' % (nh, )) idirs = [here]'nvcc options %r', opts) try: network_module = SourceModule( source, options=opts, include_dirs=idirs, no_extern_c=True, keep=False,) except drv.CompileError as e: self.logger.error('Compilation failure \n %s', e) exit(1) # generic func signature creation # i = int # j = unsigned int # f = float # Pf = pointer to float # S_ = repeat the former mod_func = "{}{}{}{}".format('_Z', len(args.model), args.model, 'jjjjjfPfS_S_S_S_') # print(mod_func) # mod_func = "_Z12montbrioheunjjjjjfPfS_S_S_S_" # step_fn = network_module.get_function('montbrio_heun') step_fn = network_module.get_function(mod_func) bold_func = "{}".format('_Z11bold_updatejfPfS_S_') bold_fn = network_module.get_function(bold_func) except FileNotFoundError as e: self.logger.error('%s.\n Generated model filename should match model on cmdline', e) exit(1) return step_fn, bold_fn #}}}
[docs] def cf(self, array):#{{{ # coerce possibly mixed-stride, double precision array to C-order single precision return array.astype(dtype='f', order='C', copy=True)#}}}
[docs] def nbytes(self, data):#{{{ # count total bytes used in all data arrays nbytes = 0 for name, array in data.items(): nbytes += array.nbytes return nbytes#}}}
[docs] def make_gpu_data(self, data):#{{{ # put data onto gpu gpu_data = {} for name, array in data.items(): try: gpu_data[name] = gpuarray.to_gpu( except drv.MemoryError as e: self.gpu_mem_info() self.logger.error( '%s.\n\t Please check the parameter dimensions, %d parameters are too large for this GPU', e, self.params.size) exit(1) return gpu_data#}}}
[docs] def release_gpumem(self, gpu_data): for name, array in gpu_data.items(): try: gpu_data[name] except drv.MemoryError as e: self.logger.error('%s.\n\t Freeing mem error', e) exit(1)
[docs] def gpu_mem_info(self): cmd = "nvidia-smi -q -d MEMORY"#,UTILIZATION" os.system(cmd) # returns the exit code in unix
[docs] def run_simulation(self): # setup data#{{{ data = { 'weights': self.weights, 'lengths': self.lengths, 'params': self.params.T } # print('pT', self.params[118][:]) base_shape = self.n_work_items, for name, shape in dict( tavg0=(self.exposures, self.args.n_regions,), tavg1=(self.exposures, self.args.n_regions,), state=(self.buf_len, self.states * self.args.n_regions), bold_state=(4, self.args.n_regions), bold=(self.args.n_regions,), ).items(): # memory error exception for compute device try: data[name] = np.zeros(shape + base_shape, 'f') except MemoryError as e: self.logger.error('%s.\n\t Please check the parameter dimensions %d x %d, they are to large ' 'for this compute device', e, self.args.n_sweep_arg0, self.args.n_sweep_arg1) exit(1) # shape bold_state = (boldvars, regions, workitems) # initialize f, v ,q to 1., to avoid /0 nans data['bold_state'][1:] = 1.0 gpu_data = self.make_gpu_data(data)#{{{ # setup CUDA stuff#{{{ step_fn, bold_fn = self.make_kernel( source_file=self.args.filename, warp_size=32, # block_dim_x=self.args.n_sweep_arg0, # ext_options=preproccesor_defines, # caching=args.caching, args=self.args, lineinfo=self.args.lineinfo, nh=self.buf_len, )#}}} # setup simulation#{{{ tic = time.time() n_streams = 32 streams = [drv.Stream() for i in range(n_streams)] events = [drv.Event() for i in range(n_streams)] tavg_unpinned = [] bold_unpinned = [] try: tavg = drv.pagelocked_zeros((n_streams,) + data['tavg0'].shape, dtype=np.float32) bold = drv.pagelocked_zeros((n_streams,) + data['bold'].shape, dtype=np.float32) except drv.MemoryError as e: self.logger.error( '%s.\n\t Please check the parameter dimensions, %d parameters are too large for this GPU', e, self.params.size) exit(1) # determine optimal grid recursively def dog(fgd): maxgd, mingd = max(fgd), min(fgd) maxpos = fgd.index(max(fgd)) if (maxgd - 1) * mingd * bx * by >= nwi: fgd[maxpos] = fgd[maxpos] - 1 dog(fgd) else: return fgd # n_sweep_arg0 scales griddim.x, n_sweep_arg1 scales griddim.y # 
form an optimal grid recursively bx, by = self.args.blockszx, self.args.blockszy nwi = self.n_work_items rootnwi = int(np.ceil(np.sqrt(nwi))) gridx = int(np.ceil(rootnwi / bx)) gridy = int(np.ceil(rootnwi / by)) final_block_dim = bx, by, 1 fgd = [gridx, gridy] dog(fgd) final_grid_dim = fgd[0], fgd[1] assert gridx * gridy * bx * by >= nwi'history shape %r', gpu_data['state'].shape)'gpu_data %s', gpu_data['tavg0'].shape)'on device mem: %.3f MiB' % (self.nbytes(data) / 1024 / 1024, ))'final block dim %r', final_block_dim)'final grid dim %r', final_grid_dim) # run simulation#{{{ nstep = self.args.n_time self.gpu_mem_info() if self.args.verbose else None try: for i in tqdm.trange(nstep, file=sys.stdout): try: event = events[i % n_streams] stream = streams[i % n_streams] if i > 0: stream.wait_for_event(events[(i - 1) % n_streams]) step_fn(np.uintc(i * self.n_inner_steps), np.uintc(self.args.n_regions), np.uintc(self.buf_len), np.uintc(self.n_inner_steps), np.uintc(self.n_work_items), np.float32(self.dt), gpu_data['weights'], gpu_data['lengths'], gpu_data['params'], gpu_data['state'], gpu_data['tavg%d' % (i%2,)], block=final_block_dim, grid=final_grid_dim) event.record(streams[i % n_streams]) tavgk = 'tavg%d' % ((i + 1) % 2,) bold_fn(np.uintc(self.args.n_regions), # BOLD model dt is in s, requires 1e-3 np.float32(self.TR * 1e-3), gpu_data['bold_state'], gpu_data[tavgk], gpu_data['bold'], # block=(couplings.size, 1, 1), grid=(speeds.size, 1), stream=stream) # block=(args.n_coupling, 1, 1), grid=(speeds.size, 1), stream=stream) block=final_block_dim, grid=final_grid_dim, stream=stream) except drv.LaunchError as e: self.logger.error('%s', e) exit(1) tavgk = 'tavg%d' % ((i + 1) % 2,) # async wrt. other streams & host, but not this stream. 
if i >= n_streams: stream.synchronize() tavg_unpinned.append(tavg[i % n_streams].copy()) bold_unpinned.append(bold[i % n_streams].copy()) drv.memcpy_dtoh_async(tavg[i % n_streams], gpu_data[tavgk].ptr, stream=stream) drv.memcpy_dtoh_async(bold[i % n_streams], gpu_data['bold'].ptr, stream=stream) # recover uncopied data from pinned buffer if nstep > n_streams: for i in range(nstep % n_streams, n_streams): stream.synchronize() tavg_unpinned.append(tavg[i].copy()) bold_unpinned.append(bold[i].copy()) for i in range(nstep % n_streams): stream.synchronize() tavg_unpinned.append(tavg[i].copy()) bold_unpinned.append(bold[i].copy()) except drv.LogicError as e: self.logger.error('%s. Check the number of states of the model or ' 'GPU block shape settings blockdim.x/y %r, griddim %r.', e, final_block_dim, final_grid_dim) exit(1) except drv.RuntimeError as e: self.logger.error('%s', e) exit(1) #'kernel finish..') # release pinned memory tavg = np.array(tavg_unpinned) bold = np.array(bold_unpinned) # also release gpu_data self.release_gpumem(gpu_data)'kernel finished') return tavg, bold
[docs] def plot_output(self, tavg): for i in range(0, self.n_work_items): plt.plot((tavg[:, self.args.plot_data, :, i]), 'k', alpha=.2)
[docs] def write_output(self, tavg, bold): tavg_file = open('tavg_gpu_'"%d%m%Y_%H:%M:%S"), 'wb') pickle.dump(tavg, tavg_file) tavg_file.close() bold_file = open('bold_gpu_'"%d%m%Y_%H:%M:%S"), 'wb') pickle.dump(bold, bold_file) bold_file.close()
[docs] def run_all(self): np.random.seed(79) tic = time.time() tavg0, bold = self.run_simulation() decimate = 1 bold = bold[::decimate, :] toc = time.time() elapsed = toc - tic if (self.args.validate == True): self.compare_with_ref(tavg0) self.plot_output(tavg0) if self.args.plot_data is not None else None self.write_output(tavg0, bold) if self.args.write_data else None'Output shape (simsteps, states, bnodes, n_params) %s', tavg0.shape)'Finished CUDA simulation successfully in: {0:.3f}'.format(elapsed))'and in {0:.3f} M step/s'.format( 1e-6 * self.args.n_time * self.n_inner_steps * self.n_work_items/ elapsed)) return tavg0, bold
# return tavg0

if __name__ == '__main__':

    driver_setup = Driver_Setup()
    tavgGPU, boldGPU = Driver_Execute(driver_setup).run_all()

    print('GPU_tavg_shape', tavgGPU.shape)
    print('GPU_bold_shape', boldGPU.shape)