Dnnweaver2
A simulator framework for DNN accelerators
Reference
Bit Fusion
- EDDO architecture
- 2D systolic-array architecture
- an ISA that enhances hardware flexibility
- code optimization via loop ordering and loop tiling, both of which dnnweaver supports (see the sketch below)
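As a refresher on that last point, loop tiling splits a loop over a full dimension into an outer loop over tiles and an inner loop within a tile, so each tile's data can stay on-chip; a minimal illustrative sketch (not DnnWeaver code):

```python
# illustrative only: tile an output-channel loop of size OC into tiles of size oc
OC, oc = 64, 16
for oc_o in range(0, OC, oc):                     # outer loop: pick a tile
    for oc_i in range(oc_o, min(oc_o + oc, OC)):  # inner loop: walk the tile
        pass  # compute output channel oc_i; the tile's weights stay in SRAM
```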
Dnnweaver2
Usage
top wrapper
Initializes the Simulator object and the containers used to store and process results.
run_ant.py
```python
bf_e_sim = Simulator(config_file, False)
bf_e_sim_sweep_csv = os.path.join(results_dir, 'ant_os.csv')
bf_e_sim_sweep_df = pandas.DataFrame(columns=sim_sweep_columns)
# check_pandas_or_run calls lookup_pandas_dataframe to sweep across benchmarks and collect data
bf_e_results = check_pandas_or_run(bf_e_sim, bf_e_sim_sweep_df, bf_e_sim_sweep_csv,
                                   batch_size=batch_size, bench_type='ant')
bf_e_cycles_ant = []
bf_e_energy_ant = []
```
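My reading of the cache mechanism, as a sketch (the helper body and the 'Network' column name are assumptions, not the verified ANT source): the sweep CSV acts as a result cache, and lookup_pandas_dataframe filters it down to matching rows.

```python
import pandas

def lookup_pandas_dataframe(df, lookup_dict):
    # keep only the rows that match every key/value pair in lookup_dict
    for key, value in lookup_dict.items():
        df = df[df[key] == value]
    return df

cached = lookup_pandas_dataframe(bf_e_sim_sweep_df, {'Network': 'resnet18'})
needs_simulation = len(cached) == 0  # on a miss, run the simulator and append the row
```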
benchmark configuration (workloads)
sweep.py
```python
class SimulatorSweep(object):
    def sweep(self, sim_obj, batch_size=1, weight_stationary=False):  # other args elided
        ...
        # inside the sweep loops over benchmark b, array size (n, m),
        # precisions (pmax, pmin), bandwidth bw, and buffer sizes:
        # get_bench_nn_ant calls create_net
        nn = benchmarks.get_bench_nn_ant(b, batch_size)
        if len(results) == 0:
            self.logger.info('Simulating Benchmark: {}'.format(b))
            self.logger.info('N x M = {} x {}'.format(n, m))
            self.logger.info('Max Precision (bits): {}'.format(pmax))
            self.logger.info('Min Precision (bits): {}'.format(pmin))
            self.logger.info('Batch size: {}'.format(batch_size))
            self.logger.info('Bandwidth (bits/cycle): {}'.format(bw))
            # get_bench_numbers calls sim_obj.get_cycles()
            stats = benchmarks.get_bench_numbers(nn, sim_obj, batch_size,
                                                 weight_stationary=weight_stationary)
            for layer in stats:
                cycles = stats[layer].total_cycles
                reads = stats[layer].reads
                writes = stats[layer].writes
                stalls = stats[layer].mem_stall_cycles
                data_line.append((n, m, pmax, pmin, b, layer,
                                  cycles, stalls,
                                  reads['wgt'], writes['wgt'],
                                  reads['out'], writes['out'],
                                  reads['act'], writes['act'],
                                  reads['dram'], writes['dram'],
                                  sim_obj.accelerator.mem_if_width,
                                  wbuf, obuf, ibuf, batch_size))
```
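Presumably the accumulated data_line rows flow back into the sweep DataFrame and CSV so the next run is a cache hit; a sketch (sweep_df and sweep_csv_path are hypothetical names, and data_line is assumed to match sim_sweep_columns):

```python
import pandas

# sketch: append this sweep's per-layer rows and persist them for future cache hits
new_rows = pandas.DataFrame(data_line, columns=sim_sweep_columns)
sweep_df = pandas.concat([sweep_df, new_rows], ignore_index=True)
sweep_df.to_csv(sweep_csv_path, index=False)
```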
construct graph op objects
benchmarks.py
```python
def create_net(net_name, net_list, batch_size):
    g = Graph(net_name, dataset='imagenet', log_level=logging.INFO)
    with g.as_default():
        for idx, op in enumerate(net_list):
            input_size, kernel_size, output_size, kernel_stride, padding, precision, op_type = op
            input_size[0] = input_size[0] * batch_size
            output_size[0] = output_size[0] * batch_size
            precision = get_precision(precision)
            if op_type == 0:
                with g.name_scope('conv' + str(idx)):
                    out = create_conv(input_size, kernel_size, stride_size=kernel_stride,
                                      pad=padding, c_dtype=FQDtype.FXP16, w_dtype=precision)
                    # print(idx, op, out.shape)
                    assert out.shape[0] == output_size[0]
                    assert out.shape[1] == output_size[2]
                    assert out.shape[2] == output_size[3]
                    assert out.shape[3] == output_size[1]
            else:
                with g.name_scope('fc' + str(idx)):
                    out = create_fc(input_size, kernel_size, c_dtype=precision, w_dtype=precision)
                    # print(idx, op, out.shape)
                    assert out.shape[0] == output_size[0]
                    assert out.shape[1] == output_size[1]
    # the graph is later passed as a parameter to get_bench_numbers()
    return g
```
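To make the op-tuple format concrete, here is a hypothetical one-layer workload; the field order follows the unpacking above, and the NHWC input / NCHW output layouts are inferred from the asserts (the kernel layout and the precision code are my assumptions):

```python
# (input_size NHWC, kernel_size, output_size NCHW, stride, padding, precision, op_type)
tiny_net = [
    ([1, 224, 224, 3],   # input: batch x H x W x C
     [64, 7, 7, 3],      # kernel: assumed OC x KH x KW x IC layout
     [1, 64, 112, 112],  # output: batch x C x H x W (per the assert order)
     2, 3,               # stride, padding
     8,                  # precision code, mapped to a dtype by get_precision
     0),                 # op_type 0 = conv, otherwise fc
]
g = create_net('tiny7x7', tiny_net, batch_size=1)
```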
Framework Organization
design space exploration
core logic of dnnweaver2:
The function get_cycles returns the stats of a given op (Convolution, TypeCastOp, MatMul, etc.).
Given a configuration (conf_xx.ini), it searches for the optimal loop order.
The hardware model is based on a systolic array.
config (sketched below):
- size of the Weight/Activation/Output SRAMs
- precision of inputs and weights
- dataflow (OS or WS); there seems to be little difference in simulated cycles between the two
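Shown here as a Python dict for readability (key names are illustrative; the real knobs live in the conf_xx.ini files), the configuration roughly amounts to:

```python
# illustrative only: the knobs a conf_xx.ini exposes, not the real key names
conf = {
    'N': 32, 'M': 32,        # systolic-array dimensions (rows x columns)
    'sram': {'wgt': 131072,  # on-chip SRAM sizes in bytes,
             'act': 131072,  # halved at runtime for double buffering
             'out': 131072},
    'iprec': 8, 'wprec': 8,  # input / weight precision in bits
    'mem_if_width': 256,     # DRAM interface width (bits per cycle)
    'dataflow': 'OS',        # 'OS' or 'WS'
}
```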
data-collection wrapper defined by the ANT developers:
get_bench_numbers() -> get_cycles() -> get_conv_cycles() -> get_stats_fast() / optimize_for_order() (the last two functions are the core logic)
benchmarks.py
```python
def get_bench_numbers(graph, sim_obj, batch_size=1, weight_stationary=False):
    stats = {}
    for opname, op in graph.op_registry.items():
        out = sim_obj.get_cycles(op, batch_size, weight_stationary=weight_stationary)
        if out is not None:
            s, l = out
            stats[opname] = s
    return stats
```
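Since get_bench_numbers returns a per-op dict of Stats objects, network-level numbers are just sums over its values; for example:

```python
stats = get_bench_numbers(g, sim_obj, batch_size=1)  # g: a Graph from create_net
total_cycles = sum(s.total_cycles for s in stats.values())
total_stalls = sum(s.mem_stall_cycles for s in stats.values())
print('network cycles: {} (stalls: {})'.format(total_cycles, total_stalls))
```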
simulator.py
```python
def get_cycles(self, op, im2col=True, weight_stationary=False):
    if isinstance(op, Convolution):
        B, I, _, IC = op.data.shape
        _, O, _, OC = op.output_tensors.shape
        _, K, _, _ = op.weights.shape
        _, S, _, _ = op.stride
        iprec = op.data.dtype.bits
        wprec = op.weights.dtype.bits
        # if op.data.op is None:
        #     im2col = True  # im2col for first layer
        # else:
        #     im2col = False
        im2col = True
        return self.get_conv_cycles(K, O, S, IC, OC,
                                    iprec, wprec, B,
                                    im2col, weight_stationary)
    elif isinstance(op, MatMul):
        B = op.data.shape[0]
        OC, IC = op.weights.shape
        iprec = op.data.dtype.bits
        wprec = op.weights.dtype.bits
        return self.get_FC_cycles(IC, OC, iprec, wprec,
                                  batch_size=B, weight_stationary=weight_stationary)
```
```python
def get_conv_cycles(self, K, O, S, IC, OC, iprec, wprec, batch_size=1,
                    im2col=True, weight_stationary=False):
    # K: kernel width/height
    # O: output feature-map width/height
    # S: stride
    # IC / OC: input / output channels
    # self.accelerator.N x self.accelerator.M: systolic-array dimensions
    #   (M spans output channels, per the OC/M division below)
    B = batch_size
    # power-of-two tile-size candidates; the +1 includes the untiled (full-size) case
    num_O_tiles = int(math.ceil(log2(O))) + 1
    num_IC_tiles = int(math.ceil(log2(IC))) + 1
    num_OC_tiles = int(math.ceil(log2(math.ceil(float(OC) / self.accelerator.M)))) + 1
    num_B_tiles = int(math.ceil(log2(B))) + 1
    self.logger.debug('Number of O Tiles: {}'.format(num_O_tiles))
    self.logger.debug('Number of IC Tiles: {}'.format(num_IC_tiles))
    self.logger.debug('Number of OC Tiles: {}'.format(num_OC_tiles))
    self.logger.debug('Number of B Tiles: {}'.format(num_B_tiles))
    best_instructions_dict = {}
    conv_params = self.accelerator, K, O, S, IC, OC, B, iprec, wprec, \
        im2col, weight_stationary, self.get_energy_cost()
    best_instructions, best_tiling, best_order = optimize_for_order(conv_params)
    stats = get_stats_fast(conv_params, best_tiling, best_order, verbose=False)
```
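A worked example of what this enumerates: for O = 56, int(math.ceil(log2(56))) + 1 = 7 candidate tile sizes, each a power of two clipped to O (the min(1 << i, O) in _optimize_for_order below):

```python
import math
from math import log2

O = 56
candidates = [min(1 << i, O) for i in range(int(math.ceil(log2(O))) + 1)]
print(candidates)  # [1, 2, 4, 8, 16, 32, 56]
```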
search algorithm:
_optimize_for_order iterates over tile-size configurations for (B, OW, OH, IC, OC) under a given loop ordering. The candidate loop orders are generated with:

```python
loops = ['B/b', 'OW/ow', 'OH/oh', 'IC/ic', 'OC/oc']
order = set(permutations(loops))
```

Influence of loop order: data reuse inside the PE array is maximized within a compute tile. The search entry point is optimize_for_order(conv_params), defined below:
```python
def optimize_for_order(conv_params):
    # Generate permutations for the order
    loops = ['B/b', 'OW/ow', 'OH/oh', 'IC/ic', 'OC/oc']
    order = set(permutations(loops))
    ...
    _bound_optimizer_method = functools.partial(_optimize_for_order, conv_params)
    try:
        pool = Pool(cpu_count())
        results = pool.map_async(_bound_optimizer_method, order).get(10000)
        pool.close()
        pool.join()
```
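With five loops there are 5! = 120 distinct orderings, so the pool evaluates 120 independent searches:

```python
from itertools import permutations

loops = ['B/b', 'OW/ow', 'OH/oh', 'IC/ic', 'OC/oc']
print(len(set(permutations(loops))))  # 120 candidate loop orders
```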
```python
def _optimize_for_order(conv_params, order_type, verbose=False):
    """
    For a given ordering, optimizes tiling
    Args:
        conv_params: A tuple with convolution params
        order_type: loop ordering
    """
    acc_obj, K, O, S, IC, OC, B, iprec, wprec, im2col, weight_stationary, energy_cost = conv_params
    I = (O - 1) * S + K
    # We do not tile the "K" dimension and compute an entire 2-D conv at a time
    num_O_tiles = int(math.ceil(log2(O))) + 1
    num_IC_tiles = int(math.ceil(log2(IC))) + 1
    # TODO: Fix?
    if im2col:
        num_OC_tiles = int(math.ceil(log2(OC))) + 1
    else:
        num_OC_tiles = int(math.ceil(log2(math.ceil(float(OC) / acc_obj.M)))) + 1
    num_B_tiles = int(math.ceil(log2(B))) + 1
    best_cycles = None
    best_energy = None
    best_tiling = None
    for _b in range(num_B_tiles):
        b = min(1 << _b, B)
        num_b = ceil_a_by_b(B, b)
        for _o in range(num_O_tiles):
            ow = min(1 << _o, O)
            oh = ow
            num_ow = ceil_a_by_b(O, ow)
            num_oh = ceil_a_by_b(O, oh)
            for _ic in range(num_IC_tiles):
                ic = min(1 << _ic, IC)
                num_ic = ceil_a_by_b(IC, ic)
                for _oc in range(num_OC_tiles):
                    if im2col:
                        oc = min((1 << _oc), OC)
                    else:
                        oc = min((1 << _oc) * acc_obj.M, OC)
                    num_oc = ceil_a_by_b(OC, oc)
                    iw = K + (ow - 1) * S
                    ih = K + (oh - 1) * S
                    tiling = {}
                    tiling['B/b'] = (num_b, b)
                    tiling['OW/ow'] = (num_ow, ow)
                    tiling['OH/oh'] = (num_oh, oh)
                    tiling['IC/ic'] = (num_ic, ic)
                    tiling['OC/oc'] = (num_oc, oc)
                    stats = get_stats_fast(conv_params, tiling, order_type, verbose=verbose)
                    if stats is None:
                        continue
                    cycles = stats.total_cycles
                    energy = stats.get_energy(energy_cost)
                    mem_cycles = stats.mem_stall_cycles
                    if best_cycles is None or best_cycles > cycles or \
                            (best_cycles == cycles and best_energy > energy):
                        # if best_energy is None or best_energy > energy or (best_energy == energy and best_cycles > cycles):
                        best_energy = energy
                        best_cycles = cycles
                        best_mem_cycles = mem_cycles
                        best_order = order_type
                        best_tiling = tiling
    # if best_cycles is None:
    #     print('Not found')
    #     print(conv_params)
    #     stats = get_stats_fast(conv_params, tiling, order_type, verbose=True)
    return (best_tiling, order_type, best_cycles, best_energy)
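```

To get a feel for the search cost: per ordering, the number of tilings is the product of the per-dimension candidate counts (ow = oh, so one loop covers both output dims). For an example layer with B=16, O=56, IC=OC=256 under im2col:

```python
import math
from math import log2

def n_tiles(x):
    # power-of-two candidates from 1 up to x (clipped), matching the loops above
    return int(math.ceil(log2(x))) + 1

configs_per_order = n_tiles(16) * n_tiles(56) * n_tiles(256) * n_tiles(256)
print(configs_per_order)        # 5 * 7 * 9 * 9 = 2835 tilings per order
print(configs_per_order * 120)  # ~340,000 get_stats_fast calls for this layer
```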
modeling method:
SRAM reads and writes are modeled based on GEMM tiling; I add analysis annotations alongside the relevant code.
```python
def get_stats_fast(conv_params, tiling, order_type, verbose=False):
    """
    Returns cycles and memory accesses to DRAM, IBUF, OBUF, and WBUF
    TODOs: Without im2col, the calculation of weight and ibuf size is inexact
    """
    acc_obj, K, O, S, IC, OC, B, iprec, wprec, im2col, weight_stationary, energy_cost = conv_params
    num_b, b = tiling['B/b']
    num_ow, ow = tiling['OW/ow']
    num_oh, oh = tiling['OH/oh']
    num_ic, ic = tiling['IC/ic']
    num_oc, oc = tiling['OC/oc']
    kw = kh = K
    # perf_factor = acc_obj.get_perf_factor(iprec, wprec)
    writes = {}
    reads = {}
    # dnnweaver models a layer pipeline:
    # 'writes' and 'reads' here stand for data movement within a single compute tile
    # load all tiled weight kernels of the op into on-chip SRAM (wbuf write);
    # weights are tiled only along the input-channel and output-channel dimensions
    writes['wgt'] = \
        ceil_a_by_b(K * K * ic, acc_obj.N * acc_obj.get_perf_factor(wprec)) * \
        acc_obj.N * acc_obj.get_perf_factor(wprec) * \
        oc * \
        wprec
    # load the tiled activation tensor into on-chip SRAM (ibuf write);
    # activations are tiled only along the input-channel and batch dimensions
    writes['act'] = ow * oh * \
        ceil_a_by_b(K * K * ic, acc_obj.M * acc_obj.get_perf_factor(iprec)) * \
        acc_obj.M * acc_obj.get_perf_factor(iprec) * \
        b * iprec
    oprec = 16
    writes['out'] = 0  # ow * oh * oc * b * oprec
    reads['out'] = ow * oh * oc * b * oprec
    # Skip if overutilizing resources
    # TODO check bytes/bits
    # (the *8/2 converts bytes to bits and halves the capacity for double buffering)
    overflow = False
    if writes['wgt'] > acc_obj.sram['wgt'] * 8 / 2:
        if verbose:
            print('wgt overflow: {}'.format(writes['wgt']))
            print(b, ow, oh, ic, oc)
        overflow = True
    if writes['act'] > acc_obj.sram['act'] * 8 / 2:
        if verbose:
            print('act overflow')
            print(b, ow, oh, ic, oc)
        overflow = True
    if writes['out'] > acc_obj.sram['out'] * 8 / 2:
        if verbose:
            print('out overflow')
            print(b, ow, oh, ic, oc)
        overflow = True
    if overflow:
        if verbose:
            print('Activation size: {} bytes'.format(writes['act'] / 8.))
            print('Weights size: {} bytes'.format(writes['wgt'] / 8.))
            print('Output size: {} bytes'.format(writes['out'] / 8.))
        return
    max_write_size = {}
    max_read_size = {}
    for namespace in writes:
        max_write_size[namespace] = writes[namespace]
    for namespace in reads:
        max_read_size[namespace] = reads[namespace]
    # First the loop block optimizations
    stats = Stats()
    write_promote = {'wgt': True, 'act': True, 'out': True}
    read_promote = {'out': True}
    if verbose:
        logger.debug('Initialize reads/writes')
        logger.debug('\tim2col: {}'.format(im2col))
        logger.debug('\tTiling: {}'.format(tiling))
        logger.debug('\tReads : {}'.format(reads))
        logger.debug('\tWrites: {}'.format(writes))
    # walk the loop nest from innermost to outermost and compute the data movement
    # between on-chip SRAM and off-chip DRAM according to the LOOP ORDER
    # DEPENDENCIES and the on-chip SRAM budget (from the configuration file)
    for loop in reversed(order_type):
        num_tiles, tile_size = tiling[loop]
        # promote all writes
        for namespace in writes:
            if write_promote[namespace]:
                # If the tile loop depends on the namespace index, grow the write size
                if tile_deps[loop][namespace]:
                    writes[namespace] *= num_tiles
                    # If the write size now exceeds the SRAM, stop promoting
                    if writes[namespace] > acc_obj.sram[namespace] * 8. / 2:
                        write_promote[namespace] = False
                    else:
                        max_write_size[namespace] = writes[namespace]
            else:
                writes[namespace] *= num_tiles
        # promote all reads
        for namespace in reads:
            if read_promote[namespace]:
                # If the tile loop depends on the namespace index, grow the read size
                if tile_deps[loop][namespace]:
                    reads[namespace] *= num_tiles
                    # If the read size now exceeds the SRAM, stop promoting
                    if reads[namespace] > acc_obj.sram[namespace] * 8. / 2:
                        read_promote[namespace] = False
                    else:
                        max_read_size[namespace] = writes[namespace]  # note: probably should be reads[namespace]
            else:
                reads[namespace] *= num_tiles
        if verbose:
            logger.debug('Loop: {}'.format(loop))
            logger.debug('\tLoop range: {}'.format(tiling[loop]))
            logger.debug('\tMax write size: {}'.format(max_write_size))
            logger.debug('\tMax read size: {}'.format(max_read_size))
            logger.debug('\tLoop Dependencies: {}'.format(tile_deps[loop]))
            logger.debug('\tLoop Promote: {}'.format(write_promote))
            logger.debug('\tReads : {}'.format(reads))
            logger.debug('\tWrites: {}'.format(writes))
    for namespace in writes:
        stats.writes[namespace] = writes[namespace]
        stats.reads['dram'] += writes[namespace]
    for namespace in reads:
        stats.reads[namespace] = reads[namespace]
        stats.writes['dram'] += reads[namespace]
    # Next the inner loop optimizations
    num_tiles = num_b * num_ow * num_oh * num_ic * num_oc
    if weight_stationary:
        if verbose:
            logger.debug('SRAM access order: Weight Stationary')
        stats.reads['act'] += num_tiles * (kw * kh * ic * oc) * (b * ow * oh) * iprec
        stats.reads['out'] += num_tiles * (kw * kh * ic * oc) * (b * ow * oh) * oprec
        stats.writes['out'] += num_tiles * (kw * kh * ic * oc) * (b * ow * oh) * oprec
        stats.reads['wgt'] += num_tiles * (kw * kh * ic * oc) * wprec
    else:
        if verbose:
            logger.debug('SRAM access order: Output Stationary')
        stats.reads['act'] += num_tiles * (oc * oh * ow * b) * (kw * kh * ic) * iprec
        stats.writes['out'] += num_tiles * (oc * oh * ow * b) * oprec
        stats.reads['wgt'] += num_tiles * (oc * oh * ow * b) * (kw * kh * ic) * wprec
        stats.reads['out'] += num_tiles * (oc * oh * ow * b) * oprec
    initial_dram_reads = 0
    final_dram_writes = 0
    for namespace in max_write_size:
        initial_dram_reads += max_write_size[namespace]
    for namespace in max_read_size:
        final_dram_writes += max_read_size[namespace]
    latency = acc_obj.get_mem_read_cycles('dram', initial_dram_reads) + \
        acc_obj.get_mem_write_cycles('dram', final_dram_writes)
    total_dram_accesses = stats.reads['dram'] + stats.writes['dram']
    middle_dram_accesses = total_dram_accesses - initial_dram_reads - final_dram_writes
    if weight_stationary:
        compute_cycles = num_tiles * acc_obj.get_compute_cycles(
            ic, oc, ow, oh, b, kw, kh, iprec, wprec, im2col)
    else:
        compute_cycles = num_tiles * acc_obj.get_compute_cycles_output_stationary(
            ic, oc, ow, oh, b, kw, kh, iprec, wprec, im2col)
    memory_cycles_required = ceil_a_by_b(middle_dram_accesses, acc_obj.mem_if_width)
    memory_stalls = max(0, memory_cycles_required - compute_cycles) + latency
    stats.total_cycles = compute_cycles + memory_stalls
    stats.mem_stall_cycles = memory_stalls
    if verbose:
        logger.debug('Compute cycles : {:>20,}'.format(compute_cycles))
        logger.debug('Memory cycles  : {:>20,}'.format(memory_cycles_required + latency))
        logger.debug('Memory stalls  : {:>20,}'.format(memory_stalls))
    return stats
```
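The promotion loop above hinges on tile_deps, which records whether a given tile loop's index selects different data in each buffer. I did not copy it here, but from the loop semantics (batch and output position index activations and outputs but not weights; input channel indexes activations and weights; output channel indexes weights and outputs) it should look like:

```python
# whether each tile loop's index touches new data in each buffer namespace
tile_deps = {
    'B/b':   {'act': True,  'wgt': False, 'out': True},
    'OW/ow': {'act': True,  'wgt': False, 'out': True},
    'OH/oh': {'act': True,  'wgt': False, 'out': True},
    'IC/ic': {'act': True,  'wgt': True,  'out': False},
    'OC/oc': {'act': False, 'wgt': True,  'out': True},
}
```

A loop whose index does not touch a buffer (e.g., OC/oc for 'act') lets that buffer's tile be reused across iterations, which is exactly what the promotion logic rewards.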
Cycle-accurate simulation
The method get_compute_cycles of class Accelerator only estimates compute cycles. The function get_loop_instructions provides cycle-accurate simulation for the previously determined tiling sizes and loop order by emitting instructions.
```python
# get_loop_instructions (excerpt)
...
instruction_ordered = LoopStack()
...
```
sim-cycle statistics API
optimize_for_order() returns the result of get_loop_instructions(), which builds and returns a LoopStack object.
loop_stack.py
```python
class LoopStack(object):
    ...
    def get_stats(self, acc_obj, verbose=False):
        ...
```
main components (Python classes):
- class Simulator
- class SimulatorSweep
- class Accelerator
Conclusion
The cycle-accurate modeling in dnnweaver2 appears to be coarse-grained; it is driven by ISA events (loop order).
In the future, I plan to study Timeloop/Accelergy.