Apache TVM is an end-to-end deep learning compilation framework for CPUs, GPUs, and various machine learning accelerators. More TVM documentation in Chinese is available at → https://tvm.hyper.ai/

Authors: Masahiro Masuda, Truman Tian

This article describes how to use external libraries such as cuDNN or cuBLAS with Relay.
Relay internally uses TVM to generate target-specific code. For example, with the CUDA backend, TVM generates CUDA kernels for every layer in the user-provided network. Sometimes it is also useful to incorporate external libraries developed by various vendors into Relay. TVM has a mechanism to call these libraries transparently; for Relay users, all that is needed is to set an appropriate target string.
Before external libraries can be used from Relay, TVM must be built with the libraries you want to use. For example, to use cuDNN, enable the USE_CUDNN option in cmake/config.cmake and, if necessary, specify the cuDNN header and library directories.
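If you are unsure whether your TVM build actually includes cuDNN support, one quick runtime check is to look up the packed function that the cuDNN integration registers. This is a minimal sketch, not part of the original tutorial; the function name "tvm.contrib.cudnn.conv2d.forward" is the same one that appears in the cuDNN trace later in this article.

import tvm

# None means TVM was built without USE_CUDNN; otherwise the cuDNN
# convolution packed function is available at runtime.
cudnn_conv2d = tvm.get_global_func("tvm.contrib.cudnn.conv2d.forward", allow_missing=True)
print("cuDNN convolution available:", cudnn_conv2d is not None)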
First, import Relay and TVM.
import tvm
from tvm import te
import numpy as np
from tvm.contrib import graph_executor as runtime
from tvm import relay
from tvm.relay import testing
import tvm.testing

Create a simple network
Let's create a very simple network for demonstration. It consists of convolution, batch normalization, and ReLU activation.
out_channels = 16
batch_size = 1

data = relay.var("data", relay.TensorType((batch_size, 3, 224, 224), "float32"))
weight = relay.var("weight")
bn_gamma = relay.var("bn_gamma")
bn_beta = relay.var("bn_beta")
bn_mmean = relay.var("bn_mean")
bn_mvar = relay.var("bn_var")

simple_net = relay.nn.conv2d(
    data=data, weight=weight, kernel_size=(3, 3), channels=out_channels, padding=(1, 1)
)
simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, bn_mvar)[0]
simple_net = relay.nn.relu(simple_net)
simple_net = relay.Function(relay.analysis.free_vars(simple_net), simple_net)

data_shape = (batch_size, 3, 224, 224)
net, params = testing.create_workload(simple_net)

Build and run with the CUDA backend
We build and run this network with the CUDA backend, as usual. By setting the logging level to DEBUG, the result of Relay graph compilation is dumped as pseudo-code.
import logging

logging.basicConfig(level=logging.DEBUG)  # to dump TVM IR after fusion

target = "cuda"
lib = relay.build_module.build(net, target, params=params)

dev = tvm.device(target, 0)
data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
module = runtime.GraphModule(lib["default"](dev))
module.set_input("data", data)
module.run()
out_shape = (batch_size, out_channels, 224, 224)
out = module.get_output(0, tvm.nd.empty(out_shape))
out_cuda = out.numpy()

Output:
/workspace/python/tvm/driver/build_module.py:268: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.

The generated pseudo-code should look like the following. Note how bias add, batch normalization, and ReLU activation are fused into the convolution kernel: TVM generates a single fused kernel from this representation.
produce tensor {// attr [iter_var(blockIdx.z, , blockIdx.z)] thread_extent 1// attr [compute] storage_scope localallocate compute[float32 * 32]// attr [pad_temp.shared] storage_scope sharedallocate pad_temp.shared[float32 * 180]// attr [placeholder.shared] storage_scope sharedallocate placeholder.shared[float32 * 144]// attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent 28// attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent 14// attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent 4// attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent 1// attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent 16produce compute {compute[0] 0.000000fcompute[1] 0.000000fcompute[2] 0.000000fcompute[3] 0.000000fcompute[4] 0.000000fcompute[5] 0.000000fcompute[6] 0.000000fcompute[7] 0.000000fcompute[8] 0.000000fcompute[9] 0.000000fcompute[10] 0.000000fcompute[11] 0.000000fcompute[12] 0.000000fcompute[13] 0.000000fcompute[14] 0.000000fcompute[15] 0.000000fcompute[16] 0.000000fcompute[17] 0.000000fcompute[18] 0.000000fcompute[19] 0.000000fcompute[20] 0.000000fcompute[21] 0.000000fcompute[22] 0.000000fcompute[23] 0.000000fcompute[24] 0.000000fcompute[25] 0.000000fcompute[26] 0.000000fcompute[27] 0.000000fcompute[28] 0.000000fcompute[29] 0.000000fcompute[30] 0.000000fcompute[31] 0.000000ffor (rc.outer, 0, 3) {produce pad_temp.shared {// attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent 4// attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent 1// attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent 16if (likely(((threadIdx.z*15) (60 - threadIdx.x)))) {if (likely((threadIdx.x 15))) {pad_temp.shared[(((((threadIdx.z*15) threadIdx.x)/60)*180) ((((((threadIdx.z*15) threadIdx.x)/6) % 10)*18) ((((threadIdx.z*3) threadIdx.x)*3) % 18)))] tvm_if_then_else((((((1 - ((((threadIdx.z*15) threadIdx.x)/6) % 10)) (blockIdx.y*8)) ((blockIdx.y*8) (225 - ((((threadIdx.z*15) threadIdx.x)/6) % 10)))) ((1 - ((((threadIdx.z*3) threadIdx.x)*3) % 18)) (blockIdx.x*16))) ((blockIdx.x*16) (225 - ((((threadIdx.z*3) threadIdx.x)*3) % 18)))), placeholder[((((((((blockIdx.y*112) blockIdx.x) (rc.outer*3136)) ((((threadIdx.z*15) threadIdx.x)/60)*9408))*16) ((((threadIdx.z*3) threadIdx.x)*3) % 18)) (((((threadIdx.z*15) threadIdx.x)/6) % 10)*224)) -225)], 0.000000f)pad_temp.shared[(((((((threadIdx.z*15) threadIdx.x)*3) 1)/180)*180) ((((((((threadIdx.z*15) threadIdx.x)*3) 1)/18) % 10)*18) (((((threadIdx.z*3) threadIdx.x)*3) 1) % 18)))] tvm_if_then_else((((((1 - ((((((threadIdx.z*15) threadIdx.x)*3) 1)/18) % 10)) (blockIdx.y*8)) ((blockIdx.y*8) (225 - ((((((threadIdx.z*15) threadIdx.x)*3) 1)/18) % 10)))) ((1 - (((((threadIdx.z*3) threadIdx.x)*3) 1) % 18)) (blockIdx.x*16))) ((blockIdx.x*16) (225 - (((((threadIdx.z*3) threadIdx.x)*3) 1) % 18)))), placeholder[((((((((blockIdx.y*112) blockIdx.x) (rc.outer*3136)) ((((((threadIdx.z*15) threadIdx.x)*3) 1)/180)*9408))*16) (((((threadIdx.z*3) threadIdx.x)*3) 1) % 18)) (((((((threadIdx.z*15) threadIdx.x)*3) 1)/18) % 10)*224)) -225)], 0.000000f)pad_temp.shared[(((((((threadIdx.z*15) threadIdx.x)*3) 2)/180)*180) ((((((((threadIdx.z*15) threadIdx.x)*3) 2)/18) % 10)*18) (((((threadIdx.z*3) threadIdx.x)*3) 2) % 18)))] tvm_if_then_else((((((1 - ((((((threadIdx.z*15) threadIdx.x)*3) 2)/18) % 10)) (blockIdx.y*8)) ((blockIdx.y*8) (225 - ((((((threadIdx.z*15) threadIdx.x)*3) 2)/18) % 10)))) ((1 - (((((threadIdx.z*3) threadIdx.x)*3) 2) % 18)) (blockIdx.x*16))) ((blockIdx.x*16) (225 - (((((threadIdx.z*3) threadIdx.x)*3) 2) % 18)))), placeholder[((((((((blockIdx.y*112) blockIdx.x) 
(rc.outer*3136)) ((((((threadIdx.z*15) threadIdx.x)*3) 2)/180)*9408))*16) (((((threadIdx.z*3) threadIdx.x)*3) 2) % 18)) (((((((threadIdx.z*15) threadIdx.x)*3) 2)/18) % 10)*224)) -225)], 0.000000f)}}}produce placeholder.shared {// attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent 4// attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent 1// attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent 16if (likely(((threadIdx.z*4) (16 - (threadIdx.x/3))))) {if (likely(((threadIdx.z*12) (48 - threadIdx.x)))) {if (likely((threadIdx.x 12))) {placeholder.shared[(((((threadIdx.z*4) (threadIdx.x/3))*3) (threadIdx.x % 3))*3)] placeholder[(((((rc.outer (threadIdx.z*12)) ((threadIdx.x/3)*3))*3) (threadIdx.x % 3))*3)]placeholder.shared[((((((threadIdx.z*4) (threadIdx.x/3))*3) (threadIdx.x % 3))*3) 1)] placeholder[((((((rc.outer (threadIdx.z*12)) ((threadIdx.x/3)*3))*3) (threadIdx.x % 3))*3) 1)]placeholder.shared[((((((threadIdx.z*4) (threadIdx.x/3))*3) (threadIdx.x % 3))*3) 2)] placeholder[((((((rc.outer (threadIdx.z*12)) ((threadIdx.x/3)*3))*3) (threadIdx.x % 3))*3) 2)]}}}}compute[0] (compute[0] (pad_temp.shared[threadIdx.x]*placeholder.shared[(threadIdx.z*36)]))compute[1] (compute[1] (pad_temp.shared[(threadIdx.x 18)]*placeholder.shared[(threadIdx.z*36)]))compute[2] (compute[2] (pad_temp.shared[(threadIdx.x 36)]*placeholder.shared[(threadIdx.z*36)]))compute[3] (compute[3] (pad_temp.shared[(threadIdx.x 54)]*placeholder.shared[(threadIdx.z*36)]))compute[4] (compute[4] (pad_temp.shared[(threadIdx.x 72)]*placeholder.shared[(threadIdx.z*36)]))compute[5] (compute[5] (pad_temp.shared[(threadIdx.x 90)]*placeholder.shared[(threadIdx.z*36)]))compute[6] (compute[6] (pad_temp.shared[(threadIdx.x 108)]*placeholder.shared[(threadIdx.z*36)]))compute[7] (compute[7] (pad_temp.shared[(threadIdx.x 126)]*placeholder.shared[(threadIdx.z*36)]))compute[8] (compute[8] (pad_temp.shared[threadIdx.x]*placeholder.shared[((threadIdx.z*36) 9)]))compute[9] (compute[9] (pad_temp.shared[(threadIdx.x 18)]*placeholder.shared[((threadIdx.z*36) 9)]))compute[10] (compute[10] (pad_temp.shared[(threadIdx.x 36)]*placeholder.shared[((threadIdx.z*36) 9)]))compute[11] (compute[11] (pad_temp.shared[(threadIdx.x 54)]*placeholder.shared[((threadIdx.z*36) 9)]))compute[12] (compute[12] (pad_temp.shared[(threadIdx.x 72)]*placeholder.shared[((threadIdx.z*36) 9)]))compute[13] (compute[13] (pad_temp.shared[(threadIdx.x 90)]*placeholder.shared[((threadIdx.z*36) 9)]))compute[14] (compute[14] (pad_temp.shared[(threadIdx.x 108)]*placeholder.shared[((threadIdx.z*36) 9)]))compute[15] (compute[15] (pad_temp.shared[(threadIdx.x 126)]*placeholder.shared[((threadIdx.z*36) 9)]))compute[16] (compute[16] (pad_temp.shared[threadIdx.x]*placeholder.shared[((threadIdx.z*36) 18)]))compute[17] (compute[17] (pad_temp.shared[(threadIdx.x 18)]*placeholder.shared[((threadIdx.z*36) 18)]))compute[18] (compute[18] (pad_temp.shared[(threadIdx.x 36)]*placeholder.shared[((threadIdx.z*36) 18)]))compute[19] (compute[19] (pad_temp.shared[(threadIdx.x 54)]*placeholder.shared[((threadIdx.z*36) 18)]))compute[20] (compute[20] (pad_temp.shared[(threadIdx.x 72)]*placeholder.shared[((threadIdx.z*36) 18)]))compute[21] (compute[21] (pad_temp.shared[(threadIdx.x 90)]*placeholder.shared[((threadIdx.z*36) 18)]))compute[22] (compute[22] (pad_temp.shared[(threadIdx.x 108)]*placeholder.shared[((threadIdx.z*36) 18)]))compute[23] (compute[23] (pad_temp.shared[(threadIdx.x 126)]*placeholder.shared[((threadIdx.z*36) 18)]))compute[24] (compute[24] 
(pad_temp.shared[threadIdx.x]*placeholder.shared[((threadIdx.z*36) 27)]))compute[25] (compute[25] (pad_temp.shared[(threadIdx.x 18)]*placeholder.shared[((threadIdx.z*36) 27)]))compute[26] (compute[26] (pad_temp.shared[(threadIdx.x 36)]*placeholder.shared[((threadIdx.z*36) 27)]))compute[27] (compute[27] (pad_temp.shared[(threadIdx.x 54)]*placeholder.shared[((threadIdx.z*36) 27)]))compute[28] (compute[28] (pad_temp.shared[(threadIdx.x 72)]*placeholder.shared[((threadIdx.z*36) 27)]))compute[29] (compute[29] (pad_temp.shared[(threadIdx.x 90)]*placeholder.shared[((threadIdx.z*36) 27)]))compute[30] (compute[30] (pad_temp.shared[(threadIdx.x 108)]*placeholder.shared[((threadIdx.z*36) 27)]))compute[31] (compute[31] (pad_temp.shared[(threadIdx.x 126)]*placeholder.shared[((threadIdx.z*36) 27)]))compute[0] (compute[0] (pad_temp.shared[(threadIdx.x 1)]*placeholder.shared[((threadIdx.z*36) 1)]))compute[1] (compute[1] (pad_temp.shared[(threadIdx.x 19)]*placeholder.shared[((threadIdx.z*36) 1)]))compute[2] (compute[2] (pad_temp.shared[(threadIdx.x 37)]*placeholder.shared[((threadIdx.z*36) 1)]))compute[3] (compute[3] (pad_temp.shared[(threadIdx.x 55)]*placeholder.shared[((threadIdx.z*36) 1)]))compute[4] (compute[4] (pad_temp.shared[(threadIdx.x 73)]*placeholder.shared[((threadIdx.z*36) 1)]))compute[5] (compute[5] (pad_temp.shared[(threadIdx.x 91)]*placeholder.shared[((threadIdx.z*36) 1)]))compute[6] (compute[6] (pad_temp.shared[(threadIdx.x 109)]*placeholder.shared[((threadIdx.z*36) 1)]))compute[7] (compute[7] (pad_temp.shared[(threadIdx.x 127)]*placeholder.shared[((threadIdx.z*36) 1)]))compute[8] (compute[8] (pad_temp.shared[(threadIdx.x 1)]*placeholder.shared[((threadIdx.z*36) 10)]))compute[9] (compute[9] (pad_temp.shared[(threadIdx.x 19)]*placeholder.shared[((threadIdx.z*36) 10)]))compute[10] (compute[10] (pad_temp.shared[(threadIdx.x 37)]*placeholder.shared[((threadIdx.z*36) 10)]))compute[11] (compute[11] (pad_temp.shared[(threadIdx.x 55)]*placeholder.shared[((threadIdx.z*36) 10)]))compute[12] (compute[12] (pad_temp.shared[(threadIdx.x 73)]*placeholder.shared[((threadIdx.z*36) 10)]))compute[13] (compute[13] (pad_temp.shared[(threadIdx.x 91)]*placeholder.shared[((threadIdx.z*36) 10)]))compute[14] (compute[14] (pad_temp.shared[(threadIdx.x 109)]*placeholder.shared[((threadIdx.z*36) 10)]))compute[15] (compute[15] (pad_temp.shared[(threadIdx.x 127)]*placeholder.shared[((threadIdx.z*36) 10)]))compute[16] (compute[16] (pad_temp.shared[(threadIdx.x 1)]*placeholder.shared[((threadIdx.z*36) 19)]))compute[17] (compute[17] (pad_temp.shared[(threadIdx.x 19)]*placeholder.shared[((threadIdx.z*36) 19)]))compute[18] (compute[18] (pad_temp.shared[(threadIdx.x 37)]*placeholder.shared[((threadIdx.z*36) 19)]))compute[19] (compute[19] (pad_temp.shared[(threadIdx.x 55)]*placeholder.shared[((threadIdx.z*36) 19)]))compute[20] (compute[20] (pad_temp.shared[(threadIdx.x 73)]*placeholder.shared[((threadIdx.z*36) 19)]))compute[21] (compute[21] (pad_temp.shared[(threadIdx.x 91)]*placeholder.shared[((threadIdx.z*36) 19)]))compute[22] (compute[22] (pad_temp.shared[(threadIdx.x 109)]*placeholder.shared[((threadIdx.z*36) 19)]))compute[23] (compute[23] (pad_temp.shared[(threadIdx.x 127)]*placeholder.shared[((threadIdx.z*36) 19)]))compute[24] (compute[24] (pad_temp.shared[(threadIdx.x 1)]*placeholder.shared[((threadIdx.z*36) 28)]))compute[25] (compute[25] (pad_temp.shared[(threadIdx.x 19)]*placeholder.shared[((threadIdx.z*36) 28)]))compute[26] (compute[26] (pad_temp.shared[(threadIdx.x 37)]*placeholder.shared[((threadIdx.z*36) 
28)]))compute[27] (compute[27] (pad_temp.shared[(threadIdx.x 55)]*placeholder.shared[((threadIdx.z*36) 28)]))compute[28] (compute[28] (pad_temp.shared[(threadIdx.x 73)]*placeholder.shared[((threadIdx.z*36) 28)]))compute[29] (compute[29] (pad_temp.shared[(threadIdx.x 91)]*placeholder.shared[((threadIdx.z*36) 28)]))compute[30] (compute[30] (pad_temp.shared[(threadIdx.x 109)]*placeholder.shared[((threadIdx.z*36) 28)]))compute[31] (compute[31] (pad_temp.shared[(threadIdx.x 127)]*placeholder.shared[((threadIdx.z*36) 28)]))compute[0] (compute[0] (pad_temp.shared[(threadIdx.x 2)]*placeholder.shared[((threadIdx.z*36) 2)]))compute[1] (compute[1] (pad_temp.shared[(threadIdx.x 20)]*placeholder.shared[((threadIdx.z*36) 2)]))compute[2] (compute[2] (pad_temp.shared[(threadIdx.x 38)]*placeholder.shared[((threadIdx.z*36) 2)]))compute[3] (compute[3] (pad_temp.shared[(threadIdx.x 56)]*placeholder.shared[((threadIdx.z*36) 2)]))compute[4] (compute[4] (pad_temp.shared[(threadIdx.x 74)]*placeholder.shared[((threadIdx.z*36) 2)]))compute[5] (compute[5] (pad_temp.shared[(threadIdx.x 92)]*placeholder.shared[((threadIdx.z*36) 2)]))compute[6] (compute[6] (pad_temp.shared[(threadIdx.x 110)]*placeholder.shared[((threadIdx.z*36) 2)]))compute[7] (compute[7] (pad_temp.shared[(threadIdx.x 128)]*placeholder.shared[((threadIdx.z*36) 2)]))compute[8] (compute[8] (pad_temp.shared[(threadIdx.x 2)]*placeholder.shared[((threadIdx.z*36) 11)]))compute[9] (compute[9] (pad_temp.shared[(threadIdx.x 20)]*placeholder.shared[((threadIdx.z*36) 11)]))compute[10] (compute[10] (pad_temp.shared[(threadIdx.x 38)]*placeholder.shared[((threadIdx.z*36) 11)]))compute[11] (compute[11] (pad_temp.shared[(threadIdx.x 56)]*placeholder.shared[((threadIdx.z*36) 11)]))compute[12] (compute[12] (pad_temp.shared[(threadIdx.x 74)]*placeholder.shared[((threadIdx.z*36) 11)]))compute[13] (compute[13] (pad_temp.shared[(threadIdx.x 92)]*placeholder.shared[((threadIdx.z*36) 11)]))compute[14] (compute[14] (pad_temp.shared[(threadIdx.x 110)]*placeholder.shared[((threadIdx.z*36) 11)]))compute[15] (compute[15] (pad_temp.shared[(threadIdx.x 128)]*placeholder.shared[((threadIdx.z*36) 11)]))compute[16] (compute[16] (pad_temp.shared[(threadIdx.x 2)]*placeholder.shared[((threadIdx.z*36) 20)]))compute[17] (compute[17] (pad_temp.shared[(threadIdx.x 20)]*placeholder.shared[((threadIdx.z*36) 20)]))compute[18] (compute[18] (pad_temp.shared[(threadIdx.x 38)]*placeholder.shared[((threadIdx.z*36) 20)]))compute[19] (compute[19] (pad_temp.shared[(threadIdx.x 56)]*placeholder.shared[((threadIdx.z*36) 20)]))compute[20] (compute[20] (pad_temp.shared[(threadIdx.x 74)]*placeholder.shared[((threadIdx.z*36) 20)]))compute[21] (compute[21] (pad_temp.shared[(threadIdx.x 92)]*placeholder.shared[((threadIdx.z*36) 20)]))compute[22] (compute[22] (pad_temp.shared[(threadIdx.x 110)]*placeholder.shared[((threadIdx.z*36) 20)]))compute[23] (compute[23] (pad_temp.shared[(threadIdx.x 128)]*placeholder.shared[((threadIdx.z*36) 20)]))compute[24] (compute[24] (pad_temp.shared[(threadIdx.x 2)]*placeholder.shared[((threadIdx.z*36) 29)]))compute[25] (compute[25] (pad_temp.shared[(threadIdx.x 20)]*placeholder.shared[((threadIdx.z*36) 29)]))compute[26] (compute[26] (pad_temp.shared[(threadIdx.x 38)]*placeholder.shared[((threadIdx.z*36) 29)]))compute[27] (compute[27] (pad_temp.shared[(threadIdx.x 56)]*placeholder.shared[((threadIdx.z*36) 29)]))compute[28] (compute[28] (pad_temp.shared[(threadIdx.x 74)]*placeholder.shared[((threadIdx.z*36) 29)]))compute[29] (compute[29] (pad_temp.shared[(threadIdx.x 
92)]*placeholder.shared[((threadIdx.z*36) 29)]))compute[30] (compute[30] (pad_temp.shared[(threadIdx.x 110)]*placeholder.shared[((threadIdx.z*36) 29)]))compute[31] (compute[31] (pad_temp.shared[(threadIdx.x 128)]*placeholder.shared[((threadIdx.z*36) 29)]))compute[0] (compute[0] (pad_temp.shared[(threadIdx.x 18)]*placeholder.shared[((threadIdx.z*36) 3)]))compute[1] (compute[1] (pad_temp.shared[(threadIdx.x 36)]*placeholder.shared[((threadIdx.z*36) 3)]))compute[2] (compute[2] (pad_temp.shared[(threadIdx.x 54)]*placeholder.shared[((threadIdx.z*36) 3)]))compute[3] (compute[3] (pad_temp.shared[(threadIdx.x 72)]*placeholder.shared[((threadIdx.z*36) 3)]))compute[4] (compute[4] (pad_temp.shared[(threadIdx.x 90)]*placeholder.shared[((threadIdx.z*36) 3)]))compute[5] (compute[5] (pad_temp.shared[(threadIdx.x 108)]*placeholder.shared[((threadIdx.z*36) 3)]))compute[6] (compute[6] (pad_temp.shared[(threadIdx.x 126)]*placeholder.shared[((threadIdx.z*36) 3)]))compute[7] (compute[7] (pad_temp.shared[(threadIdx.x 144)]*placeholder.shared[((threadIdx.z*36) 3)]))compute[8] (compute[8] (pad_temp.shared[(threadIdx.x 18)]*placeholder.shared[((threadIdx.z*36) 12)]))compute[9] (compute[9] (pad_temp.shared[(threadIdx.x 36)]*placeholder.shared[((threadIdx.z*36) 12)]))compute[10] (compute[10] (pad_temp.shared[(threadIdx.x 54)]*placeholder.shared[((threadIdx.z*36) 12)]))compute[11] (compute[11] (pad_temp.shared[(threadIdx.x 72)]*placeholder.shared[((threadIdx.z*36) 12)]))compute[12] (compute[12] (pad_temp.shared[(threadIdx.x 90)]*placeholder.shared[((threadIdx.z*36) 12)]))compute[13] (compute[13] (pad_temp.shared[(threadIdx.x 108)]*placeholder.shared[((threadIdx.z*36) 12)]))compute[14] (compute[14] (pad_temp.shared[(threadIdx.x 126)]*placeholder.shared[((threadIdx.z*36) 12)]))compute[15] (compute[15] (pad_temp.shared[(threadIdx.x 144)]*placeholder.shared[((threadIdx.z*36) 12)]))compute[16] (compute[16] (pad_temp.shared[(threadIdx.x 18)]*placeholder.shared[((threadIdx.z*36) 21)]))compute[17] (compute[17] (pad_temp.shared[(threadIdx.x 36)]*placeholder.shared[((threadIdx.z*36) 21)]))compute[18] (compute[18] (pad_temp.shared[(threadIdx.x 54)]*placeholder.shared[((threadIdx.z*36) 21)]))compute[19] (compute[19] (pad_temp.shared[(threadIdx.x 72)]*placeholder.shared[((threadIdx.z*36) 21)]))compute[20] (compute[20] (pad_temp.shared[(threadIdx.x 90)]*placeholder.shared[((threadIdx.z*36) 21)]))compute[21] (compute[21] (pad_temp.shared[(threadIdx.x 108)]*placeholder.shared[((threadIdx.z*36) 21)]))compute[22] (compute[22] (pad_temp.shared[(threadIdx.x 126)]*placeholder.shared[((threadIdx.z*36) 21)]))compute[23] (compute[23] (pad_temp.shared[(threadIdx.x 144)]*placeholder.shared[((threadIdx.z*36) 21)]))compute[24] (compute[24] (pad_temp.shared[(threadIdx.x 18)]*placeholder.shared[((threadIdx.z*36) 30)]))compute[25] (compute[25] (pad_temp.shared[(threadIdx.x 36)]*placeholder.shared[((threadIdx.z*36) 30)]))compute[26] (compute[26] (pad_temp.shared[(threadIdx.x 54)]*placeholder.shared[((threadIdx.z*36) 30)]))compute[27] (compute[27] (pad_temp.shared[(threadIdx.x 72)]*placeholder.shared[((threadIdx.z*36) 30)]))compute[28] (compute[28] (pad_temp.shared[(threadIdx.x 90)]*placeholder.shared[((threadIdx.z*36) 30)]))compute[29] (compute[29] (pad_temp.shared[(threadIdx.x 108)]*placeholder.shared[((threadIdx.z*36) 30)]))compute[30] (compute[30] (pad_temp.shared[(threadIdx.x 126)]*placeholder.shared[((threadIdx.z*36) 30)]))compute[31] (compute[31] (pad_temp.shared[(threadIdx.x 144)]*placeholder.shared[((threadIdx.z*36) 30)]))compute[0] 
(compute[0] (pad_temp.shared[(threadIdx.x 19)]*placeholder.shared[((threadIdx.z*36) 4)]))compute[1] (compute[1] (pad_temp.shared[(threadIdx.x 37)]*placeholder.shared[((threadIdx.z*36) 4)]))compute[2] (compute[2] (pad_temp.shared[(threadIdx.x 55)]*placeholder.shared[((threadIdx.z*36) 4)]))compute[3] (compute[3] (pad_temp.shared[(threadIdx.x 73)]*placeholder.shared[((threadIdx.z*36) 4)]))compute[4] (compute[4] (pad_temp.shared[(threadIdx.x 91)]*placeholder.shared[((threadIdx.z*36) 4)]))compute[5] (compute[5] (pad_temp.shared[(threadIdx.x 109)]*placeholder.shared[((threadIdx.z*36) 4)]))compute[6] (compute[6] (pad_temp.shared[(threadIdx.x 127)]*placeholder.shared[((threadIdx.z*36) 4)]))compute[7] (compute[7] (pad_temp.shared[(threadIdx.x 145)]*placeholder.shared[((threadIdx.z*36) 4)]))compute[8] (compute[8] (pad_temp.shared[(threadIdx.x 19)]*placeholder.shared[((threadIdx.z*36) 13)]))compute[9] (compute[9] (pad_temp.shared[(threadIdx.x 37)]*placeholder.shared[((threadIdx.z*36) 13)]))compute[10] (compute[10] (pad_temp.shared[(threadIdx.x 55)]*placeholder.shared[((threadIdx.z*36) 13)]))compute[11] (compute[11] (pad_temp.shared[(threadIdx.x 73)]*placeholder.shared[((threadIdx.z*36) 13)]))compute[12] (compute[12] (pad_temp.shared[(threadIdx.x 91)]*placeholder.shared[((threadIdx.z*36) 13)]))compute[13] (compute[13] (pad_temp.shared[(threadIdx.x 109)]*placeholder.shared[((threadIdx.z*36) 13)]))compute[14] (compute[14] (pad_temp.shared[(threadIdx.x 127)]*placeholder.shared[((threadIdx.z*36) 13)]))compute[15] (compute[15] (pad_temp.shared[(threadIdx.x 145)]*placeholder.shared[((threadIdx.z*36) 13)]))compute[16] (compute[16] (pad_temp.shared[(threadIdx.x 19)]*placeholder.shared[((threadIdx.z*36) 22)]))compute[17] (compute[17] (pad_temp.shared[(threadIdx.x 37)]*placeholder.shared[((threadIdx.z*36) 22)]))compute[18] (compute[18] (pad_temp.shared[(threadIdx.x 55)]*placeholder.shared[((threadIdx.z*36) 22)]))compute[19] (compute[19] (pad_temp.shared[(threadIdx.x 73)]*placeholder.shared[((threadIdx.z*36) 22)]))compute[20] (compute[20] (pad_temp.shared[(threadIdx.x 91)]*placeholder.shared[((threadIdx.z*36) 22)]))compute[21] (compute[21] (pad_temp.shared[(threadIdx.x 109)]*placeholder.shared[((threadIdx.z*36) 22)]))compute[22] (compute[22] (pad_temp.shared[(threadIdx.x 127)]*placeholder.shared[((threadIdx.z*36) 22)]))compute[23] (compute[23] (pad_temp.shared[(threadIdx.x 145)]*placeholder.shared[((threadIdx.z*36) 22)]))compute[24] (compute[24] (pad_temp.shared[(threadIdx.x 19)]*placeholder.shared[((threadIdx.z*36) 31)]))compute[25] (compute[25] (pad_temp.shared[(threadIdx.x 37)]*placeholder.shared[((threadIdx.z*36) 31)]))compute[26] (compute[26] (pad_temp.shared[(threadIdx.x 55)]*placeholder.shared[((threadIdx.z*36) 31)]))compute[27] (compute[27] (pad_temp.shared[(threadIdx.x 73)]*placeholder.shared[((threadIdx.z*36) 31)]))compute[28] (compute[28] (pad_temp.shared[(threadIdx.x 91)]*placeholder.shared[((threadIdx.z*36) 31)]))compute[29] (compute[29] (pad_temp.shared[(threadIdx.x 109)]*placeholder.shared[((threadIdx.z*36) 31)]))compute[30] (compute[30] (pad_temp.shared[(threadIdx.x 127)]*placeholder.shared[((threadIdx.z*36) 31)]))compute[31] (compute[31] (pad_temp.shared[(threadIdx.x 145)]*placeholder.shared[((threadIdx.z*36) 31)]))compute[0] (compute[0] (pad_temp.shared[(threadIdx.x 20)]*placeholder.shared[((threadIdx.z*36) 5)]))compute[1] (compute[1] (pad_temp.shared[(threadIdx.x 38)]*placeholder.shared[((threadIdx.z*36) 5)]))compute[2] (compute[2] (pad_temp.shared[(threadIdx.x 
56)]*placeholder.shared[((threadIdx.z*36) 5)]))compute[3] (compute[3] (pad_temp.shared[(threadIdx.x 74)]*placeholder.shared[((threadIdx.z*36) 5)]))compute[4] (compute[4] (pad_temp.shared[(threadIdx.x 92)]*placeholder.shared[((threadIdx.z*36) 5)]))compute[5] (compute[5] (pad_temp.shared[(threadIdx.x 110)]*placeholder.shared[((threadIdx.z*36) 5)]))compute[6] (compute[6] (pad_temp.shared[(threadIdx.x 128)]*placeholder.shared[((threadIdx.z*36) 5)]))compute[7] (compute[7] (pad_temp.shared[(threadIdx.x 146)]*placeholder.shared[((threadIdx.z*36) 5)]))compute[8] (compute[8] (pad_temp.shared[(threadIdx.x 20)]*placeholder.shared[((threadIdx.z*36) 14)]))compute[9] (compute[9] (pad_temp.shared[(threadIdx.x 38)]*placeholder.shared[((threadIdx.z*36) 14)]))compute[10] (compute[10] (pad_temp.shared[(threadIdx.x 56)]*placeholder.shared[((threadIdx.z*36) 14)]))compute[11] (compute[11] (pad_temp.shared[(threadIdx.x 74)]*placeholder.shared[((threadIdx.z*36) 14)]))compute[12] (compute[12] (pad_temp.shared[(threadIdx.x 92)]*placeholder.shared[((threadIdx.z*36) 14)]))compute[13] (compute[13] (pad_temp.shared[(threadIdx.x 110)]*placeholder.shared[((threadIdx.z*36) 14)]))compute[14] (compute[14] (pad_temp.shared[(threadIdx.x 128)]*placeholder.shared[((threadIdx.z*36) 14)]))compute[15] (compute[15] (pad_temp.shared[(threadIdx.x 146)]*placeholder.shared[((threadIdx.z*36) 14)]))compute[16] (compute[16] (pad_temp.shared[(threadIdx.x 20)]*placeholder.shared[((threadIdx.z*36) 23)]))compute[17] (compute[17] (pad_temp.shared[(threadIdx.x 38)]*placeholder.shared[((threadIdx.z*36) 23)]))compute[18] (compute[18] (pad_temp.shared[(threadIdx.x 56)]*placeholder.shared[((threadIdx.z*36) 23)]))compute[19] (compute[19] (pad_temp.shared[(threadIdx.x 74)]*placeholder.shared[((threadIdx.z*36) 23)]))compute[20] (compute[20] (pad_temp.shared[(threadIdx.x 92)]*placeholder.shared[((threadIdx.z*36) 23)]))compute[21] (compute[21] (pad_temp.shared[(threadIdx.x 110)]*placeholder.shared[((threadIdx.z*36) 23)]))compute[22] (compute[22] (pad_temp.shared[(threadIdx.x 128)]*placeholder.shared[((threadIdx.z*36) 23)]))compute[23] (compute[23] (pad_temp.shared[(threadIdx.x 146)]*placeholder.shared[((threadIdx.z*36) 23)]))compute[24] (compute[24] (pad_temp.shared[(threadIdx.x 20)]*placeholder.shared[((threadIdx.z*36) 32)]))compute[25] (compute[25] (pad_temp.shared[(threadIdx.x 38)]*placeholder.shared[((threadIdx.z*36) 32)]))compute[26] (compute[26] (pad_temp.shared[(threadIdx.x 56)]*placeholder.shared[((threadIdx.z*36) 32)]))compute[27] (compute[27] (pad_temp.shared[(threadIdx.x 74)]*placeholder.shared[((threadIdx.z*36) 32)]))compute[28] (compute[28] (pad_temp.shared[(threadIdx.x 92)]*placeholder.shared[((threadIdx.z*36) 32)]))compute[29] (compute[29] (pad_temp.shared[(threadIdx.x 110)]*placeholder.shared[((threadIdx.z*36) 32)]))compute[30] (compute[30] (pad_temp.shared[(threadIdx.x 128)]*placeholder.shared[((threadIdx.z*36) 32)]))compute[31] (compute[31] (pad_temp.shared[(threadIdx.x 146)]*placeholder.shared[((threadIdx.z*36) 32)]))compute[0] (compute[0] (pad_temp.shared[(threadIdx.x 36)]*placeholder.shared[((threadIdx.z*36) 6)]))compute[1] (compute[1] (pad_temp.shared[(threadIdx.x 54)]*placeholder.shared[((threadIdx.z*36) 6)]))compute[2] (compute[2] (pad_temp.shared[(threadIdx.x 72)]*placeholder.shared[((threadIdx.z*36) 6)]))compute[3] (compute[3] (pad_temp.shared[(threadIdx.x 90)]*placeholder.shared[((threadIdx.z*36) 6)]))compute[4] (compute[4] (pad_temp.shared[(threadIdx.x 108)]*placeholder.shared[((threadIdx.z*36) 6)]))compute[5] (compute[5] 
(pad_temp.shared[(threadIdx.x 126)]*placeholder.shared[((threadIdx.z*36) 6)]))compute[6] (compute[6] (pad_temp.shared[(threadIdx.x 144)]*placeholder.shared[((threadIdx.z*36) 6)]))compute[7] (compute[7] (pad_temp.shared[(threadIdx.x 162)]*placeholder.shared[((threadIdx.z*36) 6)]))compute[8] (compute[8] (pad_temp.shared[(threadIdx.x 36)]*placeholder.shared[((threadIdx.z*36) 15)]))compute[9] (compute[9] (pad_temp.shared[(threadIdx.x 54)]*placeholder.shared[((threadIdx.z*36) 15)]))compute[10] (compute[10] (pad_temp.shared[(threadIdx.x 72)]*placeholder.shared[((threadIdx.z*36) 15)]))compute[11] (compute[11] (pad_temp.shared[(threadIdx.x 90)]*placeholder.shared[((threadIdx.z*36) 15)]))compute[12] (compute[12] (pad_temp.shared[(threadIdx.x 108)]*placeholder.shared[((threadIdx.z*36) 15)]))compute[13] (compute[13] (pad_temp.shared[(threadIdx.x 126)]*placeholder.shared[((threadIdx.z*36) 15)]))compute[14] (compute[14] (pad_temp.shared[(threadIdx.x 144)]*placeholder.shared[((threadIdx.z*36) 15)]))compute[15] (compute[15] (pad_temp.shared[(threadIdx.x 162)]*placeholder.shared[((threadIdx.z*36) 15)]))compute[16] (compute[16] (pad_temp.shared[(threadIdx.x 36)]*placeholder.shared[((threadIdx.z*36) 24)]))compute[17] (compute[17] (pad_temp.shared[(threadIdx.x 54)]*placeholder.shared[((threadIdx.z*36) 24)]))compute[18] (compute[18] (pad_temp.shared[(threadIdx.x 72)]*placeholder.shared[((threadIdx.z*36) 24)]))compute[19] (compute[19] (pad_temp.shared[(threadIdx.x 90)]*placeholder.shared[((threadIdx.z*36) 24)]))compute[20] (compute[20] (pad_temp.shared[(threadIdx.x 108)]*placeholder.shared[((threadIdx.z*36) 24)]))compute[21] (compute[21] (pad_temp.shared[(threadIdx.x 126)]*placeholder.shared[((threadIdx.z*36) 24)]))compute[22] (compute[22] (pad_temp.shared[(threadIdx.x 144)]*placeholder.shared[((threadIdx.z*36) 24)]))compute[23] (compute[23] (pad_temp.shared[(threadIdx.x 162)]*placeholder.shared[((threadIdx.z*36) 24)]))compute[24] (compute[24] (pad_temp.shared[(threadIdx.x 36)]*placeholder.shared[((threadIdx.z*36) 33)]))compute[25] (compute[25] (pad_temp.shared[(threadIdx.x 54)]*placeholder.shared[((threadIdx.z*36) 33)]))compute[26] (compute[26] (pad_temp.shared[(threadIdx.x 72)]*placeholder.shared[((threadIdx.z*36) 33)]))compute[27] (compute[27] (pad_temp.shared[(threadIdx.x 90)]*placeholder.shared[((threadIdx.z*36) 33)]))compute[28] (compute[28] (pad_temp.shared[(threadIdx.x 108)]*placeholder.shared[((threadIdx.z*36) 33)]))compute[29] (compute[29] (pad_temp.shared[(threadIdx.x 126)]*placeholder.shared[((threadIdx.z*36) 33)]))compute[30] (compute[30] (pad_temp.shared[(threadIdx.x 144)]*placeholder.shared[((threadIdx.z*36) 33)]))compute[31] (compute[31] (pad_temp.shared[(threadIdx.x 162)]*placeholder.shared[((threadIdx.z*36) 33)]))compute[0] (compute[0] (pad_temp.shared[(threadIdx.x 37)]*placeholder.shared[((threadIdx.z*36) 7)]))compute[1] (compute[1] (pad_temp.shared[(threadIdx.x 55)]*placeholder.shared[((threadIdx.z*36) 7)]))compute[2] (compute[2] (pad_temp.shared[(threadIdx.x 73)]*placeholder.shared[((threadIdx.z*36) 7)]))compute[3] (compute[3] (pad_temp.shared[(threadIdx.x 91)]*placeholder.shared[((threadIdx.z*36) 7)]))compute[4] (compute[4] (pad_temp.shared[(threadIdx.x 109)]*placeholder.shared[((threadIdx.z*36) 7)]))compute[5] (compute[5] (pad_temp.shared[(threadIdx.x 127)]*placeholder.shared[((threadIdx.z*36) 7)]))compute[6] (compute[6] (pad_temp.shared[(threadIdx.x 145)]*placeholder.shared[((threadIdx.z*36) 7)]))compute[7] (compute[7] (pad_temp.shared[(threadIdx.x 
163)]*placeholder.shared[((threadIdx.z*36) 7)]))compute[8] (compute[8] (pad_temp.shared[(threadIdx.x 37)]*placeholder.shared[((threadIdx.z*36) 16)]))compute[9] (compute[9] (pad_temp.shared[(threadIdx.x 55)]*placeholder.shared[((threadIdx.z*36) 16)]))compute[10] (compute[10] (pad_temp.shared[(threadIdx.x 73)]*placeholder.shared[((threadIdx.z*36) 16)]))compute[11] (compute[11] (pad_temp.shared[(threadIdx.x 91)]*placeholder.shared[((threadIdx.z*36) 16)]))compute[12] (compute[12] (pad_temp.shared[(threadIdx.x 109)]*placeholder.shared[((threadIdx.z*36) 16)]))compute[13] (compute[13] (pad_temp.shared[(threadIdx.x 127)]*placeholder.shared[((threadIdx.z*36) 16)]))compute[14] (compute[14] (pad_temp.shared[(threadIdx.x 145)]*placeholder.shared[((threadIdx.z*36) 16)]))compute[15] (compute[15] (pad_temp.shared[(threadIdx.x 163)]*placeholder.shared[((threadIdx.z*36) 16)]))compute[16] (compute[16] (pad_temp.shared[(threadIdx.x 37)]*placeholder.shared[((threadIdx.z*36) 25)]))compute[17] (compute[17] (pad_temp.shared[(threadIdx.x 55)]*placeholder.shared[((threadIdx.z*36) 25)]))compute[18] (compute[18] (pad_temp.shared[(threadIdx.x 73)]*placeholder.shared[((threadIdx.z*36) 25)]))compute[19] (compute[19] (pad_temp.shared[(threadIdx.x 91)]*placeholder.shared[((threadIdx.z*36) 25)]))compute[20] (compute[20] (pad_temp.shared[(threadIdx.x 109)]*placeholder.shared[((threadIdx.z*36) 25)]))compute[21] (compute[21] (pad_temp.shared[(threadIdx.x 127)]*placeholder.shared[((threadIdx.z*36) 25)]))compute[22] (compute[22] (pad_temp.shared[(threadIdx.x 145)]*placeholder.shared[((threadIdx.z*36) 25)]))compute[23] (compute[23] (pad_temp.shared[(threadIdx.x 163)]*placeholder.shared[((threadIdx.z*36) 25)]))compute[24] (compute[24] (pad_temp.shared[(threadIdx.x 37)]*placeholder.shared[((threadIdx.z*36) 34)]))compute[25] (compute[25] (pad_temp.shared[(threadIdx.x 55)]*placeholder.shared[((threadIdx.z*36) 34)]))compute[26] (compute[26] (pad_temp.shared[(threadIdx.x 73)]*placeholder.shared[((threadIdx.z*36) 34)]))compute[27] (compute[27] (pad_temp.shared[(threadIdx.x 91)]*placeholder.shared[((threadIdx.z*36) 34)]))compute[28] (compute[28] (pad_temp.shared[(threadIdx.x 109)]*placeholder.shared[((threadIdx.z*36) 34)]))compute[29] (compute[29] (pad_temp.shared[(threadIdx.x 127)]*placeholder.shared[((threadIdx.z*36) 34)]))compute[30] (compute[30] (pad_temp.shared[(threadIdx.x 145)]*placeholder.shared[((threadIdx.z*36) 34)]))compute[31] (compute[31] (pad_temp.shared[(threadIdx.x 163)]*placeholder.shared[((threadIdx.z*36) 34)]))compute[0] (compute[0] (pad_temp.shared[(threadIdx.x 38)]*placeholder.shared[((threadIdx.z*36) 8)]))compute[1] (compute[1] (pad_temp.shared[(threadIdx.x 56)]*placeholder.shared[((threadIdx.z*36) 8)]))compute[2] (compute[2] (pad_temp.shared[(threadIdx.x 74)]*placeholder.shared[((threadIdx.z*36) 8)]))compute[3] (compute[3] (pad_temp.shared[(threadIdx.x 92)]*placeholder.shared[((threadIdx.z*36) 8)]))compute[4] (compute[4] (pad_temp.shared[(threadIdx.x 110)]*placeholder.shared[((threadIdx.z*36) 8)]))compute[5] (compute[5] (pad_temp.shared[(threadIdx.x 128)]*placeholder.shared[((threadIdx.z*36) 8)]))compute[6] (compute[6] (pad_temp.shared[(threadIdx.x 146)]*placeholder.shared[((threadIdx.z*36) 8)]))compute[7] (compute[7] (pad_temp.shared[(threadIdx.x 164)]*placeholder.shared[((threadIdx.z*36) 8)]))compute[8] (compute[8] (pad_temp.shared[(threadIdx.x 38)]*placeholder.shared[((threadIdx.z*36) 17)]))compute[9] (compute[9] (pad_temp.shared[(threadIdx.x 56)]*placeholder.shared[((threadIdx.z*36) 17)]))compute[10] 
(compute[10] (pad_temp.shared[(threadIdx.x 74)]*placeholder.shared[((threadIdx.z*36) 17)]))compute[11] (compute[11] (pad_temp.shared[(threadIdx.x 92)]*placeholder.shared[((threadIdx.z*36) 17)]))compute[12] (compute[12] (pad_temp.shared[(threadIdx.x 110)]*placeholder.shared[((threadIdx.z*36) 17)]))compute[13] (compute[13] (pad_temp.shared[(threadIdx.x 128)]*placeholder.shared[((threadIdx.z*36) 17)]))compute[14] (compute[14] (pad_temp.shared[(threadIdx.x 146)]*placeholder.shared[((threadIdx.z*36) 17)]))compute[15] (compute[15] (pad_temp.shared[(threadIdx.x 164)]*placeholder.shared[((threadIdx.z*36) 17)]))compute[16] (compute[16] (pad_temp.shared[(threadIdx.x 38)]*placeholder.shared[((threadIdx.z*36) 26)]))compute[17] (compute[17] (pad_temp.shared[(threadIdx.x 56)]*placeholder.shared[((threadIdx.z*36) 26)]))compute[18] (compute[18] (pad_temp.shared[(threadIdx.x 74)]*placeholder.shared[((threadIdx.z*36) 26)]))compute[19] (compute[19] (pad_temp.shared[(threadIdx.x 92)]*placeholder.shared[((threadIdx.z*36) 26)]))compute[20] (compute[20] (pad_temp.shared[(threadIdx.x 110)]*placeholder.shared[((threadIdx.z*36) 26)]))compute[21] (compute[21] (pad_temp.shared[(threadIdx.x 128)]*placeholder.shared[((threadIdx.z*36) 26)]))compute[22] (compute[22] (pad_temp.shared[(threadIdx.x 146)]*placeholder.shared[((threadIdx.z*36) 26)]))compute[23] (compute[23] (pad_temp.shared[(threadIdx.x 164)]*placeholder.shared[((threadIdx.z*36) 26)]))compute[24] (compute[24] (pad_temp.shared[(threadIdx.x 38)]*placeholder.shared[((threadIdx.z*36) 35)]))compute[25] (compute[25] (pad_temp.shared[(threadIdx.x 56)]*placeholder.shared[((threadIdx.z*36) 35)]))compute[26] (compute[26] (pad_temp.shared[(threadIdx.x 74)]*placeholder.shared[((threadIdx.z*36) 35)]))compute[27] (compute[27] (pad_temp.shared[(threadIdx.x 92)]*placeholder.shared[((threadIdx.z*36) 35)]))compute[28] (compute[28] (pad_temp.shared[(threadIdx.x 110)]*placeholder.shared[((threadIdx.z*36) 35)]))compute[29] (compute[29] (pad_temp.shared[(threadIdx.x 128)]*placeholder.shared[((threadIdx.z*36) 35)]))compute[30] (compute[30] (pad_temp.shared[(threadIdx.x 146)]*placeholder.shared[((threadIdx.z*36) 35)]))compute[31] (compute[31] (pad_temp.shared[(threadIdx.x 164)]*placeholder.shared[((threadIdx.z*36) 35)]))}}tensor[(((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x)] max(((compute[0]*placeholder[(threadIdx.z*4)]) placeholder[(threadIdx.z*4)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 224)] max(((compute[1]*placeholder[(threadIdx.z*4)]) placeholder[(threadIdx.z*4)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 448)] max(((compute[2]*placeholder[(threadIdx.z*4)]) placeholder[(threadIdx.z*4)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 672)] max(((compute[3]*placeholder[(threadIdx.z*4)]) placeholder[(threadIdx.z*4)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 896)] max(((compute[4]*placeholder[(threadIdx.z*4)]) placeholder[(threadIdx.z*4)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 1120)] max(((compute[5]*placeholder[(threadIdx.z*4)]) placeholder[(threadIdx.z*4)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 1344)] max(((compute[6]*placeholder[(threadIdx.z*4)]) placeholder[(threadIdx.z*4)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 
1568)] max(((compute[7]*placeholder[(threadIdx.z*4)]) placeholder[(threadIdx.z*4)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 50176)] max(((compute[8]*placeholder[((threadIdx.z*4) 1)]) placeholder[((threadIdx.z*4) 1)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 50400)] max(((compute[9]*placeholder[((threadIdx.z*4) 1)]) placeholder[((threadIdx.z*4) 1)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 50624)] max(((compute[10]*placeholder[((threadIdx.z*4) 1)]) placeholder[((threadIdx.z*4) 1)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 50848)] max(((compute[11]*placeholder[((threadIdx.z*4) 1)]) placeholder[((threadIdx.z*4) 1)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 51072)] max(((compute[12]*placeholder[((threadIdx.z*4) 1)]) placeholder[((threadIdx.z*4) 1)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 51296)] max(((compute[13]*placeholder[((threadIdx.z*4) 1)]) placeholder[((threadIdx.z*4) 1)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 51520)] max(((compute[14]*placeholder[((threadIdx.z*4) 1)]) placeholder[((threadIdx.z*4) 1)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 51744)] max(((compute[15]*placeholder[((threadIdx.z*4) 1)]) placeholder[((threadIdx.z*4) 1)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 100352)] max(((compute[16]*placeholder[((threadIdx.z*4) 2)]) placeholder[((threadIdx.z*4) 2)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 100576)] max(((compute[17]*placeholder[((threadIdx.z*4) 2)]) placeholder[((threadIdx.z*4) 2)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 100800)] max(((compute[18]*placeholder[((threadIdx.z*4) 2)]) placeholder[((threadIdx.z*4) 2)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 101024)] max(((compute[19]*placeholder[((threadIdx.z*4) 2)]) placeholder[((threadIdx.z*4) 2)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 101248)] max(((compute[20]*placeholder[((threadIdx.z*4) 2)]) placeholder[((threadIdx.z*4) 2)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 101472)] max(((compute[21]*placeholder[((threadIdx.z*4) 2)]) placeholder[((threadIdx.z*4) 2)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 101696)] max(((compute[22]*placeholder[((threadIdx.z*4) 2)]) placeholder[((threadIdx.z*4) 2)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 101920)] max(((compute[23]*placeholder[((threadIdx.z*4) 2)]) placeholder[((threadIdx.z*4) 2)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 150528)] max(((compute[24]*placeholder[((threadIdx.z*4) 3)]) placeholder[((threadIdx.z*4) 3)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 150752)] max(((compute[25]*placeholder[((threadIdx.z*4) 3)]) placeholder[((threadIdx.z*4) 3)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 150976)] max(((compute[26]*placeholder[((threadIdx.z*4) 3)]) 
placeholder[((threadIdx.z*4) 3)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 151200)] max(((compute[27]*placeholder[((threadIdx.z*4) 3)]) placeholder[((threadIdx.z*4) 3)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 151424)] max(((compute[28]*placeholder[((threadIdx.z*4) 3)]) placeholder[((threadIdx.z*4) 3)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 151648)] max(((compute[29]*placeholder[((threadIdx.z*4) 3)]) placeholder[((threadIdx.z*4) 3)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 151872)] max(((compute[30]*placeholder[((threadIdx.z*4) 3)]) placeholder[((threadIdx.z*4) 3)]), 0.000000f)tensor[((((((blockIdx.y*112) blockIdx.x) (threadIdx.z*12544))*16) threadIdx.x) 152096)] max(((compute[31]*placeholder[((threadIdx.z*4) 3)]) placeholder[((threadIdx.z*4) 3)]), 0.000000f)
}

Use cuDNN for a convolutional layer
To have the convolution kernel replaced with a cuDNN one, all we need to do is append the option "-libs=cudnn" to the target string.
net, params = testing.create_workload(simple_net)
target = "cuda -libs=cudnn"  # use cudnn for convolution
lib = relay.build_module.build(net, target, params=params)

dev = tvm.device(target, 0)
data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
module = runtime.GraphModule(lib["default"](dev))
module.set_input("data", data)
module.run()
out_shape = (batch_size, out_channels, 224, 224)
out = module.get_output(0, tvm.nd.empty(out_shape))
out_cudnn = out.numpy()

Output:
/workspace/python/tvm/driver/build_module.py:268: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.

Note that when cuDNN is used, Relay cannot fuse the convolution with the layers that follow it. This is because layer fusion happens at the level of TVM's internal representation (IR); Relay treats external libraries as black boxes, so there is no way to fuse them with TVM IR.
The pseudo-code below shows that the cuDNN convolution + bias add + batch norm + ReLU sequence becomes two stages of computation: one for the cuDNN call and another for the remaining operations.
// attr [y] storage_scope = "global"
allocate y[float32 * 802816]
produce y {// attr [0] extern_scope = 0tvm_call_packed("tvm.contrib.cudnn.conv2d.forward", 1, 0, 1, 1, 1, 1, 1, 1, 1, tvm_stack_make_array(placeholder, tvm_stack_make_shape(1, 3, 224, 224), 0, 4, 0.000000f, 0), tvm_stack_make_array(placeholder, tvm_stack_make_shape(16, 3, 3, 3), 0, 4, 0.000000f, 0), tvm_stack_make_array(y, tvm_stack_make_shape(1, 16, 224, 224), 0, 4, 0.000000f, 0))
}
produce tensor {// attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent 256// attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent 512for (ax0.ax1.fused.ax2.fused.ax3.fused.outer, 0, 7) {if (likely(((blockIdx.x*512) ((802816 - (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072)) - threadIdx.x)))) {tensor[(((((((blockIdx.x*512) threadIdx.x) (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/802816)*802816) (((((((blockIdx.x*512) threadIdx.x) (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/224) % 224)*224) ((((blockIdx.x*64) threadIdx.x) (ax0.ax1.fused.ax2.fused.ax3.fused.outer*32)) % 224))) ((((((blockIdx.x*512) threadIdx.x) (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/50176) % 16)*50176))] max(((y[(((((((blockIdx.x*512) threadIdx.x) (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/802816)*802816) (((((((blockIdx.x*512) threadIdx.x) (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/224) % 224)*224) ((((blockIdx.x*64) threadIdx.x) (ax0.ax1.fused.ax2.fused.ax3.fused.outer*32)) % 224))) ((((((blockIdx.x*512) threadIdx.x) (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/50176) % 16)*50176))]*placeholder[(((((blockIdx.x*512) threadIdx.x) (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/50176) % 16)]) placeholder[(((((blockIdx.x*512) threadIdx.x) (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/50176) % 16)]), 0.000000f)}}
}

Verify the result
We can check that the results of the two runs match.
tvm.testing.assert_allclose(out_cuda, out_cudnn, rtol=1e-5)

Conclusion
This tutorial covered the use of cuDNN with Relay. cuBLAS is also supported; if enabled, it is used inside fully connected layers (relay.dense). To use cuBLAS, set the target string to "cuda -libs=cublas". You can use both cuDNN and cuBLAS with "cuda -libs=cudnn,cublas".
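As an illustrative sketch (not part of the original tutorial), the snippet below offloads a single fully connected layer to cuBLAS. It reuses the imports from the beginning of this article, assumes TVM was built with the USE_CUBLAS option enabled, and the variable names are arbitrary.

data = relay.var("data", shape=(1, 512), dtype="float32")
dense_weight = relay.var("dense_weight", shape=(256, 512), dtype="float32")
dense_out = relay.nn.dense(data, dense_weight)
dense_func = relay.Function(relay.analysis.free_vars(dense_out), dense_out)
dense_net, dense_params = testing.create_workload(dense_func)

target = "cuda -libs=cublas"  # relay.dense is dispatched to cuBLAS
lib = relay.build_module.build(dense_net, target, params=dense_params)

dev = tvm.device(target, 0)
module = runtime.GraphModule(lib["default"](dev))
module.set_input("data", np.random.uniform(size=(1, 512)).astype("float32"))
module.run()
print(module.get_output(0).numpy().shape)  # (1, 256)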
For the ROCm backend, MIOpen and rocBLAS are supported. They can be enabled with the target "rocm -libs=miopen,rocblas".
Notes on using external libraries
First, using external libraries may restrict how you can use TVM and Relay. For example, MIOpen currently only supports the NCHW layout and the fp32 data type, so other layouts or data types cannot be used with it in TVM.
Second, as shown above, external libraries limit the opportunities for operator fusion during graph compilation. TVM and Relay aim to achieve the best performance on a variety of hardware through joint operator-level and graph-level optimization. To reach this goal, we should keep developing better optimizations for TVM and Relay, while using external libraries as a way to fall back to existing implementations when necessary.
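When deciding whether to fall back to an external library, one practical criterion is simply to time both builds. The sketch below is minimal and hypothetical: it assumes the two graph modules built earlier have been kept around under the names module_cuda (plain "cuda" target) and module_cudnn ("cuda -libs=cudnn").

# module_cuda / module_cudnn are hypothetical names for the GraphModule
# objects built earlier with and without the cuDNN fallback.
dev = tvm.device("cuda", 0)
for name, mod in [("TVM fused kernel", module_cuda), ("cuDNN convolution", module_cudnn)]:
    timer = mod.module.time_evaluator("run", dev, number=10, repeat=3)
    result = timer()
    print("%s: %.3f ms" % (name, result.mean * 1e3))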
Download Python source code: using_external_lib.py
Download Jupyter notebook: using_external_lib.ipynb