A tool for debugging AscendC operators on the CPU
1 Description
Running an operator during AscendC operator development is fairly involved. To simplify this, this package wraps operator execution in functions that can be called directly from Python.
2 Installation
pip install l0n0lacltester
3 Running operators
3.1 First, source the CANN environment. For example, on my machine:
source /home/HwHiAiUser/Ascend/ascend-toolkit/set_env.sh
4 Creating a test project
4.1 Command-line arguments
l0n0lacltester -h
usage: l0n0lacltester [-h] op_path test_path
Create a test project

positional arguments:
  op_path     AscendC operator directory
  test_path   test project directory

optional arguments:
  -h, --help  show this help message and exit
4.2 Example
l0n0lacltester src/03_NLLLoss tests/NLLLoss
4.2.1 Project layout:
cmake
- cpu_lib.cmake
- npu_lib.cmake
include
- *.h
.gitignore
CMakeLists.txt
gen_code.py
run.py
run.sh
tiling_context.cpp
Of the files above, only gen_code.py and run.py need your attention.
4.2.2 Operator project setup
4.2.2.1 Closing the tiling namespace
By default, the optiling namespace looks like this:
namespace optiling {
static ge::graphStatus TilingFunc(gert::TilingContext *context) {
    return ge::GRAPH_SUCCESS;
}
}
Because this tool locates the namespace with regular expressions (Python's re module), you must append // namespace optiling to the closing brace of the namespace:
namespace optiling {
static ge::graphStatus TilingFunc(gert::TilingContext *context) {
    return ge::GRAPH_SUCCESS;
}
} // namespace optiling
4.3 gen_code.py
import os
from l0n0lacltester.gen_cpu_call_code import generate_all_codes

current_path = os.path.split(__file__)[0]

# e.g. ['/mnt/code/a', '/mnt/code/b']
include_dirs = [
]

# enum class KernelMode {
#     MIX_MODE = 0,
#     AIC_MODE,
#     AIV_MODE,
#     MIX_AIC_1_1,
# };
CPU_KERNEL_MODE = 'KernelMode::AIV_MODE'

generate_all_codes(f'{current_path}/../../src/03_NLLLoss', '.', include_dirs, CPU_KERNEL_MODE)
The parts to pay attention to are include_dirs and the first argument of generate_all_codes.
4.3.1 include_dirs
If the operator project uses .h files that live outside the operator project directory, add the absolute paths of those include directories to include_dirs.
For example:
include_dirs = [
    '/mnt/code/a',
    '/mnt/code/b'
]
4.3.2 CPU_KERNEL_MODE
This setting only takes effect in cpu mode. The available options are:
enum class KernelMode {
    MIX_MODE = 0,  // Mixed mode: both the vector units (AIV) and the cube unit (AIC) are enabled; each block has one AIC and n AIVs (n >= 1).
    AIC_MODE,      // Cube-only mode: each block contains exactly one AIC.
    AIV_MODE,      // Vector-only mode: each block contains exactly one AIV.
    MIX_AIC_1_1,   // AIC and AIV paired 1:1 into one block: each block contains one AIC and one AIV.
};
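To switch modes, assign a different KernelMode value (as a string) in gen_code.py. For example, a cube-only kernel would likely use AIC mode; the snippet below is only an illustrative variation of the earlier gen_code.py, not generated output:
CPU_KERNEL_MODE = 'KernelMode::AIC_MODE'  # hypothetical choice for a cube-only kernel
generate_all_codes(f'{current_path}/../../src/03_NLLLoss', '.', include_dirs, CPU_KERNEL_MODE)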
4.3.3 generate_all_codes
generate_all_codes generates the code required by the cpu and sim run modes. Its first argument is the absolute path of the operator project.
For example:
generate_all_codes(f'{current_path}/../../src/03_NLLLoss', '.', include_dirs, CPU_KERNEL_MODE)
Here current_path is the directory that contains gen_code.py.
4.4 run.py
Initially it contains:
import sys
import numpy as np
import l0n0lacltester as tester
from op_args import AscendCOpArgs

if sys.argv[1] == 'cpu' or sys.argv[1] == 'sim':
    from op_cpu import AscendCOp
else:
    from op_npu import AscendCOp
4.4.1 AscendCOp
AscendCOp can be used to invoke the operator:
b = 8
c = 32
ignore_index = -100
reduction='sum'
x_shape = [b, c]
target_shape = [b]
weight_shape = [c]
input_x = np.random.uniform(-5, 5, x_shape).astype(np.float32)
input_target = np.random.uniform(0, 31, target_shape).astype(np.int32)
input_weight = np.random.uniform(0, 1, weight_shape).astype(np.float32)
y = np.random.uniform(0, 1, [1]).astype(np.float32)
op = AscendCOp(reduction, ignore_index)
op(input_x, input_target, input_weight, y)
print('y = ', y)
Output:
x_memsize = 1024
target_memsize = 32
weight_memsize = 128
y_memsize = 4
workspace_memsize = 0
tiling_memsize = 1048576
[TmSim]: Run in serial mode.
[SUCCESS][CORE_0][pid 64322] exit success!
y= [-1.6584146]
4.4.2 AscendCOpArgs
AscendCOpArgs stores the operator arguments and can also be used to invoke AscendCOp.
The basic pattern is:
# Create the test case
args = AscendCOpArgs('record.json')
# Try to load 'record.json'
if not args.try_load():
    # Generate the test data here
    pass
# Invoke the operator
args.run_op(AscendCOp)
# Check accuracy
if accuracy_check_passed:  # placeholder for your own comparison
    tester.print_green("pass")
else:
    tester.print_red("fail")
    # Save the test data to 'record.json' so the failing case can be replayed
    args.save()
Example:
import sys
import torch
import numpy as np
import l0n0lacltester as tester
from op_args import AscendCOpArgs

if sys.argv[1] == 'cpu' or sys.argv[1] == 'sim':
    from op_cpu import AscendCOp
else:
    from op_npu import AscendCOp


def test(name: str, b: int, c: int, d: int, reduction: str, ignore_index: int):
    if c == 1 and d == 1:
        x_shape = [b]
        target_shape = [1]
        weight_shape = [1]
    elif d == 1:
        x_shape = [b, c]
        target_shape = [b]
        weight_shape = [c]
    else:
        x_shape = [b, c, d]
        target_shape = [b, d]
        weight_shape = [c]
    args = AscendCOpArgs(name)
    if not args.try_load():
        input_x = np.random.uniform(-5, 5, x_shape).astype(np.float32)
        input_target = np.random.uniform(0, 31, target_shape).astype(np.int32)
        input_weight = np.random.uniform(0, 1, weight_shape).astype(np.float32)
        golden = torch.nn.functional.nll_loss(
            torch.Tensor(input_x),
            torch.Tensor(input_target).to(torch.int64),
            torch.Tensor(input_weight),
            reduction=reduction, ignore_index=ignore_index)
        y = torch.ones_like(golden, dtype=torch.float32)
        args.set_ignore_index(ignore_index)
        args.set_reduction(reduction)
        args.set_x(input_x)
        args.set_target(input_target)
        args.set_weight(input_weight)
        args.set_y(y.numpy())
        args.set_golden(golden.numpy())
    args.run_op(AscendCOp)
    output = torch.tensor(args.get_y())
    golden = torch.tensor(args.get_golden())
    print(output)
    print(golden)
    if torch.allclose(output, golden, 1e-4, 1e-4):
        tester.print_green('pass')
        args.remove_record()
    else:
        tester.print_red("fail")
        args.save()


test("dim2_2.json", 800, 32, 1, 'sum', -100)
4.5 The COMMON_TILING macros
A COMMON_TILING macro is used to reuse a group of fields across tiling struct definitions.
The pattern is:
#define COMMON_TILING_XXX(arg) \
...
// END COMMON_TILING_XXX
- Note: the trailing // END COMMON_TILING_XXX comment is mandatory; the tool's regular expression relies on it.
For example, suppose we have the following tiling macro:
#define COMMON_TILING_FILED_DEF(prefix)                              \
    TILING_DATA_FIELD_DEF(int64_t, prefix##TileLength);              \
    TILING_DATA_FIELD_DEF(int64_t, prefix##FormerNum);               \
    TILING_DATA_FIELD_DEF(int64_t, prefix##FormerLength);            \
    TILING_DATA_FIELD_DEF(int64_t, prefix##FormerFinalCalcCount);    \
    TILING_DATA_FIELD_DEF(int64_t, prefix##TailLength);              \
    TILING_DATA_FIELD_DEF(int64_t, prefix##TailFinalCalcCount);      \
    TILING_DATA_FIELD_DEF(int64_t, prefix##FinalKernelFinalCalcCount); \
    TILING_DATA_FIELD_DEF(int64_t, prefix##KernelCount);
// END COMMON_TILING_FILED_DEF
Then tiling.h can use it:
#include "register/tilingdata_base.h"
#include "tiling_defines.h"
namespace optiling {
BEGIN_TILING_DATA_DEF(NLLLossTilingData)
TILING_DATA_FIELD_DEF(uint64_t, b);
TILING_DATA_FIELD_DEF(uint64_t, c);
TILING_DATA_FIELD_DEF(uint64_t, d);
TILING_DATA_FIELD_DEF(int64_t, reduction);
TILING_DATA_FIELD_DEF(int64_t, ignore_index);
COMMON_TILING_FILED_DEF(b);
TILING_DATA_FIELD_DEF(int32_t, dimFlag);
END_TILING_DATA_DEF;
REGISTER_TILING_DATA_CLASS(NLLLoss, NLLLossTilingData)
} // namespace optiling
5 Running
# bash run.sh -h
run.sh [option]
-v  chip model (default: Ascend910B1)
-r  run mode (cpu [default] | sim | npu)
-n  do not recompile the code (cpu|sim modes only)
-h  show this help
Examples (defaults: Ascend910B1, cpu mode):
bash run.sh
bash run.sh -v Ascend910B1 -r cpu
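The other modes follow the same option scheme; for example (assuming the selected chip model and run mode are available in your toolkit environment):
bash run.sh -r sim        # run in the simulator
bash run.sh -r npu        # run on a real NPU
bash run.sh -r cpu -n     # cpu mode, skip recompilation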