当前位置：网站首页>Demo of intelligent computing system 2 bangc operator development (heterogeneous programming flow of CPU and mlu270)

Demo of intelligent computing system 2 bangc operator development (heterogeneous programming flow of CPU and mlu270)

2022-06-28 19:24:00 【No regrets!】

List of articles

1. First load the environment
2. Code

This article explains how to use Cambricon's BANG C language together with the CNRT runtime library (cnrt.h) to implement the CPU–MLU heterogeneous programming flow.

The standalone code for this article is hosted on Gitee.
The other experiment covered is powerDifference, from the later BANG C Experiment 1.

This is an implementation of the accumulation example on page 259 of the book "Intelligent Computing Systems".

The whole process is shown in the figure below

Insert picture description here

1. First load the environment

Environmental location
/home/zjq/AICSE-demo-student/env/env.sh

Change the environment content to a relative path

#!/bin/bash
# Environment for the AICSE demo. Must be sourced from the directory that
# contains the neuware/ folder, because the SDK paths are built from $PWD.

# Model and dataset roots.
export AICSE_MODELS_MODEL_HOME=/opt/Cambricon-Test/models
export AICSE_MODELS_DATA_HOME=/opt/Cambricon-Test/datasets/
# Alternative roots located next to this checkout (disabled):
#export AICSE_MODELS_MODEL_HOME=$PWD/../data/models
#export AICSE_MODELS_DATA_HOME=$PWD/../data/data
export TENSORFLOW_MODELS_DATA_HOME=$AICSE_MODELS_DATA_HOME

# Neuware SDK location; both variable names point at the same directory.
export NEUWARE=$PWD/neuware
export NEUWARE_HOME=$NEUWARE

# Tool lookup: the local SDK first, then the system-wide install.
export PATH=$PATH:$NEUWARE/bin:/usr/local/neuware/bin

# The library path is replaced outright (any previous value is discarded).
export LD_LIBRARY_PATH=$NEUWARE/lib64:/usr/local/neuware/lib64

source /etc/profile

Running environment
source env.sh

For convenience, run the following commands once; afterwards, typing bangcstart loads the environment directly.

# Append a "bangcstart" alias to ~/.bashrc. The alias enters the env directory
# (env.sh builds its paths from $PWD), sources env.sh, and then the
# argument-less pushd swaps the directory stack back to the starting directory.
echo 'alias bangcstart="pushd /home/zjq/AICSE-demo-student/env/ && source env.sh && pushd"' >> ~/.bashrc
# Reload ~/.bashrc so the alias is available in the current shell.
source ~/.bashrc

2. Code

Download code
cd /home/zjq && git clone https://gitee.com/nwu_zjq/cambrian-demo.git
The code below uses the powerDifference example,
which corresponds to Experiment 1 (the BANG C operator experiment) in "Intelligent Computing Systems".

Because the original project layout is somewhat chaotic, I reorganized it to make the whole CPU–MLU heterogeneous programming flow easier to follow: the build is managed with a Makefile, the source files live in the standard src directory, and the header files live in include.

In fact, the original code of the project is in /home/zjq/AICSE-demo-student/demo/style_transfer_bcl/src/bangc/PluginPowerDifferenceOp/

2.1 Makefile Project management preparation

# Prerequisite: load the Cambricon environment before running make, e.g.
#   pushd /home/zjq/AICSE-demo-student/env/ && source env.sh && pushd
# NEUWARE_HOME (used below) is exported by env.sh.

# cncc flags: target MLU architecture for the device (.mlu) code.
CNCCParams = --bang-mlu-arch=MLU200
# g++ flags: -g adds debug info, -std=c++11 selects the C++11 standard,
# -I adds the project and Neuware header directories.
GPPParams = -g -std=c++11 -Iinclude -I${NEUWARE_HOME}/include
# Link against the CNRT runtime library.
LINKParams = -L ${NEUWARE_HOME}/lib64 -lcnrt

object = obj/plugin_power_difference_kernel.o obj/plugin_power_difference_op.o obj/main.o

.PHONY: all clean

# Device code: compile each src/*.mlu with cncc into obj/*.o ($@ = target, $< = source).
obj/%.o: src/%.mlu
	mkdir -p $(@D)
	cncc ${CNCCParams} -o $@ -c $<

# Host code: compile each src/*.cpp with g++ into obj/*.o.
obj/%.o: src/%.cpp
	mkdir -p $(@D)
	g++ ${GPPParams} -o $@ -c $<

# Link all objects (main.o included; it resolves its calls from the other
# objects at link time) into ./main, then run it.
all: ${object}
	g++ ${object} -o main ${LINKParams}
	./main


clean:
	rm -rf obj main

With the Makefile above, simply running make builds the project, runs it, and prints the results.

2.2 MLU270 Code to execute

The _kernel.mlu file defines the code that runs on the MLU: it allocates on-chip NRAM buffers, copies the input data from device memory (GDRAM) into NRAM, performs the computation on the MLU core, and finally copies the result back to GDRAM, from where the host code copies it to CPU memory.

// /home/zjq/cambrian-demo/powerDifference/src/plugin_power_difference_kernel.mlu

// TODO：PowerDifference BCL Single core implementation 

#define ONELINE 64
/*
 * Single-core BANG C kernel: output[k] = |input1[k] - input2[k]| ^ pow.
 *
 * input1, input2 : device (GDRAM) input arrays of dims_a half values (X and Y)
 * pow            : integer exponent (Z); expected to be >= 1, values below 1
 *                  leave the result at |x - y|, as in the original code
 * output         : device (GDRAM) result array of dims_a half values
 * dims_a         : number of elements
 *
 * The data is processed in chunks of ONELINE elements staged through NRAM.
 * Only task 0 does any work; other tasks return immediately.
 */
__mlu_entry__ void PowerDifferenceKernel(   half* input1,  // X
                                            half* input2,  // Y
                                            int32_t pow,   // Z
                                            half* output,  // result
                                            int32_t dims_a) // element count
{
    if (taskId > 0) return;
    __bang_printf(" Total length  %d  Task dimension %d\n", dims_a, taskDim);

    // Number of ONELINE-sized chunks, rounded up to cover a partial tail chunk.
    int32_t quotient = dims_a / ONELINE;
    int32_t rem = dims_a % ONELINE;
    if (rem != 0) {
        quotient += 1;
    }

    // On-chip staging buffers.
    __nram__ half inputx_nram[ONELINE];
    __nram__ half inputy_nram[ONELINE];
    __nram__ half temp_nram[ONELINE];

    for (int32_t chunk = 0; chunk < quotient; chunk++) {
        // The last chunk may be partial: move only the valid elements so that
        // nothing past dims_a is read from or written to GDRAM.
        int32_t valid = (rem != 0 && chunk == quotient - 1) ? rem : ONELINE;
        int32_t valid_bytes = valid * sizeof(half);

        // Copy-in.
        __memcpy(inputx_nram, input1 + chunk * ONELINE, valid_bytes, GDRAM2NRAM);
        __memcpy(inputy_nram, input2 + chunk * ONELINE, valid_bytes, GDRAM2NRAM);

        // Compute on the full ONELINE vector; lanes beyond `valid` hold
        // unspecified data but are never copied out.
        __bang_sub(temp_nram, inputx_nram, inputy_nram, ONELINE);
        __bang_active_abs(temp_nram, temp_nram, ONELINE);

        // Keep the base |x - y| in inputx_nram (its input data is no longer
        // needed) and multiply the running product by it pow - 1 times.
        // Multiplying temp by itself would square the value on every pass and
        // give base^(2^(pow-1)) instead of base^pow.
        __memcpy(inputx_nram, temp_nram, ONELINE * sizeof(half), NRAM2NRAM);
        for (int32_t step = 1; step < pow; step++) {
            __bang_mul(temp_nram, temp_nram, inputx_nram, ONELINE);
        }

        // Copy-out.
        __memcpy(output + chunk * ONELINE, temp_nram, valid_bytes, NRAM2GDRAM);
    }
}

Corresponding header file

// /home/zjq/cambrian-demo/powerDifference/include/plugin_power_difference_kernel.h
#ifndef _PLUGIN_POWER_DIFFERENCE_KERNEL_H_
#define _PLUGIN_POWER_DIFFERENCE_KERNEL_H_
#ifdef __cplusplus
extern "C" {
#endif

#include <stdint.h> // uint16_t / int32_t used below; keeps this header self-contained
#include <stdlib.h>
#include "cnrt.h" // CNRT runtime library (MLU host API)
#include "cnrt_data.h"
#include "stdio.h"

/* Host-side stand-in for the device half type: one 16-bit storage unit per value. */
typedef uint16_t half;

/*
 * Host-visible declaration of the kernel defined in
 * plugin_power_difference_kernel.mlu. Host code takes its address to launch it
 * and passes the arguments through a kernel-params buffer in this order.
 *
 *   input1, input2 : device input arrays (X, Y)
 *   pow            : exponent (Z)
 *   output         : device result array
 *   dims_a         : number of elements
 */
void PowerDifferenceKernel(half* input1,half* input2,int32_t pow, half* output,int32_t dims_a);

#ifdef __cplusplus
}
#endif
#endif // _PLUGIN_POWER_DIFFERENCE_KERNEL_H_

2.3 CPU Code scheduling executed on MLU

The _op.cpp file prepares the data in CPU (host) memory, transfers it to and from the MLU, launches the kernel, and wraps all of this in an op-level API for the main function to call.

// /home/zjq/cambrian-demo/powerDifference/src/plugin_power_difference_op.cpp

#include "cnrt.h" //  call mlu Library function 
#include "cnrt_data.h"
#include "plugin_power_difference_kernel.h"
#include "plugin_power_difference_op.h"



int MLUPowerDifferenceOp(float* input1,float* input2, int pow, float*output, int dims_a) {
    
  
  cnrtInit(0); //  Initialization equipment 
  cnrtDev_t dev;
  cnrtGetDeviceHandle(&dev, 0);
  cnrtSetCurrentDevice(dev);
  cnrtQueue_t pQueue;
  cnrtCreateQueue(&pQueue);

  //  Set task division ,
  cnrtDim3_t dim;
  dim.x = 1; //  Here we have a single core ,  If it is dim.x=4,  It is 4 nucleus ,  That is, one line can calculate 64*4=256 position  
  dim.y = 1;
  dim.z = 1;
  float hardware_time = 0.0;
  cnrtNotifier_t event_start;
  cnrtNotifier_t event_end;
  cnrtCreateNotifier(&event_start);
  cnrtCreateNotifier(&event_end);
  cnrtFunctionType_t c = CNRT_FUNC_TYPE_BLOCK;

  //prepare data
  half* input1_half = (half*)malloc(dims_a * sizeof(half));
  half* input2_half = (half*)malloc(dims_a * sizeof(half));
  half* output_half = (half*)malloc(dims_a * sizeof(half));

  cnrtConvertFloatToHalfArray(input1_half, input1, dims_a);
  cnrtConvertFloatToHalfArray(input2_half, input2, dims_a);
  cnrtConvertFloatToHalfArray(output_half, output,dims_a);
 
  half *mlu_input1, *mlu_input2, *mlu_output;
  if (CNRT_RET_SUCCESS != cnrtMalloc((void**)&mlu_input1, dims_a * sizeof(half))) {
    
    printf("cnrtMalloc Failed!\n");
    exit(-1);
  }
  if (CNRT_RET_SUCCESS != cnrtMalloc((void**)&mlu_input2, dims_a * sizeof(half))) {
    
    printf("cnrtMalloc Failed!\n");
    exit(-1);
  }
  if (CNRT_RET_SUCCESS != cnrtMalloc((void**)&mlu_output, dims_a * sizeof(half))) {
    
    printf("cnrtMalloc output Failed!\n");
    exit(-1);
  }
  // TODO： complete cnrtMemcpy Copy in function 
  cnrtMemcpy(mlu_input1,input1_half,dims_a*sizeof(half),CNRT_MEM_TRANS_DIR_HOST2DEV);
  cnrtMemcpy(mlu_input2,input2_half,dims_a*sizeof(half),CNRT_MEM_TRANS_DIR_HOST2DEV);
 
  //kernel parameters
  cnrtKernelParamsBuffer_t params;
  cnrtGetKernelParamsBuffer(&params);
  cnrtKernelParamsBufferAddParam(params, &mlu_input1, sizeof(half*)); 
  cnrtKernelParamsBufferAddParam(params, &mlu_input2, sizeof(half*)); 
  cnrtKernelParamsBufferAddParam(params, &pow, sizeof(int));
  cnrtKernelParamsBufferAddParam(params, &mlu_output, sizeof(half*)); 
  cnrtKernelParamsBufferAddParam(params, &dims_a, sizeof(int)); 
  cnrtPlaceNotifier(event_start, pQueue);

  // TODO： complete cnrtInvokeKernel function 
  cnrtInvokeKernel_V2((void*)&PowerDifferenceKernel,dim,params,c,pQueue); if (CNRT_RET_SUCCESS != cnrtSyncQueue(pQueue))
  {
    
    printf("syncQueue Failed!\n");
    exit(-1);
  }
  cnrtPlaceNotifier(event_end, pQueue);
  
  //get output data
  // TODO： complete cnrtMemcpy Copy out the function 
  cnrtMemcpy(output_half,mlu_output,dims_a*sizeof(half),CNRT_MEM_TRANS_DIR_DEV2HOST);

  cnrtConvertHalfToFloatArray(output, output_half,dims_a );

  //free data
  if (CNRT_RET_SUCCESS != cnrtFree(mlu_input1)) {
    
    printf("cnrtFree Failed!\n");
    exit(-1);
  }
  if (CNRT_RET_SUCCESS != cnrtFree(mlu_input2)) {
    
    printf("cnrtFree Failed!\n");
    exit(-1);
  }
  if (CNRT_RET_SUCCESS != cnrtFree(mlu_output)) {
    
    printf("cnrtFree output Failed!\n");
    exit(-1);
  }
  if (CNRT_RET_SUCCESS != cnrtDestroyQueue(pQueue)) {
    
    printf("cnrtDestroyQueue Failed!\n");
    exit(-1);
  }
  if (CNRT_RET_SUCCESS != cnrtDestroyKernelParamsBuffer(params)) {
    
    printf("cnrtDestroyKernelParamsBuffer Failed!\n");
    return -1;
  }
  cnrtDestroy();
  free(input1_half);
  free(input2_half);
  free(output_half);
  return 0;
}

Corresponding header file

// /home/zjq/cambrian-demo/powerDifference/include/plugin_power_difference_op.h
#ifndef _PLUGIN_POWER_DIFFERENCE_OP_H_
#define _PLUGIN_POWER_DIFFERENCE_OP_H_
#ifdef __cplusplus
extern "C" {
    
#endif

#include <stdlib.h>
#include "stdio.h"

/*
 * Host-side entry point of the power-difference operator; implemented in
 * plugin_power_difference_op.cpp, which runs the computation on the MLU.
 *
 *   input1, input2 : host float arrays holding dims_a elements each
 *   pow            : integer exponent passed on to the device kernel
 *   output         : host float array of dims_a elements that receives the result
 *   dims_a         : number of elements
 *
 * Returns 0 on success; a non-zero value indicates failure. The
 * implementation terminates the process with exit(-1) on most CNRT errors
 * rather than returning.
 */
int MLUPowerDifferenceOp(float* input1,float* input2, int pow, float*output, int dims_a);

#ifdef __cplusplus
}
#endif
#endif

2.4 The main function

// /home/zjq/cambrian-demo/powerDifference/src/main.cpp
#include <math.h>
#include <time.h>
#include "stdio.h"
#include <stdlib.h>
#include <sys/time.h>
#include "plugin_power_difference_op.h" //  This includes CPU and MLU Interactive logic 

#define DATA_COUNT 32768
#define POW_COUNT 2
// int MLUPowerDifferenceOp(float* input1,float* input2, int pow, float*output, int dims_a);

/*
 * Test driver: reads DATA_COUNT values each from ./data/in_x.txt,
 * ./data/in_y.txt and the reference results ./data/out.txt, runs
 * MLUPowerDifferenceOp with exponent POW_COUNT, prints timings and a few
 * sample values, and reports the relative error against the reference.
 *
 * Returns 0 on success and 1 when allocation, file access, parsing or the
 * operator call fails.
 */
int main() {
  // All locals are declared before the first goto so that no jump to the
  // cleanup label bypasses an initialization.
  int status = 1;
  float* input_x = (float*)malloc(DATA_COUNT * sizeof(float));
  float* input_y = (float*)malloc(DATA_COUNT * sizeof(float));
  float* output_data = (float*)malloc(DATA_COUNT * sizeof(float));
  float* output_data_cpu = (float*)malloc(DATA_COUNT * sizeof(float));
  FILE* f_input_x = fopen("./data/in_x.txt", "r");
  FILE* f_input_y = fopen("./data/in_y.txt", "r");
  FILE* f_output_data = fopen("./data/out.txt", "r");
  struct timeval tpend, tpstart;
  float err = 0.0;
  float cpu_sum = 0.0;
  float time_use = 0.0;

  if (input_x == NULL || input_y == NULL || output_data == NULL || output_data_cpu == NULL) {
    printf("malloc fail!\n");
    goto cleanup;
  }
  if (f_input_x == NULL|| f_input_y == NULL || f_output_data == NULL) {
    printf("Open file fail!\n");
    goto cleanup;
  }

  // Load the inputs and the reference results, one value per line.
  gettimeofday(&tpstart, NULL);
  for (int i = 0; i < DATA_COUNT; i++) {
    if (fscanf(f_input_x, "%f\n", &input_x[i]) != 1 ||
        fscanf(f_input_y, "%f\n", &input_y[i]) != 1 ||
        fscanf(f_output_data, "%f\n", &output_data_cpu[i]) != 1) {
      printf("Read data fail at line %d!\n", i + 1);
      goto cleanup;
    }
  }
  gettimeofday(&tpend, NULL);
  time_use = 1000000 * (tpend.tv_sec - tpstart.tv_sec)+ tpend.tv_usec - tpstart.tv_usec;
  printf("get data cost time %f ms\n", time_use/1000.0);

  // Run the operator on the MLU and time the whole call.
  gettimeofday(&tpstart, NULL);
  if (MLUPowerDifferenceOp(input_x,input_y,POW_COUNT,output_data,DATA_COUNT) != 0) {
    printf("MLUPowerDifferenceOp fail!\n");
    goto cleanup;
  }
  gettimeofday(&tpend, NULL);
  time_use = 1000000 * (tpend.tv_sec - tpstart.tv_sec)+ tpend.tv_usec - tpstart.tv_usec;
  printf("compute data cost time %f ms\n", time_use/1000.0);
  printf("input x %f\n",input_x[0]);
  printf("input y %f\n",input_y[0]);
  printf("output data %f\n",output_data[0]);
  printf("output data %f\n",output_data[1]);
  printf("output data %f\n",output_data[2]);

  // Relative error: sum of absolute differences over sum of reference magnitudes.
  for(int i = 0; i < DATA_COUNT;++i)
  {
     err +=fabs(output_data_cpu[i] - output_data[i]) ;
     cpu_sum +=fabs(output_data_cpu[i]);
  }
  printf("err rate = %0.4f%%\n", err*100.0/cpu_sum);
  status = 0;

cleanup:
  // fclose(NULL) is undefined, so each stream is checked; free(NULL) is a no-op.
  if (f_input_x != NULL) fclose(f_input_x);
  if (f_input_y != NULL) fclose(f_input_y);
  if (f_output_data != NULL) fclose(f_output_data);
  free(input_x);
  free(input_y);
  free(output_data);
  free(output_data_cpu);
  return status;
}

Insert picture description here

2.5 experimental result

[email protected]:/home/zjq/cambrian-demo/powerDifference# make

/* g++ obj/plugin_power_difference_kernel.o obj/powerDiff.o obj/main.o -o main -L /home/zjq/AICSE-demo-student/env/neuware/lib64 -lcnrt ./main get data cost time 27.130000 ms CNRT: 4.2.1 fa5e44c compute data cost time 31.934000 ms input x 139.000000 input y 70.000000 output data 4760.000000 output data 15872.000000 output data 14880.000000 err rate = 0.0117% */

原网站

版权声明
本文为[No regrets!]所创，转载请带上原文链接，感谢
https://yzsam.com/2022/179/202206281902549196.html