Torch's tensor

A tensor in Torch is an n-dimensional array, and it is the most fundamental and important component of Torch.
The following code demonstrates how to access a tensor by index. It first creates a 1-D tensor of length 10, fills it with 0, 1, 2, ..., 9, and then iterates over the tensor, printing its contents.
#include "TH/TH.h"

#define Real Double
#define real double

int main() {
 THTensor* tensor1 = THTensor_(newWithSize1d)(10);
 for (int64_t i = 0; i != 10; ++i) {
  THTensor_(set1d)(tensor1, i, (real)i);  
 }
 for (int i = 0; i != 10; ++i) {
  printf("%g ", THTensor_(get1d)(tensor1, i));
 }
 printf("\n");
 THTensor_(free)(tensor1);
 return 0;
}
The output should be:
0 1 2 3 4 5 6 7 8 9
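A note on the two #define lines at the top: TH implements generic functions with the C preprocessor. THTensor_(NAME) pastes Real into the function name, so with Real defined as Double, THTensor_(newWithSize1d) expands to THDoubleTensor_newWithSize1d, and real is the matching C element type. The definitions in TH/THTensor.h look roughly like this (quoted from memory, so treat them as a sketch):
#define THTensor          TH_CONCAT_3(TH,Real,Tensor)
#define THTensor_(NAME)   TH_CONCAT_4(TH,Real,Tensor_,NAME)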
There are two ways to access a tensor by index. The way shown above, through the set/get functions, is slow. The faster way looks like this:
#include "TH/TH.h"

#define Real Double
#define real double

int main() {
  THTensor* tensor1 = THTensor_(newWithSize1d)(10);
  int i = 0;
  TH_TENSOR_APPLY(real, tensor1, *tensor1_data = (real)i; ++i;);
  for (int i = 0; i != 10; ++i) {
    printf("%g ", THTensor_(get1d)(tensor1, i));
  }
  printf("\n");
  THTensor_(free)(tensor1);
  return 0;
}
TH_TENSOR_APPLY is a macro declared in TH/THTensorApply.h. Its code is complex, but well worth reading. The rest of this article explains how this macro works.

First, let's look at the internal structure of a tensor:
struct THTensor
{
    long *size;           /* size of each dimension */
    long *stride;         /* element stride of each dimension */
    int nDimension;

    THStorage *storage;   /* the underlying flat buffer */
    long storageOffset;   /* where this view starts inside the storage */
    int refcount;
    char flag;
};
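The size and stride arrays each have nDimension entries. They can be inspected through the public accessors instead of touching the struct directly; a minimal sketch, using the same Real/real defines as before:
#include "TH/TH.h"
#include <stdio.h>

#define Real Double
#define real double

int main() {
 THTensor* t = THTensor_(newWithSize3d)(2, 3, 5);
 for (int d = 0; d != THTensor_(nDimension)(t); ++d) {
  /* for a fresh 2x3x5 tensor: strides 15, 5, 1 */
  printf("dim %d: size=%ld stride=%ld\n",
         d, THTensor_(size)(t, d), THTensor_(stride)(t, d));
 }
 THTensor_(free)(t);
 return 0;
}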
The actual contents of a tensor live in its THStorage. By analogy with a database: if the THStorage is the physical table, then the tensor is a view of it. The code below obtains the storage pointer from the tensor, gets the raw data pointer from the storage via the THStorage_(data) function, and then iterates over the tensor:
#include "TH/TH.h"

#define Real Double
#define real double

int main() {
 THTensor* tensor1 = THTensor_(newWithSize1d)(10);
 THStorage* s = THTensor_(storage)(tensor1);
 double* data = THStorage_(data)(s);
 int64_t size = THStorage_(size)(s);
 for (int64_t i = 0; i != size; ++i) {
  data[i] = (double)i;
 }
 THTensor_(free)(tensor1);
 return 0;
}
So when implementing operations such as vector addition, subtraction, multiplication, division, or dot products, efficiency dictates the latter approach rather than set1d/get1d. Which raises the question: given an index, how do we compute the offset?
Take a 4-dimensional tensor as an example: the offset of tensor[a][b][c][d] is
a*stride[0] + b*stride[1] + c*stride[2] + d*stride[3]
as the following code shows:
#include "TH/TH.h"
#include <assert.h>

#define Real Double
#define real double

int main() {
 THTensor* ts = THTensor_(newWithSize4d)(8, 4, 6, 7);
 long stride1 = THTensor_(stride)(ts, 0);
 long stride2 = THTensor_(stride)(ts, 1);
 long stride3 = THTensor_(stride)(ts, 2);
 long stride4 = THTensor_(stride)(ts, 3);
 printf("strides: %ld %ld %ld %ld\n", stride1, stride2, stride3, stride4);
 double* data = THTensor_(data)(ts);
 int a = 2;
 int b = 3;
 int c = 5;
 int d = 2;
 THTensor_(set4d)(ts, a, b, c, d, 12);
 assert(data[a*stride1 + b*stride2 + c *stride3 + d*stride4] == 12);
 THTensor_(free)(ts);
 return 0;
}
Output:
strides: 168 42 7 1
Note that THTensor_(data) does not return the data pointer of the storage in struct THTensor directly; it returns data + storageOffset:

real *THTensor_(data)(const THTensor *self)
{
  if(self->storage)
    return (self->storage->data+self->storageOffset);
  else
    return NULL;
}

Most of the time we don't need to worry about this detail.
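storageOffset becomes non-zero as soon as a tensor is a view into the middle of a storage, for example a row selected out of a matrix. A minimal sketch (THTensor_(newSelect) creates such a view; same defines as before):
#include "TH/TH.h"
#include <stdio.h>

#define Real Double
#define real double

int main() {
 THTensor* mat = THTensor_(newWithSize2d)(3, 4);
 THTensor* row = THTensor_(newSelect)(mat, 0, 2); /* row = mat[2], shares mat's storage */
 /* the view starts 2*stride[0] = 8 elements into the storage */
 printf("offset: %ld\n", (long)THTensor_(storageOffset)(row));
 THTensor_(free)(row);
 THTensor_(free)(mat);
 return 0;
}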

For a freshly created tensor, Torch guarantees that the last dimension is contiguous, so the last element of the strides array is always 1. The strides satisfy:
stride[d] = isLastDimension(d) ? 1 : size[d+1] * stride[d+1];
For the 8x4x6x7 tensor above this gives stride[3] = 1, stride[2] = 7*1 = 7, stride[1] = 6*7 = 42, and stride[0] = 4*42 = 168, matching the "strides: 168 42 7 1" output.
但情况并非总是如此。比如一个2维的tensor经过转置之后,它依然和之前的tensor共享同一个storage。比如下面这段代码:
#include "TH/TH.h"
#include <assert.h>
#include <stdexcept>

#define Real Double
#define real double

static void printMatrix(THTensor* tensor) {
 int nDim = THTensor_(nDimension)(tensor);
 if (nDim != 2)
  throw std::runtime_error("dimension error");
 long size0 = THTensor_(size)(tensor, 0);
 long size1 = THTensor_(size)(tensor, 1);
 for (int i = 0; i != size0; ++i) {
  for (int j = 0; j != size1; ++j) {
   printf("%g ", THTensor_(get2d)(tensor, i, j));
  }
  printf("\n");
 }
}

int main() {
 THTensor* tensor1 = THTensor_(newWithSize2d)(3,4);
 THStorage* s = THTensor_(storage)(tensor1);
 long stride1 = THTensor_(stride)(tensor1, 0);
 long stride2 = THTensor_(stride)(tensor1, 1);
 printf("Before transpose:\n");
 printf("strides: %ld %ld\n", stride1, stride2);
 double* data = THStorage_(data)(s);
 int64_t size = THStorage_(size)(s);
 for (int64_t i = 0; i != size; ++i) {
  data[i] = (double)i;
 }
 printf("content:\n");
 printMatrix(tensor1); 
 printf("After transpose:\n");
 THTensor* tensor2 = THTensor_(newTranspose)(tensor1, 0, 1); 
 stride1 = THTensor_(stride)(tensor2, 0);
 stride2 = THTensor_(stride)(tensor2, 1); 
 printf("strides: %ld %ld\n", stride1, stride2);
 printf("content:\n");
 printMatrix(tensor2);
 THTensor_(free)(tensor2);
 THTensor_(free)(tensor1);
 return 0;
}
Output:
Before transpose:
strides: 4 1
content:
0 1 2 3
4 5 6 7
8 9 10 11
After transpose:
strides: 1 4
content:
0 4 8
1 5 9
2 6 10
3 7 11
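Since both tensors share one storage, a write through either view is visible through the other. A minimal check (same defines as above):
#include "TH/TH.h"
#include <stdio.h>

#define Real Double
#define real double

int main() {
 THTensor* a = THTensor_(newWithSize2d)(3, 4);
 THTensor* b = THTensor_(newTranspose)(a, 0, 1); /* shares a's storage */
 THTensor_(set2d)(b, 3, 0, 42.0);                /* b[3][0] aliases a[0][3] */
 printf("%g\n", THTensor_(get2d)(a, 0, 3));      /* prints 42 */
 THTensor_(free)(b);
 THTensor_(free)(a);
 return 0;
}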


Now, if we want to iterate over this transposed matrix, things get considerably more complicated. For an n-dimensional array, one way to iterate is to allocate a counter that holds the index of the current element:
size_t counter[dim]; // index of the current element
for (int i = 0; i != dim; ++i) counter[i] = 0;

Then, after visiting each element, increment the counter with the following code (note the loop starts at dim - 1, the innermost dimension):
bool isFinished = false;
for (int i = dim - 1; i >= 0; i--) {
  counter[i]++;
  if (counter[i] != size[i]) break;
  if (i == 0) {
    isFinished = true;
    break;
  }
  counter[i] = 0;
}
When isFinished becomes true, terminate all loops.
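Putting the two pieces together, here is a self-contained sketch (plain C, no TH dependency) that walks the transposed 4x3 view from above in logical order, recomputing the offset from the counter at every step; the sizes and strides are hard-coded from the example:
#include <stdio.h>
#include <stdbool.h>

int main() {
 double data[12];
 for (int i = 0; i != 12; ++i) data[i] = (double)i;
 long size[2]   = {4, 3}; /* the transposed view: 4 rows, 3 columns */
 long stride[2] = {1, 4};
 int dim = 2;
 size_t counter[2] = {0, 0};
 bool isFinished = false;
 while (!isFinished) {
  long offset = 0; /* offset = sum over i of counter[i]*stride[i] */
  for (int i = 0; i != dim; ++i) offset += counter[i] * stride[i];
  printf("%g ", data[offset]);
  for (int i = dim - 1; i >= 0; i--) { /* increment the counter */
   counter[i]++;
   if (counter[i] != (size_t)size[i]) break;
   if (i == 0) { isFinished = true; break; }
   counter[i] = 0;
  }
 }
 printf("\n"); /* prints: 0 4 8 1 5 9 2 6 10 3 7 11 */
 return 0;
}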
This brings back the earlier problem: computing the offset for every index with a series of multiplications and additions is inefficient. Instead, we can keep a pointer real* tensor1_data aimed at the current element, and whenever the counter array is updated, adjust tensor1_data along with it:
    for(i = tensor1_dim; i >= 0; i--) 
    { 
      tensor1_counter[i]++; 
      tensor1_data += tensor1->stride[i]; 

      if(tensor1_counter[i]  == tensor1->size[i]) 
      { 
        if(i == 0) 
        { 
          hasFinished = 1; 
          break; 
        } 
        else 
        { 
          tensor1_data -= tensor1_counter[i]*tensor1->stride[i]; 
          tensor1_counter[i] = 0; 
        } 
      } 
      else 
        break; 
    }
On top of this there is one more optimization: determine how many of the tensor's lowest dimensions are contiguous, i.e. satisfy stride[d] = isLastDimension(d) ? 1 : size[d+1] * stride[d+1];
Those dimensions can then be traversed in one flat for loop.
The complete code follows (how to loop over an n-dim array):
  real *tensor1_data = NULL; 
  long *tensor1_counter = NULL; 
  long tensor1_stride = 0, tensor1_size = 0, tensor1_dim = 0, i; 
  int hasFinished = 0; 

  if(tensor1->nDimension == 0) 
    hasFinished = 1; 
  else 
  { 
    tensor1_data = tensor1->storage->data+tensor1->storageOffset; 

    /* what is the first stride (ignore first dims=1)? */ 
    /* it will be used for the whole largest contiguous section */ 
    for(tensor1_dim = tensor1->nDimension-1; tensor1_dim >= 0; tensor1_dim--) 
    { 
      if(tensor1->size[tensor1_dim] != 1) 
        break; 
    } 
    tensor1_stride = (tensor1_dim == -1 ? 0 : tensor1->stride[tensor1_dim]); 

    /* what is the largest contiguous section? */ 
    tensor1_size = 1; 
    for(tensor1_dim = tensor1->nDimension-1; tensor1_dim >= 0; tensor1_dim--) 
    { 
      if(tensor1->size[tensor1_dim] != 1) 
      { 
        if(tensor1->stride[tensor1_dim] == tensor1_size) 
          tensor1_size *= tensor1->size[tensor1_dim]; 
        else 
          break; 
      } 
    } 

    /* counter over found dimensions */ 
    tensor1_counter = (long*)THAlloc(sizeof(long)*(tensor1_dim+1)); 
    for(i = 0; i <= tensor1_dim; i++) 
      tensor1_counter[i] = 0; 
  } 

  while(!hasFinished) 
  { 
    for(i = 0; i < tensor1_size; i++, tensor1_data += tensor1_stride) 
    { 
      //PUT USER CODE HERE 
    } 

    if(tensor1_dim == -1) 
       break; 
 
    tensor1_data -= i*tensor1_stride; 
    for(i = tensor1_dim; i >= 0; i--) 
    { 
      tensor1_counter[i]++; 
      tensor1_data += tensor1->stride[i]; 

      if(tensor1_counter[i]  == tensor1->size[i]) 
      { 
        if(i == 0) 
        { 
          hasFinished = 1; 
          break; 
        } 
        else 
        { 
          tensor1_data -= tensor1_counter[i]*tensor1->stride[i]; 
          tensor1_counter[i] = 0; 
        } 
      } 
      else 
        break; 
    } 
  } 
  THFree(tensor1_counter); 
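This skeleton is essentially what TH_TENSOR_APPLY expands to, with the user code dropped into the inner for loop. As a sanity check, running TH_TENSOR_APPLY on the transposed view should visit elements in exactly the order the hand-written counter loop produced (a sketch, same defines as earlier):
#include "TH/TH.h"
#include <stdio.h>

#define Real Double
#define real double

int main() {
 THTensor* t = THTensor_(newWithSize2d)(3, 4);
 double* data = THStorage_(data)(THTensor_(storage)(t));
 for (int64_t i = 0; i != 12; ++i) data[i] = (double)i;
 THTensor* tt = THTensor_(newTranspose)(t, 0, 1);
 /* the data variable inside the macro is named <tensor>_data */
 TH_TENSOR_APPLY(real, tt, printf("%g ", *tt_data););
 printf("\n"); /* expected: 0 4 8 1 5 9 2 6 10 3 7 11 */
 THTensor_(free)(tt);
 THTensor_(free)(t);
 return 0;
}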


