From fa524355fad18e849a9ea0de3039d091fcce13dc Mon Sep 17 00:00:00 2001
From: Raul Puri
Date: Sat, 5 Jan 2019 15:53:14 -0800
Subject: [PATCH] Remove apex install dependencies

---
 apex_utils/apex/__init__.py                   |   3 -
 apex_utils/apex/utils/FusedNorm.py            | 129 --------
 apex_utils/apex/utils/__init__.py             |   1 -
 apex_utils/csrc/Module.cpp                    | 249 --------------
 apex_utils/csrc/kernel.cu                     | 304 ------------------
 apex_utils/include/THCTensorInfo.cuh          | 142 --------
 apex_utils/include/kernel.h                   |  74 -----
 finetune_classifier.py                        |   2 +-
 .../RNN_utils}/RNN/RNNBackend.py              |   0
 .../apex => model/RNN_utils}/RNN/__init__.py  |   0
 .../apex => model/RNN_utils}/RNN/cells.py     |   0
 .../apex => model/RNN_utils}/RNN/models.py    |   0
 model/model.py                                |   2 +-
 model/sentiment_classifier.py                 |   2 +-
 pretrain.py                                   |   2 +-
 .../__init__.py                               |   0
 .../reparameterization.py                     |   0
 .../weight_norm.py                            |   0
 run_classifier.py                             |   2 +-
 setup.py                                      |  25 +-
 transfer.py                                   |   2 +-
 21 files changed, 9 insertions(+), 930 deletions(-)
 delete mode 100644 apex_utils/apex/__init__.py
 delete mode 100644 apex_utils/apex/utils/FusedNorm.py
 delete mode 100644 apex_utils/apex/utils/__init__.py
 delete mode 100644 apex_utils/csrc/Module.cpp
 delete mode 100644 apex_utils/csrc/kernel.cu
 delete mode 100644 apex_utils/include/THCTensorInfo.cuh
 delete mode 100644 apex_utils/include/kernel.h
 rename {apex_utils/apex => model/RNN_utils}/RNN/RNNBackend.py (100%)
 rename {apex_utils/apex => model/RNN_utils}/RNN/__init__.py (100%)
 rename {apex_utils/apex => model/RNN_utils}/RNN/cells.py (100%)
 rename {apex_utils/apex => model/RNN_utils}/RNN/models.py (100%)
 rename {apex_utils/apex/reparameterization => reparameterization}/__init__.py (100%)
 rename {apex_utils/apex/reparameterization => reparameterization}/reparameterization.py (100%)
 rename {apex_utils/apex/reparameterization => reparameterization}/weight_norm.py (100%)

diff --git a/apex_utils/apex/__init__.py b/apex_utils/apex/__init__.py
deleted file mode 100644
index dff8651..0000000
--- a/apex_utils/apex/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from . import RNN
-from . import reparameterization
-#from . import utils
diff --git a/apex_utils/apex/utils/FusedNorm.py b/apex_utils/apex/utils/FusedNorm.py
deleted file mode 100644
index e2c65d5..0000000
--- a/apex_utils/apex/utils/FusedNorm.py
+++ /dev/null
@@ -1,129 +0,0 @@
-import torch
-from torch.autograd import Variable
-from torch.autograd.function import Function, once_differentiable
-import apex._C
-
-class FusedNorm(Function):
-    """
-    Normalizes (using an L2 norm) a Variable across the slowest dimension.
-    Assumes the input Variable is contiguous.
-    Internally, all calculations are performed in fp32, regardless
-    of input/output precision.
-    """
-
-    @staticmethod
-    def forward(ctx, input):
-        """
-        input is assumed to be contiguous.
-        input may be either float or half precision.
-        The precision of output will match the precision of input.
-        A float copy of the L2 norm across each slowest-dim row
-        is also created and saved for the backward pass.
-        """
-        # torch.cuda.nvtx.range_push("FusedNorm.forward, input.size() = {}"
-        #                            .format(input.size()))
-
-        if not input.is_contiguous():
-            raise RuntimeError("In FusedNorm.forward(): "
-                               "input with size {} is not contiguous"
-                               .format(input.size()))
-        if not input.is_cuda:
-            raise RuntimeError("In FusedNorm.forward(): input.is_cuda = False. "
-                               "Currently, only cuda tensors are supported.")
-
-        """
-        This is ok, new() treats a torch.Size object properly.
-        No need to unpack with an asterisk via new(*input.size()).
- """ - output = input.new(input.size()).contiguous() - - """ - For output with size (slow, faster, faster, ...fastest), we may want - norms with size (slow, 1, 1, ...1), so that if you want retrieve norms - and apply the same normalizing factors to another Tensor "t" with the - same size as output, "t/norms" will broadcast each element of norms - across the corresponding slowest dim of t. - """ - norm_size = (output.size(0),) + (1,)*(output.dim()-1) - norms = torch.cuda.FloatTensor(*norm_size).contiguous() - """ - Beware: If you call the following: - norms = torch.cuda.FloatTensor(norm_size).contiguous() - the constructor sees a tuple: - FloatTensor( (output_size(0),1,1,...) ) - and creates a 1D tensor with values from the tuple: - [output_size(0),1,1,...]. - """ - - # torch.cuda.synchronize() - - # print("norms = ", norms) - # print("norms.size () = ", norms.size()) - # print("norms.stride() = ", norms.stride()) - - # print("type(input) = ", type(input)) - # print("type(output) = ", type(output)) - # print("type(norms) = ", type(norms)) - # print( "input.data_ptr = {:x}".format(input.data_ptr())) - - apex._C.norm_fwd(input, output, norms) - # apex._C.norm_fwd(input.data, output.data, norms) - - # torch.cuda.synchronize() - - # print("norms in forward(): ", norms) - - ctx.save_for_backward(input) - - # save_for_backward can only save input or output tensors, - # so here's a hacky workaround to save the norms: - ctx.norms = norms - - # torch.cuda.nvtx.range_pop() - - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - """ - grad_output is assumed to be contiguous. - grad_output may be either float or half precision. - The precision of grad_input will match the precision of grad_output. - """ - # torch.cuda.nvtx.range_push("FusedNorm.backward, grad_output.size() = {}" - # .format(grad_output.size())) - - if not grad_output.is_cuda: - raise RuntimeError("In FusedNorm.backward(): grad_output.is_cuda = False." 
- "Currently, only cuda tensors are supported.") - - savedInput, = ctx.saved_tensors - norms = ctx.norms - - # better safe than sorry - grad_output_contig = grad_output.contiguous() - grad_input = grad_output_contig.new(grad_output.size()).contiguous() - - apex._C.norm_bwd(grad_output_contig, grad_input, savedInput, norms) - # apex._C.norm_bwd(grad_output_contig.data, grad_input.data, savedInput.data, norms) - - # torch.cuda.nvtx.range_pop() - - # print("\n\n") - # print("grad_output.is_contiguous() = {:x}".format(grad_output.is_contiguous())) - # print(" grad_input.is_contiguous() = {:x}".format( grad_input.is_contiguous())) - # print(" savedInput.is_contiguous() = {:x}".format( savedInput.is_contiguous())) - # print(" norms.is_contiguous() = {:x}".format( norms.is_contiguous())) - # print("\n\n") - # print("grad_output.data_ptr = {:x}".format(grad_output.data_ptr())) - # print(" grad_input.data_ptr = {:x}".format( grad_input.data_ptr())) - # print(" savedInput.data_ptr = {:x}".format( savedInput.data_ptr())) - # print(" norms.data_ptr = {:x}".format( norms.data_ptr())) - # print("\n\n") - # print("grad_output in backward(): ", grad_output) - # print(" grad_input in backward(): ", grad_input) - # print(" savedInput in backward(): ", savedInput) - # print(" norms in backward(): ", norms) - - return grad_input diff --git a/apex_utils/apex/utils/__init__.py b/apex_utils/apex/utils/__init__.py deleted file mode 100644 index 423daac..0000000 --- a/apex_utils/apex/utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .FusedNorm import FusedNorm diff --git a/apex_utils/csrc/Module.cpp b/apex_utils/csrc/Module.cpp deleted file mode 100644 index 27ae9d1..0000000 --- a/apex_utils/csrc/Module.cpp +++ /dev/null @@ -1,249 +0,0 @@ -#define PY_SSIZE_T_CLEAN -#define ARG_OFFSET 5 - -#include - -#include -#include - -#include -#include -#include -#include -#include - -// #define USE_NVTX -#ifdef USE_NVTX -#include "nvToolsExt.h" -#endif - -//Meta-data format we will use -#include - -//Cuda kernels -#include - -#define ERROR_MSG cout << "Error at " << __FILE__ << ":" << __LINE__ << "\n"; - -using namespace std; -TensorInfo PyOb_2_tinfo(PyObject* tensor, float_types data_type) -{ - PyObject* PyStrides = PyObject_CallMethod(tensor, "stride", NULL); - if(PyStrides == NULL) - { - ERROR_MSG; - cout << "PyStrides = NULL" << endl; - } - - PyObject* PySizes = PyObject_CallMethod(tensor, "size", NULL); - if(PySizes == NULL) - { - ERROR_MSG; - cout << "PySizes = NULL" << endl; - } - - PyObject* PyDataPtr = PyObject_CallMethod(tensor, "data_ptr", NULL); - if(PyDataPtr == NULL) - { - ERROR_MSG; - cout << "PyDataPtr = NULL" << endl; - } - - void* data_ptr = (void*) PyLong_AsLong(PyDataPtr); - Py_ssize_t ndims = PyList_GET_SIZE(PySizes); - //TODO put proper checking on ndims < MAX_CUTORCH_DIMS - IDXTYPE strides[MAX_CUTORCH_DIMS], sizes[MAX_CUTORCH_DIMS]; - - for(int i = 0; i < ndims; i++) - { - strides[i] = PyLong_AsLong(PyTuple_GetItem(PyStrides, i)); - sizes[i] = PyLong_AsLong(PyTuple_GetItem(PySizes, i)); - } - - // Reference counts still behave strangely, but at least these appear to cap - // the process' memory usage. 
-  Py_DECREF(PyStrides);
-  Py_DECREF(PySizes);
-  Py_DECREF(PyDataPtr);
-
-  return TensorInfo<void*, IDXTYPE>(data_ptr, ndims, sizes, strides, data_type);
-}
-
-vector<TensorInfo<void*, IDXTYPE> > get_TInfos(PyObject* args)
-{
-  vector<TensorInfo<void*, IDXTYPE> > info_vec;
-#ifdef DEBUG_ANY
-  cout << "Processing " << PyTuple_GET_SIZE(args) << " arguments" << endl;
-#endif
-
-#ifdef CHECK_MEMLEAK
-  for(int iter = 0; iter < 1e7; iter++ )
-#endif
-  for(Py_ssize_t i=0; i < PyTuple_GET_SIZE(args); i++)
-  {
-    PyObject* pyTensor = PyTuple_GetItem(args, i);
-    string objType(pyTensor->ob_type->tp_name);
-
-    PyObject* pyObjTypeCall = PyObject_CallMethod(pyTensor, "type", NULL);
-    if(pyObjTypeCall == NULL)
-    {
-      ERROR_MSG;
-      cout << "For args item " << i << ", pyObjTypeCall = NULL" << endl;
-    }
-
-    // This gives a segfault:
-    // cout << "pyObjTypeCall direct conversion attempt = " <<
-    //         PyBytes_AsString(pyObjTypeCall) << endl;
-
-    PyObject* pyObjASCII = PyUnicode_AsASCIIString(pyObjTypeCall);
-    if(pyObjASCII == NULL)
-    {
-      ERROR_MSG;
-      cout << "For args item " << i << ", pyObjASCII = NULL " << endl;
-    }
-
-    // cout << "Py_REFCNT(pyObjTypeCall) = " << Py_REFCNT(pyObjTypeCall) << endl;
-    Py_DECREF(pyObjTypeCall);
-
-    string objTypeCall(PyBytes_AsString(pyObjASCII));
-
-    // cout << "Py_REFCNT(pyObjASCII) = " << Py_REFCNT(pyObjASCII) << endl;
-    Py_DECREF(pyObjASCII);
-
-#ifdef DEBUG_ANY
-    cout << "arg " << i << endl;
-    cout << "objType = " << objType << endl;
-    cout << "objTypeCall = " << objTypeCall << endl;
-#endif
-
-    if(objTypeCall == "torch.cuda.FloatTensor")
-#ifdef CHECK_MEMLEAK
-      if(iter == 0 )
-#endif
-        info_vec.push_back(PyOb_2_tinfo(pyTensor, FLOAT));
-#ifdef CHECK_MEMLEAK
-      else
-        info_vec[i] = PyOb_2_tinfo(pyTensor, FLOAT);
-#endif
-    else if(objTypeCall == "torch.cuda.HalfTensor")
-      info_vec.push_back(PyOb_2_tinfo(pyTensor, HALF));
-    // Could add double
-    else
-    {
-      ERROR_MSG;
-      cout << "For args item " << i << ", unsupported .type() found: "
-           << objTypeCall << "\n"
-              "Supported types:\n"
-              "torch.cuda.FloatTensor\n"
-              "torch.cuda.HalfTensor\n"
-              "torch.autograd.variable.Variable containing FloatTensor\n"
-              "torch.autograd.variable.Variable containing HalfTensor\n"
-              "torch.nn.parameter.Parameter containing FloatTensor\n"
-              "torch.nn.parameter.Parameter containing HalfTensor\n"
-           << endl;
-    }
-  }
-
-  // PyErr_SetString(PyExc_RuntimeError, "Exception set in ");
-
-  return info_vec;
-}
-
-//Will extract all tensors in order. Assumes flat structure; tensors cannot be wrapped in lists,
-//tuples or any other iterator structure.
-static PyObject* norm_fwd(PyObject* self, PyObject* args)
-{
-#ifdef USE_NVTX
-nvtxRangePushA("norm_fwd C backend");
-#endif
-
-  vector<TensorInfo<void*, IDXTYPE> > tensors = get_TInfos(args);
-
-#ifdef DEBUG_ANY
-  cout << "tensors.size() = " << tensors.size() << endl;
-#endif
-
-  IDXTYPE totalElems = 1;
-  for(int i = 0; i < tensors[0].dims; i++ )
-    totalElems *= tensors[0].sizes[i];
-  send_to_fwd
-  (
-    tensors[0],
-    tensors[1],
-    tensors[2],
-    totalElems
-  );
-
-#ifdef USE_NVTX
-nvtxRangePop();
-#endif
-
-  Py_RETURN_NONE;
-}
-
-static PyObject* norm_bwd(PyObject* self, PyObject* args)
-{
-#ifdef USE_NVTX
-nvtxRangePushA("norm_bwd C backend");
-#endif
-
-  vector<TensorInfo<void*, IDXTYPE> > tensors = get_TInfos(args);
-
-#ifdef DEBUG_ANY
-  cout << "tensors.size() = " << tensors.size() << endl;
-#endif
-
-  IDXTYPE totalElems = 1;
-  for(int i = 0; i < tensors[0].dims; i++ )
-    totalElems *= tensors[0].sizes[i];
-  send_to_bwd
-  (
-    tensors[0],
-    tensors[1],
-    tensors[2],
-    tensors[3],
-    totalElems
-  );
-
-#ifdef USE_NVTX
-nvtxRangePop();
-#endif
-
-  Py_RETURN_NONE;
-}
-
-
-
-//*******************PYTHON BOILER PLATE*******************
-static PyMethodDef apex_methods[] = {
-  {"norm_fwd", (PyCFunction) norm_fwd, METH_VARARGS, "Slowest-dim norm, forward pass."},
-  {"norm_bwd", (PyCFunction) norm_bwd, METH_VARARGS, "Slowest-dim norm, backward pass."},
-  {NULL, NULL, 0, NULL}
-};
-
-#if PY_MAJOR_VERSION >= 3
-
-//Module Definitions
-static struct PyModuleDef apex = {
-  PyModuleDef_HEAD_INIT, "apex._C", "Module to add CUDA extensions to Pytorch.", -1, apex_methods
-};
-//Initialization Function
-PyMODINIT_FUNC PyInit__C(void){
-
-  //Let's throw an error if we can't find pytorch.
-  PyImport_ImportModule("torch");
-  Py_Initialize();
-  return PyModule_Create(&apex);
-}
-#else
-PyMODINIT_FUNC initMODULE(void){
-  //Let's throw an error if we can't find pytorch.
-  PyImport_ImportModule("torch");
-  (void) Py_InitModule3("apex._C", apex, "A PyTorch Extension.");
-}
-
-#endif
-//*********************************************************
-
diff --git a/apex_utils/csrc/kernel.cu b/apex_utils/csrc/kernel.cu
deleted file mode 100644
index c755fde..0000000
--- a/apex_utils/csrc/kernel.cu
+++ /dev/null
@@ -1,304 +0,0 @@
-#include "../include/kernel.h"
-
-template <typename T> struct TtoInt { static const int test = -1; };
-template<> struct TtoInt<float>  { static const int test = 0; };
-template<> struct TtoInt<half>   { static const int test = 0; };
-template<> struct TtoInt<double> { static const int test = 0; };
-
-#if __CUDACC_VER_MAJOR__ >= 9
-#define __SHFL_DOWN(var, delta)  __shfl_down_sync(0xffffffff, var, delta)
-#else
-#define __SHFL_DOWN(var, delta)  __shfl_down(var, delta)
-#endif
-
-#if __CUDACC_VER_MAJOR__ >= 9
-#define __SYNCWARP __syncwarp()
-#else
-#define __SYNCWARP
-#endif
-
-#define BLOCK 256
-
-using namespace std;
-
-template <typename T>
-__device__ __forceinline__ T block_reduce(T *x, T val)
-{
-  int tidx = threadIdx.x;
-  if(blockDim.x >= 64)
-  {
-    x[tidx] = val;
-    __syncthreads();
-  }
-
-  #pragma unroll
-  for(int i = (blockDim.x >> 1); i >= 64; i >>= 1)
-  {
-    if( tidx < i )
-      x[tidx] += x[tidx+i]; // JoinOp
-    __syncthreads();
-  }
-
-  if(tidx < 32)
-  {
-    T final;
-    if(blockDim.x >= 64)
-      final = x[tidx] + x[tidx+32];
-    else
-      final = val;
-    // __SYNCWARP();
-
-    #pragma unroll
-    for( int i = 16; i > 0; i >>= 1)
-      final += __SHFL_DOWN(final, i);
-
-    if(tidx == 0)
-      x[0] = final;
-  }
-
-  __syncthreads();
-  return x[0];
-}
-
-template <typename T, typename IndexType>
-__global__ void norm_fwd_kernel
-(
-  TensorInfo<T, IndexType> input,
-  TensorInfo<T, IndexType> output,
-  TensorInfo<float, IndexType> norms,
-  IndexType totalElems,
-  IndexType rowSize
-)
-{
-  // We are norming each slowest-dim row of the tensor separately.
-  // For now, assign one block to each row.
-  IndexType tid = threadIdx.x;
-  IndexType row = blockIdx.x;
-  IndexType stride = blockDim.x;
-
-  // Logical index offset for this flattened row
-  IndexType rowStart = row*rowSize;
-
-  extern __shared__ float s[];
-
-  float thread_sum = 0.f;
-  for(IndexType i = tid; i < rowSize; i += stride )
-  {
-    float val_f = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(input, i + rowStart));
-    thread_sum += val_f*val_f; // AccumOp, could do Kahan here
-  }
-
-  float result = block_reduce(s, thread_sum);
-
-  // if(tid == 0)
-  //   printf("norm for row %d = %f\n", row, sqrtf(result));
-
-  if(tid == 0)
-    DEVICE_LINEAR_GET_F(norms, row) = sqrtf(result);
-
-  // Write data to output
-  for(IndexType i = tid; i < rowSize; i += stride )
-  {
-    float val_f = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(input, i + rowStart));
-    DEVICE_LINEAR_GET(output, i + rowStart) = ScalarConvert<float, T>::to(val_f*rsqrtf(result));
-  }
-}
-
-template <typename T, typename IndexType>
-__global__ void norm_bwd_kernel
-(
-  TensorInfo<T, IndexType> pLpOutput,
-  TensorInfo<T, IndexType> pLpInput,
-  TensorInfo<T, IndexType> savedInput,
-  TensorInfo<float, IndexType> savedNorms,
-  IndexType totalElems,
-  IndexType rowSize
-)
-{
-  // For now, assign one block to each row.
-  IndexType tid = threadIdx.x;
-  IndexType row = blockIdx.x;
-  IndexType stride = blockDim.x;
-
-  // Logical index offset for this flattened row
-  IndexType rowStart = row*rowSize;
-
-  extern __shared__ float s[];
-
-  float thread_sum = 0.f;
-  for(IndexType i = tid; i < rowSize; i += stride )
-  {
-    float pLpOutputi = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(pLpOutput, i + rowStart));
-    float savedInputi = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedInput, i + rowStart));
-    thread_sum += pLpOutputi*savedInputi; // AccumOp, could do Kahan here
-  }
-
-  float result = block_reduce(s, thread_sum);
-
-  // if(tid == 0)
-  // {
-  //   printf
-  //   (
-  //     "blockDim.x = %ld\n"
-  //     "pLpOutput data pointer = %lx\n"
-  //     "pLpInput data pointer = %lx\n"
-  //     "savedInput data pointer = %lx\n"
-  //     "savedNorms data pointer = %lx\n",
-  //     blockDim.x,
-  //     pLpOutput.data,
-  //     pLpInput.data,
-  //     savedInput.data,
-  //     savedNorms.data
-  //   );
-  //   printf("result for row %d = %f\n", row, result );
-  //   printf("thread_sum for row %d = %f\n", row, thread_sum);
-  // }
-
-  // Could choose to save reciprocal of norm instead I suppose, but norms is probably
-  // more handy to keep around
-  float rnorm = 1.f/DEVICE_LINEAR_GET_F(savedNorms, row);
-  float rnorm3 = rnorm*rnorm*rnorm;
-
-  // Write data to output. We are reusing values that were loaded earlier, so there
-  // is an optimization opportunity here (store values persistently).
-  for(IndexType j = tid; j < rowSize; j += stride )
-  {
-    float pLpOutputj = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(pLpOutput, j + rowStart));
-    float savedInputj = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedInput, j + rowStart));
-    float pLpInputj = rnorm*pLpOutputj - rnorm3*savedInputj*result;
-    DEVICE_LINEAR_GET(pLpInput, j + rowStart) = ScalarConvert<float, T>::to(pLpInputj);
-  }
-}
-
-// template
-template <typename T, typename IndexType>
-void send_to_fwd
-(
-  TensorInfo<T, IndexType> input,
-  TensorInfo<T, IndexType> output,
-  TensorInfo<T, IndexType> norms,
-  IndexType totalElems
-)
-{
-#ifdef DEBUG_ANY
-  cout << "hello from send_to_fwd with input.type = " << input.type << endl;
-#endif
-
-  // Find logical size of each flattened slowest-dim row
-  IndexType rowSize = 1;
-  for(IndexType i = input.dims - 1; i > 0; i--)
-    rowSize *= input.sizes[i];
-
-  switch(input.type)
-  {
-    case FLOAT:
-#ifdef DEBUG_ANY
-      cout << "case FLOAT" << endl;
-#endif
-      norm_fwd_kernel<<<totalElems/rowSize, BLOCK, BLOCK*sizeof(float)>>>
-      (
-        *((TensorInfo<float, IndexType>*)&input),  // Safer: Make a copy constructor that constructs
-        *((TensorInfo<float, IndexType>*)&output), // the typed version from a void, instead of a cast.
-        *((TensorInfo<float, IndexType>*)&norms),
-        totalElems,
-        rowSize
-      );
-      break;
-    case HALF:
-#ifdef DEBUG_ANY
-      cout << "case HALF" << endl;
-#endif
-      norm_fwd_kernel<<<totalElems/rowSize, BLOCK, BLOCK*sizeof(float)>>>
-      (
-        *((TensorInfo<half, IndexType>*)&input),
-        *((TensorInfo<half, IndexType>*)&output),
-        *((TensorInfo<float, IndexType>*)&norms),
-        totalElems,
-        rowSize
-      );
-      break;
-    default:
-      std::cout << "Unsupported input.type in send_to_fwd()" << std::endl;
-      cudaDeviceSynchronize();
-      exit(-1);
-  }
-#ifdef DEBUG_PROFILE
-  cudaDeviceSynchronize();
-#endif
-}
-
-// template
-template <typename T, typename IndexType>
-void send_to_bwd
-(
-  TensorInfo<T, IndexType> pLpOutput,
-  TensorInfo<T, IndexType> pLpInput,
-  TensorInfo<T, IndexType> savedInput,
-  TensorInfo<T, IndexType> savedNorms,
-  IndexType totalElems
-)
-{
-#ifdef DEBUG_ANY
-  cout << "Hello from send_to_bwd with pLpOutput.type = " << pLpOutput.type << endl;
-#endif
-
-  // Find logical size of each flattened slowest-dim row
-  IndexType rowSize = 1;
-  for(IndexType i = savedInput.dims - 1; i > 0; i--)
-    rowSize *= savedInput.sizes[i];
-
-  switch(pLpOutput.type)
-  {
-    case FLOAT:
-#ifdef DEBUG_ANY
-      cout << "case FLOAT" << endl;
-#endif
-      norm_bwd_kernel<<<totalElems/rowSize, BLOCK, BLOCK*sizeof(float)>>>
-      (
-        *((TensorInfo<float, IndexType>*)&pLpOutput),
-        *((TensorInfo<float, IndexType>*)&pLpInput),
-        *((TensorInfo<float, IndexType>*)&savedInput),
-        *((TensorInfo<float, IndexType>*)&savedNorms),
-        totalElems,
-        rowSize
-      );
-      break;
-    case HALF:
-#ifdef DEBUG_ANY
-      cout << "case HALF" << endl;
-#endif
-      norm_bwd_kernel<<<totalElems/rowSize, BLOCK, BLOCK*sizeof(float)>>>
-      (
-        *((TensorInfo<half, IndexType>*)&pLpOutput),
-        *((TensorInfo<half, IndexType>*)&pLpInput),
-        *((TensorInfo<half, IndexType>*)&savedInput),
-        *((TensorInfo<float, IndexType>*)&savedNorms),
-        totalElems,
-        rowSize
-      );
-      break;
-    default:
-      cout << "Unsupported pLpOutput.type in send_to_bwd()" << std::endl;
-      cudaDeviceSynchronize();
-      exit(-1);
-  }
-#ifdef DEBUG_PROFILE
-  cudaDeviceSynchronize();
-#endif
-}
-
-template void send_to_fwd<void*, IDXTYPE>
-(
-  TensorInfo<void*, IDXTYPE>,
-  TensorInfo<void*, IDXTYPE>,
-  TensorInfo<void*, IDXTYPE>,
-  IDXTYPE
-);
-template void send_to_bwd<void*, IDXTYPE>
-(
-  TensorInfo<void*, IDXTYPE>,
-  TensorInfo<void*, IDXTYPE>,
-  TensorInfo<void*, IDXTYPE>,
-  TensorInfo<void*, IDXTYPE>,
-  IDXTYPE
-);
diff --git a/apex_utils/include/THCTensorInfo.cuh b/apex_utils/include/THCTensorInfo.cuh
deleted file mode 100644
index dec6a54..0000000
--- a/apex_utils/include/THCTensorInfo.cuh
+++ /dev/null
@@ -1,142 +0,0 @@
-#ifndef THC_TENSOR_INFO_INC
-#define THC_TENSOR_INFO_INC
-
-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <assert.h>
-
-// Maximum number of dimensions allowed for cutorch
-#define MAX_CUTORCH_DIMS 10
-
-// Warning string for tensor arguments that are too large or have too
-// many dimensions
-#define CUTORCH_STR(X) #X
-#define CUTORCH_DIM_WARNING "tensor too large or too many (>" \
-  CUTORCH_STR(MAX_CUTORCH_DIMS) ") dimensions"
-
-enum float_types { FLOAT = 0 , HALF = 1, DOUBLE = 2 };
-
-// CUDA kernel argument that defines tensor layout
-template <typename T, typename IndexType>
-struct TensorInfo {
-
-  TensorInfo(T* p,
-             int dim,
-             IndexType sz[MAX_CUTORCH_DIMS],
-             IndexType st[MAX_CUTORCH_DIMS]);
-
-  TensorInfo(T* p,
-             int dim,
-             IndexType sz[MAX_CUTORCH_DIMS],
-             IndexType st[MAX_CUTORCH_DIMS],
-             float_types type);
-
-  //Good way to cast from another format
-  //template <typename U, typename V>
-  //TensorInfo(TensorInfo<U, V> &tinfo_in){
-  //  data = reinterpret_cast<T*>(tinfo_in.data);
-  //}
-
-  T* data;
-  IndexType sizes[MAX_CUTORCH_DIMS];
-  IndexType strides[MAX_CUTORCH_DIMS];
-  int dims;
-  float_types type;
-};
-
-//Expand our combinations as convenient typedefs
-typedef TensorInfo<half, int> t_hi;
-typedef TensorInfo<half, long> t_hl;
-typedef TensorInfo<float, int> t_fi;
-typedef TensorInfo<float, long> t_fl;
-
-
-template <typename T, typename IndexType>
-TensorInfo<T, IndexType>::TensorInfo(T* p,
-                                     int dim,
-                                     IndexType sz[MAX_CUTORCH_DIMS],
-                                     IndexType st[MAX_CUTORCH_DIMS]) {
-  data = p;
-  dims = dim;
-  assert(dims > 0 && dims < MAX_CUTORCH_DIMS);
-
-  for (int i = 0; i < dim; ++i) {
-    sizes[i] = sz[i];
-    strides[i] = st[i];
-  }
-}
-
-template <typename T, typename IndexType>
-TensorInfo<T, IndexType>::TensorInfo(T* p,
-                                     int dim,
-                                     IndexType sz[MAX_CUTORCH_DIMS],
-                                     IndexType st[MAX_CUTORCH_DIMS],
-                                     float_types _type){
-  data = p;
-  dims = dim;
-  assert(dims > 0 && dims < MAX_CUTORCH_DIMS);
-
-  for (int i = 0; i < dim; ++i) {
-    sizes[i] = sz[i];
-    strides[i] = st[i];
-  }
-  type=_type;
-}
-
-
-
-// Translate a linear index for the apply to a T* offset;
-// specialized on `Dims` to reduce nvcc compilation time
-template <typename T, typename IndexType, int Dims>
-struct IndexToOffset {
-  static __forceinline__ __host__ __device__ IndexType get(
-    IndexType linearId,
-    const TensorInfo<T, IndexType>& info) {
-    IndexType offset = 0;
-
-    // Use static dims
-    for (int i = Dims - 1; i > 0; --i) {
-      offset += linearId % info.sizes[i] * info.strides[i];
-      linearId /= info.sizes[i];
-    }
-
-    offset += linearId * info.strides[0];
-    return offset;
-  }
-};
-
-
-
-// For contiguous tensors, the offset = index
-template <typename T, typename IndexType>
-struct IndexToOffset<T, IndexType, -2> {
-  static __forceinline__ __host__ __device__ IndexType
-    get(IndexType linearId, const TensorInfo<T, IndexType>& info) {
-    return linearId;
-  }
-};
-
-template <typename T, typename IndexType>
-struct IndexToOffset<T, IndexType, -1> {
-  static __forceinline__ __host__ __device__ IndexType get(
-    IndexType linearId,
-    const TensorInfo<T, IndexType>& info) {
-
-    IndexType offset = 0;
-
-    // Use dynamic dims
-    for (int i = info.dims - 1; i >= 0; --i) {
-      IndexType curDimIndex = linearId % info.sizes[i];
-      IndexType curDimOffset = curDimIndex * info.strides[i];
-      offset += curDimOffset;
-
-      linearId /= info.sizes[i];
-    }
-
-    return offset;
-  }
-};
-
-#endif // THC_TENSOR_INFO_INC
diff --git a/apex_utils/include/kernel.h b/apex_utils/include/kernel.h
deleted file mode 100644
index 1a2244c..0000000
--- a/apex_utils/include/kernel.h
+++ /dev/null
@@ -1,74 +0,0 @@
-#include "THCTensorInfo.cuh"
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <stdio.h>
-#define Dims -2
-#define DEVICE_LINEAR_GET(D_TENSOR, INDEX) D_TENSOR.data[IndexToOffset<T, IndexType, Dims>::get(INDEX, D_TENSOR)]
-#define DEVICE_LINEAR_GET_F(D_TENSOR, INDEX) D_TENSOR.data[IndexToOffset<float, IndexType, Dims>::get(INDEX, D_TENSOR)]
-
-// template <typename T, typename IndexType>
-// void send_to_kernel(
-//   TensorInfo<T, IndexType> Input_1,
-//   TensorInfo<T, IndexType> Input_2,
-//   IndexType totalElems
-// );
-
-template <typename T, typename IndexType>
-void send_to_fwd
-(
-  TensorInfo<T, IndexType> input,      // Forward-pass input
-  TensorInfo<T, IndexType> output,     // Forward-pass output
-  TensorInfo<T, IndexType> norms,
-  IndexType totalElems
-);
-
-template <typename T, typename IndexType>
-void send_to_bwd
-(
-  TensorInfo<T, IndexType> pLpOutput,  // Incoming backward-pass gradients wrt forward-pass outputs
-  TensorInfo<T, IndexType> pLpInput,   // Result: the gradients with respect to forward-pass inputs
-  TensorInfo<T, IndexType> savedInput,
-  TensorInfo<T, IndexType> norms,
-  IndexType totalElems
-);
-
-template <typename In, typename Out>
-struct ScalarConvert {
-  static __host__ __device__ __forceinline__ Out to(const In v) { return (Out) v; }
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <typename Out>
-struct ScalarConvert<half, Out> {
-  static __host__ __device__ __forceinline__ Out to(const half v) {
-#ifdef __CUDA_ARCH__
-    return (Out) __half2float(v);
-#else
-    return (Out) THC_half2float(v);
-#endif
-  }
-};
-
-template <typename In>
-struct ScalarConvert<In, half> {
-  static __host__ __device__ __forceinline__ half to(const In v) {
-#ifdef __CUDA_ARCH__
-    return __float2half((float) v);
-#else
-    return THC_float2half((float) v);
-#endif
-  }
-};
-
-template <>
-struct ScalarConvert<half, half> {
-  static __host__ __device__ __forceinline__ half to(const half v) {
-    return v;
-  }
-};
-
-#endif
-
-
-typedef int IDXTYPE;
diff --git a/finetune_classifier.py b/finetune_classifier.py
index dea4dfe..86d10c7 100755
--- a/finetune_classifier.py
+++ b/finetune_classifier.py
@@ -16,7 +16,7 @@ from logreg_utils import train_logreg
 from fp16 import FP16_Module, FP16_Optimizer
 
-from apex.reparameterization import apply_weight_norm, remove_weight_norm
+from reparameterization import apply_weight_norm, remove_weight_norm
 
 import model as M
 from tqdm import tqdm
diff --git a/apex_utils/apex/RNN/RNNBackend.py b/model/RNN_utils/RNN/RNNBackend.py
similarity index 100%
rename from apex_utils/apex/RNN/RNNBackend.py
rename to model/RNN_utils/RNN/RNNBackend.py
diff --git a/apex_utils/apex/RNN/__init__.py b/model/RNN_utils/RNN/__init__.py
similarity index 100%
rename from apex_utils/apex/RNN/__init__.py
rename to model/RNN_utils/RNN/__init__.py
diff --git a/apex_utils/apex/RNN/cells.py b/model/RNN_utils/RNN/cells.py
similarity index 100%
rename from apex_utils/apex/RNN/cells.py
rename to model/RNN_utils/RNN/cells.py
diff --git a/apex_utils/apex/RNN/models.py b/model/RNN_utils/RNN/models.py
similarity index 100%
rename from apex_utils/apex/RNN/models.py
rename to model/RNN_utils/RNN/models.py
diff --git a/model/model.py b/model/model.py
index abeff81..e92eb06 100644
--- a/model/model.py
+++ b/model/model.py
@@ -5,7 +5,7 @@ from torch.autograd import Variable
 import torch.nn.functional as F
 
-from apex import RNN
+from .RNN_utils import RNN
 from .transformer_utils import Embedding
 from .transformer import TransformerDecoder
diff --git a/model/sentiment_classifier.py b/model/sentiment_classifier.py
index 1d78c2b..3ff60e2 100644
--- a/model/sentiment_classifier.py
+++ b/model/sentiment_classifier.py
@@ -4,7 +4,7 @@ import numpy as np
 from itertools import chain
 
-from model import RNNFeaturizer, TransformerFeaturizer
+from .model import RNNFeaturizer, TransformerFeaturizer
 from .transformer_utils import GeLU
 
 class BinaryClassifier(nn.Module):
diff --git a/pretrain.py b/pretrain.py
index 5adfdc7..66af886 100755
--- a/pretrain.py
+++ b/pretrain.py
@@ -17,7 +17,7 @@ import model as m
 from model import DistributedDataParallel as DDP
 
-from apex.reparameterization import apply_weight_norm, remove_weight_norm
+from reparameterization import apply_weight_norm, remove_weight_norm
 from configure_data import configure_data
 from learning_rates import AnnealingLR, WarmupLR, SlantedTriangularLR
 from arguments import add_general_args, add_model_args, add_unsupervised_data_args
diff --git a/apex_utils/apex/reparameterization/__init__.py b/reparameterization/__init__.py
similarity index 100%
rename from apex_utils/apex/reparameterization/__init__.py
rename to reparameterization/__init__.py
diff --git a/apex_utils/apex/reparameterization/reparameterization.py b/reparameterization/reparameterization.py
similarity index 100%
rename from apex_utils/apex/reparameterization/reparameterization.py
rename to reparameterization/reparameterization.py
diff --git a/apex_utils/apex/reparameterization/weight_norm.py b/reparameterization/weight_norm.py
similarity index 100%
rename from apex_utils/apex/reparameterization/weight_norm.py
rename to reparameterization/weight_norm.py
diff --git a/run_classifier.py b/run_classifier.py
index 7d9a84d..a2433d7 100755
--- a/run_classifier.py
+++ b/run_classifier.py
@@ -13,7 +13,7 @@ import numpy as np
 import pandas as pd
 
-from apex.reparameterization import apply_weight_norm, remove_weight_norm
+from reparameterization import apply_weight_norm, remove_weight_norm
 from model import SentimentClassifier
 from configure_data import configure_data
diff --git a/setup.py b/setup.py
index 2745298..8bc4f68 100644
--- a/setup.py
+++ b/setup.py
@@ -2,39 +2,20 @@ from setuptools import setup, find_packages
 import torch
 
-curdir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
-                      'apex_utils')
-os.chdir(curdir)
-
-if not torch.cuda.is_available():
-    print("Warning: Torch did not find available GPUs on this system.\n",
-          "If your intention is to cross-compile, this is not an error.")
-
 print("torch.__version__  = ", torch.__version__)
 TORCH_MAJOR = int(torch.__version__.split('.')[0])
 TORCH_MINOR = int(torch.__version__.split('.')[1])
 
 if TORCH_MAJOR == 0 and TORCH_MINOR < 4:
-    raise RuntimeError("APEx requires Pytorch 0.4 or newer.\n" +
+    raise RuntimeError("Sentiment Discovery requires Pytorch 0.4 or newer.\n" +
                        "The latest stable release can be obtained from https://pytorch.org/")
 
 print("Building module.")
 
 setup(
-    name='apex', version='0.1',
+    name='sentiment_discovery', version='0.4',
     # ext_modules=[cuda_ext,],
     description='PyTorch Extensions written by NVIDIA',
-    packages=find_packages(where='.',
-                           exclude=(
-                               "build",
-                               "csrc",
-                               "include",
-                               "tests",
-                               "dist",
-                               "docs",
-                               "tests",
-                               "examples",
-                               "apex.egg-info",
-                           )),
+    packages=find_packages(where='.'),
     install_requires=[
         "numpy",
         "pandas",
diff --git a/transfer.py b/transfer.py
index 37c73dc..2a64adb 100644
--- a/transfer.py
+++ b/transfer.py
@@ -18,7 +18,7 @@ from logreg_utils import train_logreg
 from fp16 import FP16_Module, FP16_Optimizer
 
-from apex.reparameterization import apply_weight_norm, remove_weight_norm
+from reparameterization import apply_weight_norm, remove_weight_norm
 from model import RNNFeaturizer, TransformerFeaturizer
 from configure_data import configure_data
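-- 
Two illustrative notes on the result of this patch (an addendum, not part of the
diff; `fused_norm_reference` is a hypothetical name, and the snippets assume the
repository root is on sys.path):

    # The vendored modules now replace the old apex imports:
    from reparameterization import apply_weight_norm, remove_weight_norm
    from model.RNN_utils import RNN

    # The deleted apex._C extension computed an fp32 L2 norm over each
    # slowest-dim row. A minimal pure-PyTorch sketch of that forward pass:
    import torch

    def fused_norm_reference(input):
        # Flatten everything after the slowest dim and norm each row in fp32.
        rows = input.contiguous().view(input.size(0), -1).float()
        norms = rows.norm(2, 1).view((input.size(0),) + (1,) * (input.dim() - 1))
        # The output keeps the input's precision, as the deleted kernel did.
        return (input.float() / norms).type_as(input), norms

With the CUDA extension gone, setup.py keeps only the Pytorch version check:
no GPU probing happens at install time and nothing is compiled, so the package
installs on machines without CUDA.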