From fa524355fad18e849a9ea0de3039d091fcce13dc Mon Sep 17 00:00:00 2001
From: Raul Puri
Date: Sat, 5 Jan 2019 15:53:14 -0800
Subject: [PATCH] Remove apex install dependencies

---
 apex_utils/apex/__init__.py                   |   3 -
 apex_utils/apex/utils/FusedNorm.py            | 129 --------
 apex_utils/apex/utils/__init__.py             |   1 -
 apex_utils/csrc/Module.cpp                    | 249 --------------
 apex_utils/csrc/kernel.cu                     | 304 ------------------
 apex_utils/include/THCTensorInfo.cuh          | 142 --------
 apex_utils/include/kernel.h                   |  74 -----
 finetune_classifier.py                        |   2 +-
 .../RNN_utils}/RNN/RNNBackend.py              |   0
 .../apex => model/RNN_utils}/RNN/__init__.py  |   0
 .../apex => model/RNN_utils}/RNN/cells.py     |   0
 .../apex => model/RNN_utils}/RNN/models.py    |   0
 model/model.py                                |   2 +-
 model/sentiment_classifier.py                 |   2 +-
 pretrain.py                                   |   2 +-
 .../__init__.py                               |   0
 .../reparameterization.py                     |   0
 .../weight_norm.py                            |   0
 run_classifier.py                             |   2 +-
 setup.py                                      |  25 +-
 transfer.py                                   |   2 +-
 21 files changed, 9 insertions(+), 930 deletions(-)
 delete mode 100644 apex_utils/apex/__init__.py
 delete mode 100644 apex_utils/apex/utils/FusedNorm.py
 delete mode 100644 apex_utils/apex/utils/__init__.py
 delete mode 100644 apex_utils/csrc/Module.cpp
 delete mode 100644 apex_utils/csrc/kernel.cu
 delete mode 100644 apex_utils/include/THCTensorInfo.cuh
 delete mode 100644 apex_utils/include/kernel.h
 rename {apex_utils/apex => model/RNN_utils}/RNN/RNNBackend.py (100%)
 rename {apex_utils/apex => model/RNN_utils}/RNN/__init__.py (100%)
 rename {apex_utils/apex => model/RNN_utils}/RNN/cells.py (100%)
 rename {apex_utils/apex => model/RNN_utils}/RNN/models.py (100%)
 rename {apex_utils/apex/reparameterization => reparameterization}/__init__.py (100%)
 rename {apex_utils/apex/reparameterization => reparameterization}/reparameterization.py (100%)
 rename {apex_utils/apex/reparameterization => reparameterization}/weight_norm.py (100%)

diff --git a/apex_utils/apex/__init__.py b/apex_utils/apex/__init__.py
deleted file mode 100644
index dff8651..0000000
--- a/apex_utils/apex/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from . import RNN
-from . import reparameterization
-#from . import utils
diff --git a/apex_utils/apex/utils/FusedNorm.py b/apex_utils/apex/utils/FusedNorm.py
deleted file mode 100644
index e2c65d5..0000000
--- a/apex_utils/apex/utils/FusedNorm.py
+++ /dev/null
@@ -1,129 +0,0 @@
-import torch
-from torch.autograd import Variable
-from torch.autograd.function import Function, once_differentiable
-import apex._C
-
-class FusedNorm(Function):
-    """
-    Normalizes (using an L2 norm) a Variable across the slowest dimension.
-    Assumes the input Variable is contiguous.
-    Internally, all calculations are performed in fp32, regardless
-    of input/output precision.
-    """
-
-    @staticmethod
-    def forward(ctx, input):
-        """
-        input is assumed to be contiguous.
-        input may be either float or half precision.
-        The precision of output will match the precision of input.
-        A float copy of the L2 norm across each slowest-dim row
-        is also created and saved for the backward pass.
-        """
-        # torch.cuda.nvtx.range_push("FusedNorm.forward, input.size() = {}"
-        #                            .format(input.size()))
-
-        if not input.is_contiguous():
-            raise RuntimeError("In FusedNorm.forward(): "
-                               "input with size {} is not contiguous"
-                               .format(input.size()))
-        if not input.is_cuda:
-            raise RuntimeError("In FusedNorm.forward(): input.is_cuda = False. "
-                               "Currently, only cuda tensors are supported.")
-
-        """
-        This is ok, new() treats a torch.Size object properly.
-        No need to unpack with an asterisk via new(*input.size()).
- """ - output = input.new(input.size()).contiguous() - - """ - For output with size (slow, faster, faster, ...fastest), we may want - norms with size (slow, 1, 1, ...1), so that if you want retrieve norms - and apply the same normalizing factors to another Tensor "t" with the - same size as output, "t/norms" will broadcast each element of norms - across the corresponding slowest dim of t. - """ - norm_size = (output.size(0),) + (1,)*(output.dim()-1) - norms = torch.cuda.FloatTensor(*norm_size).contiguous() - """ - Beware: If you call the following: - norms = torch.cuda.FloatTensor(norm_size).contiguous() - the constructor sees a tuple: - FloatTensor( (output_size(0),1,1,...) ) - and creates a 1D tensor with values from the tuple: - [output_size(0),1,1,...]. - """ - - # torch.cuda.synchronize() - - # print("norms = ", norms) - # print("norms.size () = ", norms.size()) - # print("norms.stride() = ", norms.stride()) - - # print("type(input) = ", type(input)) - # print("type(output) = ", type(output)) - # print("type(norms) = ", type(norms)) - # print( "input.data_ptr = {:x}".format(input.data_ptr())) - - apex._C.norm_fwd(input, output, norms) - # apex._C.norm_fwd(input.data, output.data, norms) - - # torch.cuda.synchronize() - - # print("norms in forward(): ", norms) - - ctx.save_for_backward(input) - - # save_for_backward can only save input or output tensors, - # so here's a hacky workaround to save the norms: - ctx.norms = norms - - # torch.cuda.nvtx.range_pop() - - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - """ - grad_output is assumed to be contiguous. - grad_output may be either float or half precision. - The precision of grad_input will match the precision of grad_output. - """ - # torch.cuda.nvtx.range_push("FusedNorm.backward, grad_output.size() = {}" - # .format(grad_output.size())) - - if not grad_output.is_cuda: - raise RuntimeError("In FusedNorm.backward(): grad_output.is_cuda = False." 
- "Currently, only cuda tensors are supported.") - - savedInput, = ctx.saved_tensors - norms = ctx.norms - - # better safe than sorry - grad_output_contig = grad_output.contiguous() - grad_input = grad_output_contig.new(grad_output.size()).contiguous() - - apex._C.norm_bwd(grad_output_contig, grad_input, savedInput, norms) - # apex._C.norm_bwd(grad_output_contig.data, grad_input.data, savedInput.data, norms) - - # torch.cuda.nvtx.range_pop() - - # print("\n\n") - # print("grad_output.is_contiguous() = {:x}".format(grad_output.is_contiguous())) - # print(" grad_input.is_contiguous() = {:x}".format( grad_input.is_contiguous())) - # print(" savedInput.is_contiguous() = {:x}".format( savedInput.is_contiguous())) - # print(" norms.is_contiguous() = {:x}".format( norms.is_contiguous())) - # print("\n\n") - # print("grad_output.data_ptr = {:x}".format(grad_output.data_ptr())) - # print(" grad_input.data_ptr = {:x}".format( grad_input.data_ptr())) - # print(" savedInput.data_ptr = {:x}".format( savedInput.data_ptr())) - # print(" norms.data_ptr = {:x}".format( norms.data_ptr())) - # print("\n\n") - # print("grad_output in backward(): ", grad_output) - # print(" grad_input in backward(): ", grad_input) - # print(" savedInput in backward(): ", savedInput) - # print(" norms in backward(): ", norms) - - return grad_input diff --git a/apex_utils/apex/utils/__init__.py b/apex_utils/apex/utils/__init__.py deleted file mode 100644 index 423daac..0000000 --- a/apex_utils/apex/utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .FusedNorm import FusedNorm diff --git a/apex_utils/csrc/Module.cpp b/apex_utils/csrc/Module.cpp deleted file mode 100644 index 27ae9d1..0000000 --- a/apex_utils/csrc/Module.cpp +++ /dev/null @@ -1,249 +0,0 @@ -#define PY_SSIZE_T_CLEAN -#define ARG_OFFSET 5 - -#include - -#include -#include - -#include -#include -#include -#include -#include - -// #define USE_NVTX -#ifdef USE_NVTX -#include "nvToolsExt.h" -#endif - -//Meta-data format we will use -#include - -//Cuda kernels -#include - -#define ERROR_MSG cout << "Error at " << __FILE__ << ":" << __LINE__ << "\n"; - -using namespace std; -TensorInfo PyOb_2_tinfo(PyObject* tensor, float_types data_type) -{ - PyObject* PyStrides = PyObject_CallMethod(tensor, "stride", NULL); - if(PyStrides == NULL) - { - ERROR_MSG; - cout << "PyStrides = NULL" << endl; - } - - PyObject* PySizes = PyObject_CallMethod(tensor, "size", NULL); - if(PySizes == NULL) - { - ERROR_MSG; - cout << "PySizes = NULL" << endl; - } - - PyObject* PyDataPtr = PyObject_CallMethod(tensor, "data_ptr", NULL); - if(PyDataPtr == NULL) - { - ERROR_MSG; - cout << "PyDataPtr = NULL" << endl; - } - - void* data_ptr = (void*) PyLong_AsLong(PyDataPtr); - Py_ssize_t ndims = PyList_GET_SIZE(PySizes); - //TODO put proper checking on ndims < MAX_CUTORCH_DIMS - IDXTYPE strides[MAX_CUTORCH_DIMS], sizes[MAX_CUTORCH_DIMS]; - - for(int i = 0; i < ndims; i++) - { - strides[i] = PyLong_AsLong(PyTuple_GetItem(PyStrides, i)); - sizes[i] = PyLong_AsLong(PyTuple_GetItem(PySizes, i)); - } - - // Reference counts still behave strangely, but at least these appear to cap - // the process' memory usage. 
-  Py_DECREF(PyStrides);
-  Py_DECREF(PySizes);
-  Py_DECREF(PyDataPtr);
-
-  return TensorInfo<void*, IDXTYPE>(data_ptr, ndims, sizes, strides, data_type);
-}
-
-vector<TensorInfo<void*, IDXTYPE> > get_TInfos(PyObject* args)
-{
-  vector<TensorInfo<void*, IDXTYPE> > info_vec;
-#ifdef DEBUG_ANY
-  cout << "Processing " << PyTuple_GET_SIZE(args) << " arguments" << endl;
-#endif
-
-#ifdef CHECK_MEMLEAK
-  for(int iter = 0; iter < 1e7; iter++ )
-#endif
-  for(Py_ssize_t i=0; i < PyTuple_GET_SIZE(args); i++)
-  {
-    PyObject* pyTensor = PyTuple_GetItem(args, i);
-    string objType(pyTensor->ob_type->tp_name);
-
-    PyObject* pyObjTypeCall = PyObject_CallMethod(pyTensor, "type", NULL);
-    if(pyObjTypeCall == NULL)
-    {
-      ERROR_MSG;
-      cout << "For args item " << i << ", pyObjTypeCall = NULL" << endl;
-    }
-
-    // This gives a segfault:
-    // cout << "pyObjTypeCall direct conversion attempt = " <<
-    //         PyBytes_AsString(pyObjTypeCall) << endl;
-
-    PyObject* pyObjASCII = PyUnicode_AsASCIIString(pyObjTypeCall);
-    if(pyObjASCII == NULL)
-    {
-      ERROR_MSG;
-      cout << "For args item " << i << ", pyObjASCII = NULL " << endl;
-    }
-
-    // cout << "Py_REFCNT(pyObjTypeCall) = " << Py_REFCNT(pyObjTypeCall) << endl;
-    Py_DECREF(pyObjTypeCall);
-
-    string objTypeCall(PyBytes_AsString(pyObjASCII));
-
-    // cout << "Py_REFCNT(pyObjASCII) = " << Py_REFCNT(pyObjASCII) << endl;
-    Py_DECREF(pyObjASCII);
-
-#ifdef DEBUG_ANY
-    cout << "arg " << i << endl;
-    cout << "objType = " << objType << endl;
-    cout << "objTypeCall = " << objTypeCall << endl;
-#endif
-
-    if(objTypeCall == "torch.cuda.FloatTensor")
-#ifdef CHECK_MEMLEAK
-      if(iter == 0 )
-#endif
-        info_vec.push_back(PyOb_2_tinfo(pyTensor, FLOAT));
-#ifdef CHECK_MEMLEAK
-      else
-        info_vec[i] = PyOb_2_tinfo(pyTensor, FLOAT);
-#endif
-    else if(objTypeCall == "torch.cuda.HalfTensor")
-      info_vec.push_back(PyOb_2_tinfo(pyTensor, HALF));
-    // Could add double
-    else
-    {
-      ERROR_MSG;
-      cout << "For args item " << i << ", unsupported .type() found: "
-           << objTypeCall << "\n"
-              "Supported types:\n"
-              "torch.cuda.FloatTensor\n"
-              "torch.cuda.HalfTensor\n"
-              "torch.autograd.variable.Variable containing FloatTensor\n"
-              "torch.autograd.variable.Variable containing HalfTensor\n"
-              "torch.nn.parameter.Parameter containing FloatTensor\n"
-              "torch.nn.parameter.Parameter containing HalfTensor\n"
-           << endl;
-    }
-  }
-
-  // PyErr_SetString(PyExc_RuntimeError, "Exception set in ");
-
-  return info_vec;
-}
-
-//Will extract all tensors in order. Assumes flat structure; tensors cannot be wrapped in lists,
-//tuples or any other iterator structure.
-static PyObject* norm_fwd(PyObject* self, PyObject* args)
-{
-#ifdef USE_NVTX
-nvtxRangePushA("norm_fwd C backend");
-#endif
-
-  vector<TensorInfo<void*, IDXTYPE> > tensors = get_TInfos(args);
-
-#ifdef DEBUG_ANY
-  cout << "tensors.size() = " << tensors.size() << endl;
-#endif
-
-  IDXTYPE totalElems = 1;
-  for(int i = 0; i < tensors[0].dims; i++ )
-    totalElems *= tensors[0].sizes[i];
-  send_to_fwd
-  (
-    tensors[0],
-    tensors[1],
-    tensors[2],
-    totalElems
-  );
-
-#ifdef USE_NVTX
-nvtxRangePop();
-#endif
-
-  Py_RETURN_NONE;
-}
-
-static PyObject* norm_bwd(PyObject* self, PyObject* args)
-{
-#ifdef USE_NVTX
-nvtxRangePushA("norm_bwd C backend");
-#endif
-
-  vector<TensorInfo<void*, IDXTYPE> > tensors = get_TInfos(args);
-
-#ifdef DEBUG_ANY
-  cout << "tensors.size() = " << tensors.size() << endl;
-#endif
-
-  IDXTYPE totalElems = 1;
-  for(int i = 0; i < tensors[0].dims; i++ )
-    totalElems *= tensors[0].sizes[i];
-  send_to_bwd
-  (
-    tensors[0],
-    tensors[1],
-    tensors[2],
-    tensors[3],
-    totalElems
-  );
-
-#ifdef USE_NVTX
-nvtxRangePop();
-#endif
-
-  Py_RETURN_NONE;
-}
-
-
-
-//*******************PYTHON BOILER PLATE*******************
-static PyMethodDef apex_methods[] = {
-  {"norm_fwd", (PyCFunction) norm_fwd, METH_VARARGS, "Slowest-dim norm, forward pass."},
-  {"norm_bwd", (PyCFunction) norm_bwd, METH_VARARGS, "Slowest-dim norm, backward pass."},
-  {NULL, NULL, 0, NULL}
-};
-
-#if PY_MAJOR_VERSION >= 3
-
-//Module Definitions
-static struct PyModuleDef apex = {
-  PyModuleDef_HEAD_INIT, "apex._C", "Module to add CUDA extensions to Pytorch.", -1, apex_methods
-};
-//Initialization Function
-PyMODINIT_FUNC PyInit__C(void){
-
-  //Let's throw an error if we can't find pytorch.
-  PyImport_ImportModule("torch");
-  Py_Initialize();
-  return PyModule_Create(&apex);
-}
-#else
-PyMODINIT_FUNC initMODULE(void){
-  //Let's throw an error if we can't find pytorch.
-  PyImport_ImportModule("torch");
-  (void) Py_InitModule3("apex._C", apex, "A PyTorch Extension.");
-}
-
-#endif
-//*********************************************************
-
diff --git a/apex_utils/csrc/kernel.cu b/apex_utils/csrc/kernel.cu
deleted file mode 100644
index c755fde..0000000
--- a/apex_utils/csrc/kernel.cu
+++ /dev/null
@@ -1,304 +0,0 @@
-#include "../include/kernel.h"
-
-template <typename T> struct TtoInt { static const int test = -1; };
-template<> struct TtoInt<float>  { static const int test = 0; };
-template<> struct TtoInt<half>   { static const int test = 0; };
-template<> struct TtoInt<double> { static const int test = 0; };
-
-#if __CUDACC_VER_MAJOR__ >= 9
-#define __SHFL_DOWN(var, delta)  __shfl_down_sync(0xffffffff, var, delta)
-#else
-#define __SHFL_DOWN(var, delta)  __shfl_down(var, delta)
-#endif
-
-#if __CUDACC_VER_MAJOR__ >= 9
-#define __SYNCWARP __syncwarp()
-#else
-#define __SYNCWARP
-#endif
-
-#define BLOCK 256
-
-using namespace std;
-
-template <typename T>
-__device__ __forceinline__ T block_reduce(T *x, T val)
-{
-  int tidx = threadIdx.x;
-  if(blockDim.x >= 64)
-  {
-    x[tidx] = val;
-    __syncthreads();
-  }
-
-  #pragma unroll
-  for(int i = (blockDim.x >> 1); i >= 64; i >>= 1)
-  {
-    if( tidx < i )
-      x[tidx] += x[tidx+i]; // JoinOp
-    __syncthreads();
-  }
-
-  if(tidx < 32)
-  {
-    T final;
-    if(blockDim.x >= 64)
-      final = x[tidx] + x[tidx+32];
-    else
-      final = val;
-    // __SYNCWARP();
-
-    #pragma unroll
-    for( int i = 16; i > 0; i >>= 1)
-      final += __SHFL_DOWN(final, i);
-
-    if(tidx == 0)
-      x[0] = final;
-  }
-
-  __syncthreads();
-  return x[0];
-}
-
-template <typename T, typename IndexType>
-__global__ void norm_fwd_kernel
-(
-  TensorInfo<T, IndexType> input,
-  TensorInfo<T, IndexType> output,
-  TensorInfo<float, IndexType> norms,
-  IndexType totalElems,
-  IndexType rowSize
-)
-{
-  // We are norming each slowest-dim row of the tensor separately.
-  // For now, assign one block to each row.
-  IndexType tid = threadIdx.x;
-  IndexType row = blockIdx.x;
-  IndexType stride = blockDim.x;
-
-  // Logical index offset for this flattened row
-  IndexType rowStart = row*rowSize;
-
-  extern __shared__ float s[];
-
-  float thread_sum = 0.f;
-  for(IndexType i = tid; i < rowSize; i += stride )
-  {
-    float val_f = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(input, i + rowStart));
-    thread_sum += val_f*val_f; // AccumOp, could do Kahan here
-  }
-
-  float result = block_reduce(s, thread_sum);
-
-  // if(tid == 0)
-  //   printf("norm for row %d = %f\n", row, sqrtf(result));
-
-  if(tid == 0)
-    DEVICE_LINEAR_GET_F(norms, row) = sqrtf(result);
-
-  // Write data to output
-  for(IndexType i = tid; i < rowSize; i += stride )
-  {
-    float val_f = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(input, i + rowStart));
-    DEVICE_LINEAR_GET(output, i + rowStart) = ScalarConvert<float, T>::to(val_f*rsqrtf(result));
-  }
-}
-
-template <typename T, typename IndexType>
-__global__ void norm_bwd_kernel
-(
-  TensorInfo<T, IndexType> pLpOutput,
-  TensorInfo<T, IndexType> pLpInput,
-  TensorInfo<T, IndexType> savedInput,
-  TensorInfo<float, IndexType> savedNorms,
-  IndexType totalElems,
-  IndexType rowSize
-)
-{
-  // For now, assign one block to each row.
-  IndexType tid = threadIdx.x;
-  IndexType row = blockIdx.x;
-  IndexType stride = blockDim.x;
-
-  // Logical index offset for this flattened row
-  IndexType rowStart = row*rowSize;
-
-  extern __shared__ float s[];
-
-  float thread_sum = 0.f;
-  for(IndexType i = tid; i < rowSize; i += stride )
-  {
-    float pLpOutputi = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(pLpOutput, i + rowStart));
-    float savedInputi = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedInput, i + rowStart));
-    thread_sum += pLpOutputi*savedInputi; // AccumOp, could do Kahan here
-  }
-
-  float result = block_reduce(s, thread_sum);
-
-  // if(tid == 0)
-  // {
-  //   printf
-  //   (
-  //     "blockDim.x = %ld\n"
-  //     "pLpOutput data pointer = %lx\n"
-  //     "pLpInput data pointer = %lx\n"
-  //     "savedInput data pointer = %lx\n"
-  //     "savedNorms data pointer = %lx\n",
-  //     blockDim.x,
-  //     pLpOutput.data,
-  //     pLpInput.data,
-  //     savedInput.data,
-  //     savedNorms.data
-  //   );
-  //   printf("result for row %d = %f\n", row, result );
-  //   printf("thread_sum for row %d = %f\n", row, thread_sum);
-  // }
-
-  // Could choose to save reciprocal of norm instead I suppose, but norms is probably
-  // more handy to keep around
-  float rnorm = 1.f/DEVICE_LINEAR_GET_F(savedNorms, row);
-  float rnorm3 = rnorm*rnorm*rnorm;
-
-  // Write data to output. We are reusing values that were loaded earlier, so there
-  // is an optimization opportunity here (store values persistently).
-  for(IndexType j = tid; j < rowSize; j += stride )
-  {
-    float pLpOutputj = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(pLpOutput, j + rowStart));
-    float savedInputj = ScalarConvert<T, float>::to(DEVICE_LINEAR_GET(savedInput, j + rowStart));
-    float pLpInputj = rnorm*pLpOutputj - rnorm3*savedInputj*result;
-    DEVICE_LINEAR_GET(pLpInput, j + rowStart) = ScalarConvert<float, T>::to(pLpInputj);
-  }
-}
-
-// template
-template <typename T, typename IndexType>
-void send_to_fwd
-(
-  TensorInfo<T, IndexType> input,
-  TensorInfo<T, IndexType> output,
-  TensorInfo<T, IndexType> norms,
-  IndexType totalElems
-)
-{
-#ifdef DEBUG_ANY
-  cout << "hello from send_to_fwd with input.type = " << input.type << endl;
-#endif
-
-  // Find logical size of each flattened slowest-dim row
-  IndexType rowSize = 1;
-  for(IndexType i = input.dims - 1; i > 0; i--)
-    rowSize *= input.sizes[i];
-
-  switch(input.type)
-  {
-    case FLOAT:
-#ifdef DEBUG_ANY
-      cout << "case FLOAT" << endl;
-#endif
-      norm_fwd_kernel<<<totalElems/rowSize, BLOCK, BLOCK*sizeof(float)>>>
-      (
-        *((TensorInfo<float, IndexType>*)&input),  // Safer: Make a copy constructor that constructs
-        *((TensorInfo<float, IndexType>*)&output), // the typed version from a void, instead of a cast.
-        *((TensorInfo<float, IndexType>*)&norms),
-        totalElems,
-        rowSize
-      );
-      break;
-    case HALF:
-#ifdef DEBUG_ANY
-      cout << "case HALF" << endl;
-#endif
-      norm_fwd_kernel<<<totalElems/rowSize, BLOCK, BLOCK*sizeof(float)>>>
-      (
-        *((TensorInfo<half, IndexType>*)&input),
-        *((TensorInfo<half, IndexType>*)&output),
-        *((TensorInfo<float, IndexType>*)&norms),
-        totalElems,
-        rowSize
-      );
-      break;
-    default:
-      std::cout << "Unsupported input.type in send_to_fwd()" << std::endl;
-      cudaDeviceSynchronize();
-      exit(-1);
-  }
-#ifdef DEBUG_PROFILE
-  cudaDeviceSynchronize();
-#endif
-}
-
-// template
-template <typename T, typename IndexType>
-void send_to_bwd
-(
-  TensorInfo<T, IndexType> pLpOutput,
-  TensorInfo<T, IndexType> pLpInput,
-  TensorInfo<T, IndexType> savedInput,
-  TensorInfo<T, IndexType> savedNorms,
-  IndexType totalElems
-)
-{
-#ifdef DEBUG_ANY
-  cout << "Hello from send_to_bwd with pLpOutput.type = " << pLpOutput.type << endl;
-#endif
-
-  // Find logical size of each flattened slowest-dim row
-  IndexType rowSize = 1;
-  for(IndexType i = savedInput.dims - 1; i > 0; i--)
-    rowSize *= savedInput.sizes[i];
-
-  switch(pLpOutput.type)
-  {
-    case FLOAT:
-#ifdef DEBUG_ANY
-      cout << "case FLOAT" << endl;
-#endif
-      norm_bwd_kernel<<<totalElems/rowSize, BLOCK, BLOCK*sizeof(float)>>>
-      (
-        *((TensorInfo<float, IndexType>*)&pLpOutput),
-        *((TensorInfo<float, IndexType>*)&pLpInput),
-        *((TensorInfo<float, IndexType>*)&savedInput),
-        *((TensorInfo<float, IndexType>*)&savedNorms),
-        totalElems,
-        rowSize
-      );
-      break;
-    case HALF:
-#ifdef DEBUG_ANY
-      cout << "case HALF" << endl;
-#endif
-      norm_bwd_kernel<<<totalElems/rowSize, BLOCK, BLOCK*sizeof(float)>>>
-      (
-        *((TensorInfo<half, IndexType>*)&pLpOutput),
-        *((TensorInfo<half, IndexType>*)&pLpInput),
-        *((TensorInfo<half, IndexType>*)&savedInput),
-        *((TensorInfo<float, IndexType>*)&savedNorms),
-        totalElems,
-        rowSize
-      );
-      break;
-    default:
-      cout << "Unsupported pLpOutput.type in send_to_bwd()" << std::endl;
-      cudaDeviceSynchronize();
-      exit(-1);
-  }
-#ifdef DEBUG_PROFILE
-  cudaDeviceSynchronize();
-#endif
-}
-
-template void send_to_fwd<void*, IDXTYPE>
-(
-  TensorInfo<void*, IDXTYPE>,
-  TensorInfo<void*, IDXTYPE>,
-  TensorInfo<void*, IDXTYPE>,
-  IDXTYPE
-);
-template void send_to_bwd<void*, IDXTYPE>
-(
-  TensorInfo<void*, IDXTYPE>,
-  TensorInfo<void*, IDXTYPE>,
-  TensorInfo<void*, IDXTYPE>,
-  TensorInfo<void*, IDXTYPE>,
-  IDXTYPE
-);
diff --git a/apex_utils/include/THCTensorInfo.cuh b/apex_utils/include/THCTensorInfo.cuh
deleted file mode 100644
index dec6a54..0000000
--- a/apex_utils/include/THCTensorInfo.cuh
+++ /dev/null
@@ -1,142 +0,0 @@
-#ifndef THC_TENSOR_INFO_INC
-#define THC_TENSOR_INFO_INC
-
-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <assert.h>
-
-// Maximum number of dimensions allowed for cutorch
-#define MAX_CUTORCH_DIMS 10
-
-// Warning string for tensor arguments that are too large or have too
-// many dimensions
-#define CUTORCH_STR(X) #X
-#define CUTORCH_DIM_WARNING "tensor too large or too many (>" \
-  CUTORCH_STR(MAX_CUTORCH_DIMS) ") dimensions"
-
-enum float_types { FLOAT = 0 , HALF = 1, DOUBLE = 2 };
-
-// CUDA kernel argument that defines tensor layout
-template <typename T, typename IndexType>
-struct TensorInfo {
-
-  TensorInfo(T* p,
-             int dim,
-             IndexType sz[MAX_CUTORCH_DIMS],
-             IndexType st[MAX_CUTORCH_DIMS]);
-
-  TensorInfo(T* p,
-             int dim,
-             IndexType sz[MAX_CUTORCH_DIMS],
-             IndexType st[MAX_CUTORCH_DIMS],
-             float_types type);
-
-  //Good way to cast from another format
-  //template <typename U, typename V>
-  //TensorInfo(TensorInfo<U, V> &tinfo_in){
-  //  data = reinterpret_cast<T*>(tinfo_in.data);
-  //}
-
-  T* data;
-  IndexType sizes[MAX_CUTORCH_DIMS];
-  IndexType strides[MAX_CUTORCH_DIMS];
-  int dims;
-  float_types type;
-};
-
-//Expand our combinations as convenient typedefs
-typedef TensorInfo<half, int> t_hi;
-typedef TensorInfo<half, long> t_hl;
-typedef TensorInfo<float, int> t_fi;
-typedef TensorInfo<float, long> t_fl;
-
-
-template <typename T, typename IndexType>
-TensorInfo<T, IndexType>::TensorInfo(T* p,
-                                     int dim,
-                                     IndexType sz[MAX_CUTORCH_DIMS],
-                                     IndexType st[MAX_CUTORCH_DIMS]) {
-  data = p;
-  dims = dim;
-  assert(dims > 0 && dims < MAX_CUTORCH_DIMS);
-
-  for (int i = 0; i < dim; ++i) {
-    sizes[i] = sz[i];
-    strides[i] = st[i];
-  }
-}
-
-template <typename T, typename IndexType>
-TensorInfo<T, IndexType>::TensorInfo(T* p,
-                                     int dim,
-                                     IndexType sz[MAX_CUTORCH_DIMS],
-                                     IndexType st[MAX_CUTORCH_DIMS],
-                                     float_types _type){
-  data = p;
-  dims = dim;
-  assert(dims > 0 && dims < MAX_CUTORCH_DIMS);
-
-  for (int i = 0; i < dim; ++i) {
-    sizes[i] = sz[i];
-    strides[i] = st[i];
-  }
-  type=_type;
-}
-
-
-
-// Translate a linear index for the apply to a T* offset;
-// specialized on `Dims` to reduce nvcc compilation time
-template <typename T, typename IndexType, int Dims>
-struct IndexToOffset {
-  static __forceinline__ __host__ __device__ IndexType get(
-    IndexType linearId,
-    const TensorInfo<T, IndexType>& info) {
-    IndexType offset = 0;
-
-    // Use static dims
-    for (int i = Dims - 1; i > 0; --i) {
-      offset += linearId % info.sizes[i] * info.strides[i];
-      linearId /= info.sizes[i];
-    }
-
-    offset += linearId * info.strides[0];
-    return offset;
-  }
-};
-
-
-
-// For contiguous tensors, the offset = index
-template <typename T, typename IndexType>
-struct IndexToOffset<T, IndexType, -2> {
-  static __forceinline__ __host__ __device__ IndexType
-    get(IndexType linearId, const TensorInfo<T, IndexType>& info) {
-    return linearId;
-  }
-};
-
-template <typename T, typename IndexType>
-struct IndexToOffset<T, IndexType, -1> {
-  static __forceinline__ __host__ __device__ IndexType get(
-    IndexType linearId,
-    const TensorInfo<T, IndexType>& info) {
-
-    IndexType offset = 0;
-
-    // Use dynamic dims
-    for (int i = info.dims - 1; i >= 0; --i) {
-      IndexType curDimIndex = linearId % info.sizes[i];
-      IndexType curDimOffset = curDimIndex * info.strides[i];
-      offset += curDimOffset;
-
-      linearId /= info.sizes[i];
-    }
-
-    return offset;
-  }
-};
-
-#endif // THC_TENSOR_INFO_INC
diff --git a/apex_utils/include/kernel.h b/apex_utils/include/kernel.h
deleted file mode 100644
index 1a2244c..0000000
--- a/apex_utils/include/kernel.h
+++ /dev/null
@@ -1,74 +0,0 @@
-#include "THCTensorInfo.cuh"
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <stdio.h>
-#define Dims -2
-#define DEVICE_LINEAR_GET(D_TENSOR, INDEX) D_TENSOR.data[IndexToOffset<T, IndexType, Dims>::get(INDEX, D_TENSOR)]
-#define DEVICE_LINEAR_GET_F(D_TENSOR, INDEX) D_TENSOR.data[IndexToOffset<float, IndexType, Dims>::get(INDEX, D_TENSOR)]
-
-// template <typename T, typename IndexType>
-// void send_to_kernel(
-//   TensorInfo<T, IndexType> Input_1,
-//   TensorInfo<T, IndexType> Input_2,
-//   IndexType totalElems
-// );
-
-template <typename T, typename IndexType>
-void send_to_fwd
-(
-  TensorInfo<T, IndexType> input,      // Forward-pass input
-  TensorInfo<T, IndexType> output,     // Forward-pass output
-  TensorInfo<T, IndexType> norms,
-  IndexType totalElems
-);
-
-template <typename T, typename IndexType>
-void send_to_bwd
-(
-  TensorInfo<T, IndexType> pLpOutput,  // Incoming backward-pass gradients wrt forward-pass outputs
-  TensorInfo<T, IndexType> pLpInput,   // Result: the gradients with respect to forward-pass inputs
-  TensorInfo<T, IndexType> savedInput,
-  TensorInfo<T, IndexType> norms,
-  IndexType totalElems
-);
-
-template <typename In, typename Out>
-struct ScalarConvert {
-  static __host__ __device__ __forceinline__ Out to(const In v) { return (Out) v; }
-};
-
-#ifdef CUDA_HALF_TENSOR
-template <typename Out>
-struct ScalarConvert<half, Out> {
-  static __host__ __device__ __forceinline__ Out to(const half v) {
-#ifdef __CUDA_ARCH__
-    return (Out) __half2float(v);
-#else
-    return (Out) THC_half2float(v);
-#endif
-  }
-};
-
-template <typename In>
-struct ScalarConvert<In, half> {
-  static __host__ __device__ __forceinline__ half to(const In v) {
-#ifdef __CUDA_ARCH__
-    return __float2half((float) v);
-#else
-    return THC_float2half((float) v);
-#endif
-  }
-};
-
-template <>
-struct ScalarConvert<half, half> {
-  static __host__ __device__ __forceinline__ half to(const half v) {
-    return v;
-  }
-};
-
-#endif
-
-
-typedef int IDXTYPE;
diff --git a/finetune_classifier.py b/finetune_classifier.py
index dea4dfe..86d10c7 100755
--- a/finetune_classifier.py
+++ b/finetune_classifier.py
@@ -16,7 +16,7 @@ from logreg_utils import train_logreg
 from fp16 import FP16_Module, FP16_Optimizer
 
-from apex.reparameterization import apply_weight_norm, remove_weight_norm
+from reparameterization import apply_weight_norm, remove_weight_norm
 
 import model as M
 from tqdm import tqdm
diff --git a/apex_utils/apex/RNN/RNNBackend.py b/model/RNN_utils/RNN/RNNBackend.py
similarity index 100%
rename from apex_utils/apex/RNN/RNNBackend.py
rename to model/RNN_utils/RNN/RNNBackend.py
diff --git a/apex_utils/apex/RNN/__init__.py b/model/RNN_utils/RNN/__init__.py
similarity index 100%
rename from apex_utils/apex/RNN/__init__.py
rename to model/RNN_utils/RNN/__init__.py
diff --git a/apex_utils/apex/RNN/cells.py b/model/RNN_utils/RNN/cells.py
similarity index 100%
rename from apex_utils/apex/RNN/cells.py
rename to model/RNN_utils/RNN/cells.py
diff --git a/apex_utils/apex/RNN/models.py b/model/RNN_utils/RNN/models.py
similarity index 100%
rename from apex_utils/apex/RNN/models.py
rename to model/RNN_utils/RNN/models.py
diff --git a/model/model.py b/model/model.py
index abeff81..e92eb06 100644
--- a/model/model.py
+++ b/model/model.py
@@ -5,7 +5,7 @@ from torch.autograd import Variable
 import torch.nn.functional as F
 
-from apex import RNN
+from .RNN_utils import RNN
 from .transformer_utils import Embedding
 from .transformer import TransformerDecoder
diff --git a/model/sentiment_classifier.py b/model/sentiment_classifier.py
index 1d78c2b..3ff60e2 100644
--- a/model/sentiment_classifier.py
+++ b/model/sentiment_classifier.py
@@ -4,7 +4,7 @@ import numpy as np
 from itertools import chain
 
-from model import RNNFeaturizer, TransformerFeaturizer
+from .model import RNNFeaturizer, TransformerFeaturizer
 from .transformer_utils import GeLU
 
 class BinaryClassifier(nn.Module):
diff --git a/pretrain.py b/pretrain.py
index 5adfdc7..66af886 100755
--- a/pretrain.py
+++ b/pretrain.py
@@ -17,7 +17,7 @@ import model as m
 from model import DistributedDataParallel as DDP
 
-from apex.reparameterization import apply_weight_norm, remove_weight_norm
+from reparameterization import apply_weight_norm, remove_weight_norm
 from configure_data import configure_data
 from learning_rates import AnnealingLR, WarmupLR, SlantedTriangularLR
 from arguments import add_general_args, add_model_args, add_unsupervised_data_args
diff --git a/apex_utils/apex/reparameterization/__init__.py b/reparameterization/__init__.py
similarity index 100%
rename from apex_utils/apex/reparameterization/__init__.py
rename to reparameterization/__init__.py
diff --git a/apex_utils/apex/reparameterization/reparameterization.py b/reparameterization/reparameterization.py
similarity index 100%
rename from apex_utils/apex/reparameterization/reparameterization.py
rename to reparameterization/reparameterization.py
diff --git a/apex_utils/apex/reparameterization/weight_norm.py b/reparameterization/weight_norm.py
similarity index 100%
rename from apex_utils/apex/reparameterization/weight_norm.py
rename to reparameterization/weight_norm.py
diff --git a/run_classifier.py b/run_classifier.py
index 7d9a84d..a2433d7 100755
--- a/run_classifier.py
+++ b/run_classifier.py
@@ -13,7 +13,7 @@ import numpy as np
 import pandas as pd
 
-from apex.reparameterization import apply_weight_norm, remove_weight_norm
+from reparameterization import apply_weight_norm, remove_weight_norm
 from model import SentimentClassifier
 from configure_data import configure_data
diff --git a/setup.py b/setup.py
index 2745298..8bc4f68 100644
--- a/setup.py
+++ b/setup.py
@@ -2,39 +2,20 @@ from setuptools import setup, find_packages
 import torch
 
-curdir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
-                      'apex_utils')
-os.chdir(curdir)
-
-if not torch.cuda.is_available():
-    print("Warning: Torch did not find available GPUs on this system.\n",
-          "If your intention is to cross-compile, this is not an error.")
-
 print("torch.__version__  = ", torch.__version__)
 TORCH_MAJOR = int(torch.__version__.split('.')[0])
 TORCH_MINOR = int(torch.__version__.split('.')[1])
 
 if TORCH_MAJOR == 0 and TORCH_MINOR < 4:
-    raise RuntimeError("APEx requires Pytorch 0.4 or newer.\n" +
+    raise RuntimeError("Sentiment Discovery requires Pytorch 0.4 or newer.\n" +
                        "The latest stable release can be obtained from https://pytorch.org/")
 
 print("Building module.")
 
 setup(
-    name='apex', version='0.1',
+    name='sentiment_discovery', version='0.4',
     # ext_modules=[cuda_ext,],
     description='PyTorch Extensions written by NVIDIA',
-    packages=find_packages(where='.',
-                           exclude=(
-                               "build",
-                               "csrc",
-                               "include",
-                               "tests",
-                               "dist",
-                               "docs",
-                               "tests",
-                               "examples",
-                               "apex.egg-info",
-                           )),
+    packages=find_packages(where='.'),
     install_requires=[
         "numpy",
         "pandas",
diff --git a/transfer.py b/transfer.py
index 37c73dc..2a64adb 100644
--- a/transfer.py
+++ b/transfer.py
@@ -18,7 +18,7 @@ from logreg_utils import train_logreg
 from fp16 import FP16_Module, FP16_Optimizer
 
-from apex.reparameterization import apply_weight_norm, remove_weight_norm
+from reparameterization import apply_weight_norm, remove_weight_norm
 from model import RNNFeaturizer, TransformerFeaturizer
 from configure_data import configure_data
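-- 
Two illustrative notes on the result of this patch (an addendum, not part of the
diff; `fused_norm_reference` is a hypothetical name, and the snippets assume the
repository root is on sys.path):

    # The vendored modules now replace the old apex imports:
    from reparameterization import apply_weight_norm, remove_weight_norm
    from model.RNN_utils import RNN

    # The deleted apex._C extension computed an fp32 L2 norm over each
    # slowest-dim row. A minimal pure-PyTorch sketch of that forward pass:
    import torch

    def fused_norm_reference(input):
        # Flatten everything after the slowest dim and norm each row in fp32.
        rows = input.contiguous().view(input.size(0), -1).float()
        norms = rows.norm(2, 1).view((input.size(0),) + (1,) * (input.dim() - 1))
        # The output keeps the input's precision, as the deleted kernel did.
        return (input.float() / norms).type_as(input), norms

With the CUDA extension gone, setup.py keeps only the Pytorch version check:
no GPU probing happens at install time and nothing is compiled, so the package
installs on machines without CUDA.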