diff --git a/README.md b/README.md index ec39e29..f004d04 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Around 10% faster than solution for single file and a lot faster for multiple fi from scipy.stats import entropy import numpy as np -def entropy1(labels, base=None): +def entropy(labels, base=None): labels = np.frombuffer(labels, dtype=np.uint8) value,counts = np.unique(labels, return_counts=True) @@ -27,6 +27,11 @@ It took 10522.091444253922 seconds to complete the processing of all 200k malwar The malware was stored on network attached storage, which has greatly impacted the I/O performance. +## Testing + +Currently, tests cannot be performed on GitHub Actions as there is no Nvidia GPU available. +If it becomes possible, I will create a self-hosted runner in the future. + ## Remarks Code is not optimized and cleaned yet. \ No newline at end of file diff --git a/entro.py b/entro.py index 26fefff..14cd120 100644 --- a/entro.py +++ b/entro.py @@ -15,8 +15,6 @@ def calculate_histogram(data, hist_out): tx = cuda.threadIdx.x local_hist[tx] = 0 - cuda.syncthreads() - idx = cuda.grid(1) stride = cuda.gridsize(1) @@ -30,16 +28,19 @@ def calculate_histogram(data, hist_out): @cuda.jit -def calculate_entropy(hist, total_pixels, entropy_out): +def calculate_entropy(hist, data_size, entropy_out): idx = cuda.grid(1) stride = cuda.gridsize(1) for i in range(idx, hist.shape[0], stride): - prob = hist[i] / total_pixels + prob = hist[i] / data_size if prob != 0: entropy_out[i] = -prob * math.log2(prob) else: + # Some small, unimportant number entropy_out[i] = -0.000001 * math.log2(0.000001) + +# TODO: implement it properly. 
@cuda.jit def sum_array(arr, result): local_mem = cuda.shared.array(256, dtype=np.float32) @@ -65,24 +66,21 @@ def sum_array(arr, result): cuda.syncthreads() -def entropy_with_cuda(data): - total_pixels = len(data) +def entropy_with_cuda(data): + + data_size = len(data) data_gpu = cuda.to_device(np.frombuffer(data, dtype=np.uint8)) - - cuda.synchronize() - hist_host = np.zeros(256, dtype=np.uint32) - #hist_out = cuda.device_array(256, dtype=np.uint32) - # Initialize histogram array to zeros - #cuda.device_array_like(hist_out, fill_value=0) + #Initialize hist with 0. For some reason numba.cuda.device_array didn't work as expected. + hist_host = np.zeros(256, dtype=np.uint32) hist_out = cuda.to_device(hist_host) - cuda.synchronize() + threadsperblock_hist = 256 blockspergrid_hist = min((len(data) + (threadsperblock_hist - 1)) // threadsperblock_hist, 1024) calculate_histogram[blockspergrid_hist, threadsperblock_hist](data_gpu, hist_out) - + del data_gpu cuda.synchronize() @@ -91,31 +89,15 @@ def entropy_with_cuda(data): threadsperblock_entropy = 256 blockspergrid_entropy = min((hist_out.size + (threadsperblock_entropy - 1)) // threadsperblock_entropy, 1024) - calculate_entropy[blockspergrid_entropy, threadsperblock_entropy](hist_out, total_pixels, entropy_out_gpu) + calculate_entropy[blockspergrid_entropy, threadsperblock_entropy](hist_out, data_size, entropy_out_gpu) cuda.synchronize() del hist_out - result = cuda.device_array(blockspergrid_entropy, dtype=np.float32) - - cuda.synchronize() - - sum_array[blockspergrid_entropy, threadsperblock_entropy](entropy_out_gpu, result) - - - cuda.synchronize() - del entropy_out_gpu - - - entropy_sum = result.copy_to_host() - - del result + local_entropies = entropy_out_gpu.copy_to_host() - - cuda.synchronize() - #todo: remove sum() make it parrarel - return entropy_sum.sum() + return local_entropies.sum() def is_supported_cuda(): return cuda.is_available() and cuda.detect() \ No newline at end of file diff --git a/test_entro.py 
b/test_entro.py index 25ede29..d906153 100644 --- a/test_entro.py +++ b/test_entro.py @@ -4,9 +4,10 @@ import math from scipy.stats import entropy from numba.core.errors import NumbaPerformanceWarning +import time # Functions to test -from entro import calculate_histogram, calculate_entropy +from entro import calculate_histogram, calculate_entropy, entropy_with_cuda class TestCalculateHistogram(unittest.TestCase): @@ -91,5 +92,39 @@ def test_random_histograms(self): del entropy_out_gpu +class TestEntropyWithCUDA(unittest.TestCase): + def test_entropy_calculation(self): + # Test case for correctness of entropy calculation + for i in range(1, 100): + data = np.random.randint(0, 256, size=1000, dtype=np.uint8) + entropy_expected = self.calculate_entropy_numpy(data) + entropy_actual = entropy_with_cuda(data) + np.testing.assert_almost_equal(entropy_actual, entropy_expected, decimal=3) + + def test_performance(self): + # Test case for performance + for i in range(0, 100): + data = np.random.randint(0, 256, size=10**6, dtype=np.uint8) + # Measure time for CUDA entropy calculation + start_time = time.time() + entropy_actual = entropy_with_cuda(data) + cuda_time = time.time() - start_time + + # Measure time for NumPy entropy calculation + start_time = time.time() + entropy_expected = self.calculate_entropy_numpy(data) + numpy_time = time.time() - start_time + + # CUDA needs to be at least twice as fast + np.testing.assert_almost_equal(entropy_actual, entropy_expected, decimal=3) + self.assertTrue(cuda_time*2 < numpy_time) + + def calculate_entropy_numpy(self, data): + hist, _ = np.histogram(data, bins=256, range=[0, 256]) + hist = hist / len(data) + entropy = -np.sum(hist * np.log2(hist + (hist == 0))) + return entropy + + if __name__ == '__main__': unittest.main() \ No newline at end of file