Merge pull request #61 from datosgcba/fix/float-integrity

Conservar integridad de floats
datosgobar · Feb 1, 2019 · 29d6403
2 parents 8128560 + 399f1d7
commit 29d6403
Show file tree

Hide file tree

Showing 5 changed files with 25 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -69,3 +69,4 @@ target/
 
 *.md~
 tests/output
+.idea
diff --git a/data_cleaner/data_cleaner.py b/data_cleaner/data_cleaner.py
@@ -93,7 +93,7 @@ def __init__(self, input_path, ignore_dups=False, **kwargs):
 
         # lee el CSV a limpiar
         elif input_path.endswith('.csv'):
-            self.df = pd.read_csv(input_path, **default_args)
+            self.df = pd.read_csv(input_path, dtype=str, **default_args)
 
         # lee el XLSX a limpiar
         elif input_path.endswith('.xlsx'):

diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
 chardet==3.0.4
-pandas>=0.20.3
+pandas==0.20.3
 geopandas==0.2.1
 parsley>=1.2
 arrow>=0.7.0

diff --git a/tests/input/to_clean_coordinates.csv b/tests/input/to_clean_coordinates.csv
@@ -0,0 +1,2 @@
+lat,long
+-34.6091725340289,-58.3842972567038
diff --git a/tests/test_data_cleaner.py b/tests/test_data_cleaner.py
@@ -19,6 +19,7 @@
 from data_cleaner import DataCleaner
 from data_cleaner.data_cleaner import DuplicatedField
 from rules.integration import rules
+import csv
 
 BASE_DIR = os.path.dirname(__file__)
 VCR = vcr.VCR(path_transformer=vcr.VCR.ensure_suffix('.yaml'),
@@ -61,6 +62,11 @@ def nan_to_empty_string_list(iterable):
     return [i if pd.notnull(i) else "" for i in iterable]
 
 
+def raw_csv(file_path):
+    with open(file_path, 'rb') as csvfile:
+        return [row for row in csv.reader(csvfile)]
+
+
 # @unittest.skip("skip")
 class DataCleanerIntegrationTestCase(unittest.TestCase):
     """Testea el funcionamiento integral del paquete."""
@@ -545,5 +551,19 @@ def test_get_api_response(self):
         self.assertEqual(res_test, res)
 
 
+class DataCleanerDataIntegrityTestCase(unittest.TestCase):
+    """Testea que la integridad de los datasets se mantenga."""
+    input_path = BASE_DIR + '/input/to_clean_coordinates.csv'
+
+    def test_float_integrity(self):
+        output_path = BASE_DIR + '/output/clean_coordinates.csv'
+
+        dc = DataCleaner(self.input_path)
+        dc.clean_file([], output_path)
+
+        raw_input = raw_csv(self.input_path)
+        raw_output = raw_csv(output_path)
+        self.assertEqual(raw_input, raw_output)
+
 if __name__ == '__main__':
     nose.run(defaultTest=__name__)