Skip to content

Commit

Permalink
Merge pull request #61 from datosgcba/fix/float-integrity
Browse files Browse the repository at this point in the history
Conservar integridad de floats
  • Loading branch information
abenassi authored Feb 1, 2019
2 parents 8128560 + 399f1d7 commit 29d6403
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 2 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,4 @@ target/

*.md~
tests/output
.idea
2 changes: 1 addition & 1 deletion data_cleaner/data_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def __init__(self, input_path, ignore_dups=False, **kwargs):

# lee el CSV a limpiar
elif input_path.endswith('.csv'):
self.df = pd.read_csv(input_path, **default_args)
self.df = pd.read_csv(input_path, dtype=str, **default_args)

# lee el XLSX a limpiar
elif input_path.endswith('.xlsx'):
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
chardet==3.0.4
pandas>=0.20.3
pandas==0.20.3
geopandas==0.2.1
parsley>=1.2
arrow>=0.7.0
Expand Down
2 changes: 2 additions & 0 deletions tests/input/to_clean_coordinates.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
lat,long
-34.6091725340289,-58.3842972567038
20 changes: 20 additions & 0 deletions tests/test_data_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from data_cleaner import DataCleaner
from data_cleaner.data_cleaner import DuplicatedField
from rules.integration import rules
import csv

BASE_DIR = os.path.dirname(__file__)
VCR = vcr.VCR(path_transformer=vcr.VCR.ensure_suffix('.yaml'),
Expand Down Expand Up @@ -61,6 +62,11 @@ def nan_to_empty_string_list(iterable):
return [i if pd.notnull(i) else "" for i in iterable]


def raw_csv(file_path):
    """Return the rows of a CSV file as lists of unparsed strings.

    Every cell is returned exactly as written in the file (no numeric
    conversion), so two files can be compared value by value.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list with one list of cell strings per row; empty for an
        empty file.
    """
    import sys

    if sys.version_info[0] >= 3:
        # Python 3's csv module needs a text-mode file; newline='' leaves
        # line-ending handling to the csv reader, as its docs recommend.
        csvfile = open(file_path, newline='')
    else:
        # Python 2's csv module expects the file opened in binary mode.
        csvfile = open(file_path, 'rb')

    with csvfile:
        return list(csv.reader(csvfile))


# @unittest.skip("skip")
class DataCleanerIntegrationTestCase(unittest.TestCase):
"""Testea el funcionamiento integral del paquete."""
Expand Down Expand Up @@ -545,5 +551,19 @@ def test_get_api_response(self):
self.assertEqual(res_test, res)


class DataCleanerDataIntegrityTestCase(unittest.TestCase):
    """Check that dataset values survive a cleaning pass unchanged."""

    input_path = BASE_DIR + '/input/to_clean_coordinates.csv'

    def test_float_integrity(self):
        """Cleaning with an empty rule list must reproduce the input rows."""
        cleaned_path = BASE_DIR + '/output/clean_coordinates.csv'

        # An empty rule list is passed, so no cleaning rule is applied.
        cleaner = DataCleaner(self.input_path)
        cleaner.clean_file([], cleaned_path)

        # Compare the unparsed cell strings of both files so that any change
        # in how the float values are written shows up as a mismatch.
        self.assertEqual(raw_csv(self.input_path), raw_csv(cleaned_path))

# When executed directly as a script, run this module's tests through nose.
if __name__ == '__main__':
    nose.run(defaultTest=__name__)

0 comments on commit 29d6403

Please sign in to comment.