"""
python csv_to_entites.py
accept in a UTF8 csv file and a delimiter and converts it to entities
your csv should be of the format
if has a header line -h will remove it
filename without .csv is entity name
col0 = key_value
col1 = onward are synonyms
for example this is the way Dialogflow ES exports an entity
also works with CX exported lists
"""
# *********************************************************************************************************************

# standard imports
import json
import os

# 3rd party imports
import pandas
import click

@click.command()
@click.option('-f', '--filename', type=str, required=True,
              help='Input file path')
@click.option('-d', '--delimiter', type=str, required=False, default=",",
              help='Delimiter for the csv file')
@click.option('-l', '--language', type=str, required=False, default="en",
              help='Language of the entities (default: en)')
@click.option('-h', '--header', is_flag=True, required=False, default=False,
              help='If passed, the first row of the CSV is treated as column names and dropped')
def main(filename: str,
         delimiter: str,
         language: str,
         header: bool
         ) -> None:
"""Main Function"""
# read the input csv with columns 0,1,2 etc
assert filename.endswith(".csv")
df = pandas.read_csv(filename, encoding='utf8',header=None, delimiter=delimiter)
# Drop header if present
if header:
df = df.iloc[1:]
df.reset_index(inplace=True)
# assert
assert df.shape[0] >= 1
assert df.shape[1] >= 2
# values
values = []
# check if column containing keys is having only unique values
assert df[0].is_unique
# iterate through dataframe
for i in range(df.shape[0]):
# work out how many synonyms we have (2nd column (column 1) onward until NaN)
synonyms = []
for j in range(1,df.shape[1],1):
# exit when come to first NaN value
if pandas.isna(df.loc[i,j]):
break
synonym = {
"value": df.loc[i,j]
}
synonyms.append(synonym.copy())
# add a value to the entity with those synonyms
value = {
"id":f'entity-value-{df.loc[i,0]}',
"key_value": df.loc[i,0],
"language": language,
"synonyms": synonyms
}
values.append(value.copy())
# create the entity add the values
entity_name = os.path.basename(filename).replace(".csv","")
entity = {
"id": f'entity-{entity_name}',
"name": entity_name,
"values": values
}
# add all entities to workspace as a list
workspace = {
"$schema": "https://docs.humanfirst.ai/hf-json-schema.json",
"entities": [entity]
}
# write to json
output_filename = filename.replace(".csv",".json")
assert output_filename != filename
print(df)
print(json.dumps(workspace,indent=2))
with open(output_filename,mode="w",encoding="utf8") as output_file:
json.dump(workspace,output_file,indent=2)
print(f"Wrote to: {output_filename}")
if __name__ == '__main__':
main() # pylint: disable=no-value-for-parameter
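
# Example invocation (a sketch; "colours.csv" is the hypothetical file from the
# example near the top of this script, not part of the repo):
#   python csv_to_entities.py -f colours.csv -d "," -l en
# Pass -h as well if the first row of the CSV is a header that should be dropped.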