-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathapp.py
65 lines (51 loc) · 2.1 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import streamlit as st
import spacy
from annotated_text import annotated_text
@st.cache(show_spinner=False, allow_output_mutation=True, suppress_st_warning=True)
def load_models():
    """Load the French and English spaCy pipelines from ``./models``.

    The function is wrapped in ``st.cache`` so the result is reused instead
    of being reloaded on each call.

    Returns:
        dict: Loaded pipelines keyed by language code (``"en"``, ``"fr"``).
    """
    # NOTE(review): newer Streamlit releases deprecate ``st.cache`` in favour
    # of ``st.cache_resource`` -- confirm the pinned version before migrating.
    # The French pipeline is loaded first, then the English one.
    model_paths = (("fr", "./models/fr/"), ("en", "./models/en/"))
    loaded = {code: spacy.load(path) for code, path in model_paths}
    return {"en": loaded["en"], "fr": loaded["fr"]}
def process_text(doc, selected_entities, anonymize=False):
    """Turn a parsed document into arguments for ``annotated_text``.

    Args:
        doc: Iterable of tokens, each exposing ``text`` and ``ent_type_``.
        selected_entities: Collection of the entity groups to highlight;
            the recognised keys are ``"PER"``, ``"LOC"`` and ``"ORG"``.
        anonymize: When true, the text of every highlighted token is
            replaced by a run of ``"X"`` of the same length.

    Returns:
        list: One item per token. Highlighted tokens are
        ``(text, label, colour)`` tuples; every other token is its text
        padded with one space on each side.
    """
    # Raw entity label -> (selection key, display label, background colour).
    # Both "PERSON" and "PER" map to the person group: spaCy's stock French
    # pipelines tag people as "PER", so matching only "PERSON" would leave
    # them undetected when the French model is selected.
    entity_styles = {
        "PERSON": ("PER", "Person", "#faa"),
        "PER": ("PER", "Person", "#faa"),
        "GPE": ("LOC", "Location", "#fda"),
        "LOC": ("LOC", "Location", "#fda"),
        "ORG": ("ORG", "Organization", "#afa"),
    }

    tokens = []
    for token in doc:
        style = entity_styles.get(token.ent_type_)
        if style is not None and style[0] in selected_entities:
            _, label, colour = style
            # Mask with same-length filler so the layout is preserved.
            shown = "X" * len(token.text) if anonymize else token.text
            tokens.append((shown, label, colour))
        else:
            # Plain tokens are padded so they stay separated when rendered.
            tokens.append(" " + token.text + " ")
    return tokens
# Script body: Streamlit executes these statements to build the page.
models = load_models()

# Sidebar controls: language of the pipeline and entity groups to highlight.
selected_language = st.sidebar.selectbox("Select a language", options=["en", "fr"])
selected_entities = st.sidebar.multiselect(
    "Select the entities you want to detect",
    options=["LOC", "PER", "ORG"],
    default=["LOC", "PER", "ORG"],
)
selected_model = models[selected_language]

# Input comes from the text area unless an uploaded file can be decoded.
text_input = st.text_area("Type a text to anonymize")
uploaded_file = st.file_uploader("or Upload a file", type=["doc", "docx", "pdf", "txt"])
if uploaded_file is not None:
    # The uploader accepts binary formats (doc, docx, pdf), but the content
    # is only ever decoded as UTF-8 text. Report an undecodable upload
    # instead of letting the exception crash the app, and keep whatever was
    # typed in the text area.
    try:
        text_input = uploaded_file.getvalue().decode("utf-8")
    except UnicodeDecodeError:
        st.error(
            "The uploaded file could not be read as UTF-8 text; "
            "using the text typed above instead."
        )

anonymize = st.checkbox("Anonymize")

# Always show the highlighted original text.
doc = selected_model(text_input)
tokens = process_text(doc, selected_entities)
annotated_text(*tokens)

# Optionally show a second rendering with the detected entities masked.
if anonymize:
    st.markdown("**Anonymized text**")
    st.markdown("---")
    anonymized_tokens = process_text(doc, selected_entities, anonymize=anonymize)
    annotated_text(*anonymized_tokens)