-
Notifications
You must be signed in to change notification settings - Fork 3
/
demo_creation.py
352 lines (309 loc) · 14.4 KB
/
demo_creation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
"""
python demo_creation.py
-u <username>
-p $HF_PASSWORD
-n <namespace>
-b <playbook-id>
demonstration of creating a workspace and intents incrementally using APIs
Set HF_USERNAME and HF_PASSWORD as environment variables
"""
# *********************************************************************************************************************
# standard imports
import json
import os
import time
# third party imports
import click
import nltk
import humanfirst
@click.command()
@click.option('-u', '--username', type=str, default='',
              help='HumanFirst username if not setting HF_USERNAME environment variable')
@click.option('-p', '--password', type=str, default='',
              help='HumanFirst password if not setting HF_PASSWORD environment variable')
@click.option('-n', '--namespace', type=str, required=True, help='HumanFirst namespace')
@click.option('-w', '--wait_to_train', type=int, default=10, help='How long for NLU to train')
@click.option('-m', '--min_match_score', type=float, default=0.4, help='Minimum threshold to consider a match')
@click.option('-v', '--verbose', is_flag=True, default=False, help='Increase logging level')
@click.option('-d', '--dummy', is_flag=True, default=False, help='Dummy run - don\'t create humanfirst objects')
@click.option('-b', '--playbook', type=str, default='', help='If present will skip creation and try and load into this')
@click.option('-o', '--output_directory', type=str, default="./data", help='Where to write files')
def main(username: str,
         password: str,
         namespace: str,
         wait_to_train: int,
         min_match_score: float,
         verbose: bool = False,
         dummy: bool = False,
         playbook: str = '',
         output_directory: str = ''):
    """Demonstrate creating a HumanFirst workspace and intents incrementally using the APIs."""
    # NOTE: click uses the docstring above as the command's --help text, so it is kept to one line.
    #
    # Parameters (all supplied by click from the command line options declared above):
    #   username / password: credentials passed straight to humanfirst.apis.HFAPI.
    #   namespace:           HumanFirst namespace the playbook lives in.
    #   wait_to_train:       seconds to sleep after triggering NLU training.
    #   min_match_score:     a predicted intent name is only printed at or above this score.
    #   verbose:             only prints a notice in this script.
    #   dummy:               build and write the workspace JSON but make no API calls.
    #   playbook:            existing playbook id to load into; when empty a new playbook is created.
    #   output_directory:    where bootstrap_training.json is written (created if missing).
    if verbose:
        print('Verbose mode on')
    if dummy:
        print('This is a dummy run no objects will be created')

    # here are some examples that the user has inputted
    user_bootstrap_examples = get_user_examples()

    # create a workspace structure for the intents and examples
    workspace = get_workspace_skeleton(name="demo_creation_2", description="Should be showing new name")

    # if you have any tags, they would go here so that intents and examples can reference them
    # otherwise start by creating the intents
    workspace["intents"] = create_hf_intents(user_bootstrap_examples)

    # then create the examples referencing the intents
    workspace["examples"] = create_hf_examples(user_bootstrap_examples)

    # at this point this should validate using vscode or another json schema validating IDE/programme/website
    # you can also test pushing it directly out into a json and uploading to HF
    # must be UTF-8
    if output_directory:
        # the default ./data may not exist yet; open() would otherwise fail with FileNotFoundError
        os.makedirs(output_directory, exist_ok=True)
    workspace_file_uri = os.path.join(output_directory, "bootstrap_training.json")
    with open(workspace_file_uri, mode="w", encoding="utf8") as bootstrap_file:
        json.dump(workspace, fp=bootstrap_file, indent=2)
    print(f'Created example workspace to upload at: {workspace_file_uri}')

    if not dummy:
        # authorisation
        print('Authorising')
        hf_api = humanfirst.apis.HFAPI(username=username, password=password)

        if playbook == '':
            # create the workspace/playbook
            # the API response is kept in its own variable so the CLI string `playbook` is not
            # rebound to a dict; the response is the object carrying the ids that were created.
            created_playbook = hf_api.post_playbook(namespace, "name not yet working")
            playbook_id = created_playbook["metastorePlaybook"]["id"]
            print(f'Created playbook: {playbook_id}')
        else:
            playbook_id = playbook
            print(f'Using passed playbook: {playbook_id}')

        # update the workspace with the training
        print('Importing workspace into playbook:')
        print(hf_api.import_intents(namespace, playbook_id, workspace_as_dict=workspace))

        # get the NLU engines for the workspace
        nlu_engines = hf_api.get_nlu_engines(namespace, playbook_id)

        # in this case there is only going to be one (as we haven't created any others)
        # and that is going to be the humanfirst engine, so assume it's in the first position.
        # This second lookup is redundant - the same info is available from the nlu_engines call.
        nlu_engine = hf_api.get_nlu_engine(namespace, playbook_id, nlu_engines[0]["id"])
        print('NLU engine to train:')
        print(nlu_engine)

        # Trigger this - doesn't have a very meaningful response, None, or {} here, but with a code 200
        hf_api.trigger_train_nlu(namespace, playbook_id, nlu_engine["id"])
        print("Triggered training on NLU engine")

    # Get the docs to classify and sentence split them
    docs = sentencize_docs(get_docs())
    print("Generated some test data")

    # wait for model to train - later on example checking NLU ready
    print("Starting wait for model to train")
    if not dummy:
        time.sleep(wait_to_train)
    print("Wait complete")

    if not dummy:
        # for each doc do a batch predict on (or for all docs)
        for doc in docs:
            # print some metadata of the doc
            print(f'\nAnalysis of {doc["type"]} "{doc["filename"]}"')
            print(f'Interviewer: {doc["author"]} interviewing {doc["interviewee"]}')

            # get the predictions from humanfirst
            predictions = hf_api.batchPredict(doc["text"], namespace, playbook_id)

            # loop through them printing out the text sentence with the score;
            # predictions are paired positionally with the sentences that were submitted
            for sentence, match in zip(doc["text"], predictions):
                match_name = ''
                match_confidence = match["matches"][0]["score"]
                # only name the intent when the top match clears the threshold
                if match_confidence >= min_match_score:
                    match_name = match["matches"][0]["name"]
                print(f'{sentence:80} {match_name:>20}:{match_confidence:.2f},')

    # Show changing one thing in workspace
    if not dummy:
        print(json.dumps(workspace, indent=2))
        workspace["examples"][0]["text"] = "I changed only the first example"
        print(hf_api.import_intents(namespace, playbook_id, workspace_as_dict=workspace, clear_intents=True))
def get_user_examples() -> list:
    """Return the hand-written training phrases used to bootstrap the model.

    Each item is a dict with a "label" (the intent name) and its "examples"
    (a list of phrases for that intent).
    Note: need 2+ intents, minimum five examples
    """
    phrases_by_label = {
        "financial_concerns": [
            "I'm concerned about the financial situation",
            "I'm worried about the balance sheet, and the company finances in general",
            "I don't understand where the money has gone",
            "The fiscal situation is difficult, we're in an unstable situation",
            "We're going cash flow negative",
            "I don't think we have enough capital funding",
        ],
        "travel_plans": [
            "I'm booking a holiday",
            "I'm going to flee the country",
            "I need to get away",
            "I've got a ticket on the next flight to the Bahamas",
            "Did you know that Algeria has no extradiction treaty with the UK",
            "I'm going somewhere warm and sunny",
        ],
        "impropriety": [
            "My boss can't keep his hands off me",
            "There was this issue at the staff party",
            "Did you hear about Dave and Janet?",
            "All the donations for Alfred's leaving do when missing",
            "I'm sure someone is pilfering the petty cash",
            "Craig is very handsy",
        ],
        "positivity": [
            "It's brilliant",
            "Things are going so well",
            "Can you believe how great our results are",
            "I love it here",
            "Fantastic",
            "It is great",
        ],
    }
    # dicts preserve insertion order, so the intents come out in the order written above
    return [
        {"label": label, "examples": phrases}
        for label, phrases in phrases_by_label.items()
    ]
def sentencize_docs(docs: list) -> list:
    """Replace each doc's "text" string with a list of its sentences, ready for labelling.

    The docs are modified in place and the same list is returned.
    """
    # look for the punkt resource locally and download it if the lookup fails
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')

    sentence_splitter = nltk.tokenize.PunktSentenceTokenizer()

    for document in docs:
        trimmed_sentences = []
        for sentence in sentence_splitter.tokenize(document["text"]):
            assert isinstance(sentence, str)
            # clean up white space how you want - you just need to be consistent between what you
            # store locally and what you ask humanfirst to annotate as separate sentences.
            trimmed_sentences.append(sentence.strip())
        document["text"] = trimmed_sentences

    return docs
def get_workspace_skeleton(name:str, description: str, color: str = "#ff33da") -> dict:
    """Return an empty workspace dict, spelled out as full JSON for readability/replication outside of python.

    The name, description and color are filled in from the arguments; the
    "examples", "tags", "intents" and "entities" collections start as empty lists.
    """
    skeleton = {
        "$schema": "https://docs.humanfirst.ai/hf-json-schema.json",
        "name": name,
        "description": description,
        "color": color,
    }
    # each collection gets its own fresh list for the caller to populate
    for collection in ("examples", "tags", "intents", "entities"):
        skeleton[collection] = []
    return skeleton
def create_hf_intents(user_bootstrap_examples: list) -> list:
    """Create one HF intent dict per entry of user_bootstrap_examples.

    Args:
        user_bootstrap_examples: list of dicts, each with a "label" key used as the intent name.

    Returns:
        A list of intent dicts with the keys "id", "name", "metadata" and "tags".
        Every intent gets its own "metadata" dict and "tags" list, so adding a tag
        or metadata to one intent does not leak into the others.
    """
    intents = []
    for i, input_example in enumerate(user_bootstrap_examples):
        label = input_example["label"]
        # Built as a fresh literal per intent: shallow-copying a shared skeleton would make
        # all intents share the same nested "metadata" and "tags" objects.
        hf_intent = {
            # id is what actually links examples to intents
            # build any id you want, it must not overlap with others
            "id": f'intent-id-{i}-{label}',
            "name": label,   # duplicates are allowed
            "metadata": {},  # single level key value pairs all data as strings
            "tags": [],      # a list of HF tag objects
            # "parent_intent_id" (optional link to a parent id) is deliberately left out for
            # json validation as it is not required here. If you want a hierarchy you must
            # create the parents first, then the children.
        }
        # add tags and metadata at the intent level as you wish.
        intents.append(hf_intent)
    return intents
def create_hf_examples(user_bootstrap_examples: list) -> list:
    """Create one HF example dict per training phrase in user_bootstrap_examples.

    Args:
        user_bootstrap_examples: list of dicts, each with a "label" and a list of
            phrases under "examples".

    Returns:
        A flat list of example dicts with the keys "id", "text", "context", "intents",
        "tags" and "metadata". Every example gets its own nested containers, so adding
        a tag or metadata to one example does not leak into the others.
    """
    examples = []
    for i, input_example in enumerate(user_bootstrap_examples):
        label = input_example["label"]
        for j, text in enumerate(input_example["examples"]):
            # Built as a fresh literal per example: shallow-copying a shared skeleton would make
            # all examples share the same nested "tags" and "metadata" objects.
            hf_example = {
                # build any id you want, it must not overlap with others
                "id": f'example-id-{i}-{label}-{j}',
                "text": text,  # the text of the example
                # HFContext, optional - defines what document type the example came from.
                # no context here as these are single examples, not necessarily connected to a doc.
                "context": {},
                # we want to link it to the id of the intent to train
                # it is a list but generally only 1 intent should be present depending on your nlu.
                # we provide python sdk which indexes the ids in objects for you.
                # here we are linking just by using the same format of id as we used in create_hf_intents
                "intents": [
                    {
                        'intent_id': f'intent-id-{i}-{label}'
                    }
                ],
                "tags": [],     # utterance level tags
                "metadata": {}
            }
            # add tags and metadata as wish
            examples.append(hf_example)
    return examples
def get_tag_skeleton() -> dict:
    """Return a new, blank tag skeleton.

    Keys, all initialised to the empty string:
      id    - unique id for tag
      name  - name of tag that will be displayed in HF studio
      color - str, optional a hex code starting with # for a color to display
              the tag in eg #ff33da (a bright pink)
    """
    # a new dict is built on every call, so callers can fill it in freely
    return dict.fromkeys(("id", "name", "color"), "")
def get_context_skeleton() -> dict:
    """Return a new skeleton for a context object linking examples together into a document.

    Keys:
      context_id - unique id for context object - i.e the document id or similar to link
      type       - always "conversation" for now if present. New document types coming
      role       - client or expert
    """
    # a new dict is built on every call, so callers can fill it in freely
    return {"context_id": "", "type": "conversation", "role": ""}
def get_docs() -> list:
    """Return some example interview transcripts to annotate.

    Each doc is a dict with "filename", "author", "interviewee", "type" and "text".
    The text is a single string: one sentence per line, with a leading and a
    trailing newline.
    """
    interviewer = "Gary McGibbons (Intern)"

    def build_transcript(filename: str, interviewee: str, lines: list) -> dict:
        """Assemble one transcript doc from its spoken lines"""
        return {
            "filename": filename,
            "author": interviewer,
            "interviewee": interviewee,
            "type": "transcript",
            "text": "\n" + "\n".join(lines) + "\n",
        }

    return [
        build_transcript(
            "CMO-2023-09-18 18:04:00",
            "Sarah Tribbins (CMO)",
            [
                "This was a very boring year, nothing happened.",
                "I become some bored and concerned that I started sleeping in meetings.",
                "I can't imagine anything interesting happened at all.",
                "Apart from the CEO's affair with his secretary.",
                "And us loosing $50k at Aintree on the company away day.",
                "Other than that very dull.",
            ],
        ),
        build_transcript(
            "CFO-2023-10-19 10:45:04",
            "Harvey Happenstance (CFO)",
            [
                "I'm really worried about the finances since the disaster at the company away today.",
                "People's conduct was highly in appropriate.",
                "I don't know what we're going to do to stabilise the balance sheet.",
                "I'm on the express train to Bristol in the morning.",
                "I'm going to try and find a friendly banker.",
            ],
        ),
        build_transcript(
            "CFO-2023-09-19 12:04:00",
            "Lisa Lords (CEO)",
            [
                "Everything is glorious.",
                "Literally nothing could be better.",
                "I'm ecstatic about out new product line, it is so nearly ready for market.",
                "I've seen the books and they are looking great, we'll be in the black soon.",
                "I don't think there are any issues.",
            ],
        ),
    ]
if __name__ == "__main__":
    # click parses the command line and supplies main's arguments, hence the pylint suppression
    main()  # pylint: disable=no-value-for-parameter