Merge pull request #51 from snipsco/release/0.57.0
Release 0.57.0
adrienball authored Jun 8, 2018
2 parents f80525d + 076faa8 commit 155b813
Showing 22 changed files with 1,568 additions and 1,514 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,14 @@
# Changelog
All notable changes to this project will be documented in this file.

## [0.57.0] - 2018-06-08
### Changed
- Improve matching of synonyms
- Improve caching strategy for builtin entity parsing
- Improve intent classification
- Bump model version to `0.15.0`
- Bump `snips-nlu-ontology` to `0.57.0`

## [0.56.1] - 2018-05-18
### Changed
- Improve calibration of intent classification probabilities
@@ -45,6 +53,7 @@ All notable changes to this project will be documented in this file.
- Rename python package to `snips_nlu_rust`


[0.57.0]: https://github.com/snipsco/snips-nlu-rs/compare/0.56.1...0.57.0
[0.56.1]: https://github.com/snipsco/snips-nlu-rs/compare/0.56.0...0.56.1
[0.56.0]: https://github.com/snipsco/snips-nlu-rs/compare/0.55.2...0.56.0
[0.55.2]: https://github.com/snipsco/snips-nlu-rs/compare/0.55.1...0.55.2
2,827 changes: 1,406 additions & 1,421 deletions data/tests/configurations/trained_assistant.json

Large diffs are not rendered by default.

Binary file modified data/tests/zip_files/sample_config.zip
Binary file not shown.
4 changes: 2 additions & 2 deletions snips-nlu-ffi/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "snips-nlu-ffi"
version = "0.56.1"
version = "0.57.0"
authors = [
"Kevin Lefevre <[email protected]>",
"Thibaut Lorrain <[email protected]>"
@@ -9,7 +9,7 @@ authors = [
[dependencies]
ffi-utils = { git = "https://github.com/snipsco/snips-utils-rs", rev = "b1f4af3" }
snips-nlu-lib = { path = "../snips-nlu-lib" }
snips-nlu-ontology-ffi-macros = { git = "https://github.com/snipsco/snips-nlu-ontology", tag = "0.55.0" }
snips-nlu-ontology-ffi-macros = { git = "https://github.com/snipsco/snips-nlu-ontology", tag = "0.57.0" }
failure = "0.1"
lazy_static = "1.0"
libc = "0.2"
2 changes: 1 addition & 1 deletion snips-nlu-ffi/kotlin/build.gradle
@@ -11,7 +11,7 @@ buildscript {

apply plugin: 'kotlin'

version = "0.56.1"
version = "0.57.0"
group = "ai.snips"

repositories {
4 changes: 2 additions & 2 deletions snips-nlu-ffi/python/snips-nlu-python-ffi/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "snips-nlu-python-ffi"
version = "0.56.1"
version = "0.57.0"
authors = ["Adrien Ball <[email protected]>"]

[lib]
@@ -10,4 +10,4 @@ crate-type = ["cdylib"]
[dependencies]
libc = "0.2"
ffi-utils = { git = "https://github.com/snipsco/snips-utils-rs", rev = "b1f4af3" }
snips-nlu-ffi = { git = "https://github.com/snipsco/snips-nlu-rs", tag = "0.56.1" }
snips-nlu-ffi = { git = "https://github.com/snipsco/snips-nlu-rs", tag = "0.57.0" }
2 changes: 1 addition & 1 deletion snips-nlu-ffi/python/snips_nlu_rust/__version__
@@ -1 +1 @@
0.56.1
0.57.0
2 changes: 1 addition & 1 deletion snips-nlu-ffi/swift/SnipsNlu/Dependencies/build.sh
@@ -4,7 +4,7 @@

set -e

VERSION="0.56.1"
VERSION="0.57.0"
SYSTEM=$(echo $1 | tr '[:upper:]' '[:lower:]')
LIBRARY_NAME=libsnips_nlu_ffi
LIBRARY_NAME_A=${LIBRARY_NAME}.a
8 changes: 5 additions & 3 deletions snips-nlu-lib/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "snips-nlu-lib"
version = "0.56.1"
version = "0.57.0"
authors = [
"Thibaut Lorrain <[email protected]>",
"Kevin Lefevre <[email protected]>"
@@ -11,13 +11,15 @@ description = "Rust implementation of Snips NLU"
[dependencies]
snips-nlu-resources-packed = { path = "../snips-nlu-resources-packed" }
crfsuite = { git = "https://github.com/snipsco/crfsuite-rs", rev = "b18d95c" }
snips-nlu-ontology = { git = "https://github.com/snipsco/snips-nlu-ontology", tag = "0.55.0" }
snips-nlu-ontology-parsers = { git = "https://github.com/snipsco/snips-nlu-ontology", tag = "0.55.0" }
snips-nlu-ontology = { git = "https://github.com/snipsco/snips-nlu-ontology", tag = "0.57.0" }
snips-nlu-ontology-parsers = { git = "https://github.com/snipsco/snips-nlu-ontology", tag = "0.57.0" }
snips-nlu-utils = { git = "https://github.com/snipsco/snips-nlu-utils", tag = "0.6.1" }
dinghy-test = "0.3"
failure = "0.1"
base64 = "0.9"
itertools = { version = "0.7", default-features = false }
lazy_static = "1.0"
lru-cache = "0.1.1"
serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
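The new `lru-cache` dependency above backs the entity cache introduced in `builtin_entity_parsing.rs` below. As a minimal, standalone sketch of that crate's capacity-bounded behavior (not code from this PR; `lru_demo` is an illustrative name):

use lru_cache::LruCache;

fn lru_demo() {
    // Capacity-bounded map: the least-recently-used entry is evicted first.
    let mut cache: LruCache<&str, usize> = LruCache::new(2);
    cache.insert("a", 1);
    cache.insert("b", 2);
    // Reading "a" marks it as most recently used.
    assert_eq!(cache.get_mut("a").map(|v| *v), Some(1));
    // Inserting a third entry evicts "b", now the least recently used.
    cache.insert("c", 3);
    assert!(!cache.contains_key("b"));
}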
2 changes: 1 addition & 1 deletion snips-nlu-lib/examples/trained_assistant.json

Large diffs are not rendered by default.

92 changes: 92 additions & 0 deletions snips-nlu-lib/src/builtin_entity_parsing.rs
@@ -0,0 +1,92 @@
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

use lru_cache::LruCache;

use snips_nlu_ontology::{BuiltinEntityKind, BuiltinEntity, Language};
use snips_nlu_ontology_parsers::BuiltinEntityParser;


pub struct CachingBuiltinEntityParser {
parser: BuiltinEntityParser,
cache: Mutex<EntityCache>,
}

impl CachingBuiltinEntityParser {
pub fn new(lang: Language, cache_capacity: usize) -> Self {
CachingBuiltinEntityParser {
parser: BuiltinEntityParser::new(lang),
cache: Mutex::new(EntityCache::new(cache_capacity)),
}
}

pub fn extract_entities(
&self,
sentence: &str,
filter_entity_kinds: Option<&[BuiltinEntityKind]>,
use_cache: bool,
) -> Vec<BuiltinEntity> {
let lowercased_sentence = sentence.to_lowercase();
if !use_cache {
return self.parser.extract_entities(&lowercased_sentence, filter_entity_kinds);
}
let cache_key = CacheKey {
input: lowercased_sentence,
kinds: filter_entity_kinds
.map(|entity_kinds| entity_kinds.to_vec())
.unwrap_or_else(|| vec![]),
};

self.cache
.lock()
.unwrap()
.cache(&cache_key,
|cache_key| self.parser.extract_entities(&cache_key.input, filter_entity_kinds))
}
}

struct EntityCache(LruCache<CacheKey, Vec<BuiltinEntity>>);

impl EntityCache {
fn new(capacity: usize) -> Self {
EntityCache(LruCache::new(capacity))
}

fn cache<F: Fn(&CacheKey) -> Vec<BuiltinEntity>>(
&mut self,
key: &CacheKey,
producer: F,
) -> Vec<BuiltinEntity> {
let cached_value = self.0.get_mut(key).map(|a| a.clone());
if let Some(value) = cached_value {
return value;
}
let value = producer(key);
self.0.insert(key.clone(), value.clone());
value
}
}

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct CacheKey {
input: String,
kinds: Vec<BuiltinEntityKind>,
}

pub struct BuiltinEntityParserFactory;

impl BuiltinEntityParserFactory {
pub fn get(lang: Language) -> Arc<CachingBuiltinEntityParser> {
lazy_static! {
static ref CACHED_PARSERS: Mutex<HashMap<Language, Arc<CachingBuiltinEntityParser>>> =
Mutex::new(HashMap::new());
}

CACHED_PARSERS
.lock()
.unwrap()
.entry(lang)
.or_insert_with(|| Arc::new(CachingBuiltinEntityParser::new(lang, 1000)))
.clone()
}
}
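A minimal usage sketch of the factory and caching parser defined above (the `demo` function name is illustrative; `Language` is the `snips_nlu_ontology` enum, and the `extract_entities` signature is the one in this file):

use snips_nlu_ontology::Language;

fn demo() {
    // One parser per language, shared process-wide behind an Arc.
    let parser = BuiltinEntityParserFactory::get(Language::EN);

    // The first call parses the sentence and stores the result in the LRU
    // cache, keyed by the lowercased input plus the entity-kind filter.
    let first = parser.extract_entities("Meet me tomorrow at 8pm", None, true);

    // An identical call is then answered from the cache without re-parsing.
    let second = parser.extract_entities("Meet me tomorrow at 8pm", None, true);
    assert_eq!(first.len(), second.len());

    // Passing use_cache = false bypasses the cache entirely.
    let _uncached = parser.extract_entities("Meet me tomorrow at 8pm", None, false);
}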
51 changes: 7 additions & 44 deletions snips-nlu-lib/src/intent_classifier/featurizer.rs
@@ -1,21 +1,20 @@
use std::collections::{HashMap, HashSet};
use std::ops::Range;
use std::str::FromStr;
use std::sync::Arc;

use itertools::Itertools;
use ndarray::prelude::*;

use builtin_entity_parsing::{BuiltinEntityParserFactory, CachingBuiltinEntityParser};
use configurations::FeaturizerConfiguration;
use errors::*;
use language::FromLanguage;
use nlu_utils::language::Language as NluUtilsLanguage;
use nlu_utils::string::{normalize, substring_with_char_range};
use nlu_utils::string::normalize;
use nlu_utils::token::{compute_all_ngrams, tokenize_light};
use resources::stemmer::{StaticMapStemmer, Stemmer};
use resources::word_clusterer::{StaticMapWordClusterer, WordClusterer};
use snips_nlu_ontology::{BuiltinEntityKind, Language};
use snips_nlu_ontology_parsers::BuiltinEntityParser;

pub struct Featurizer {
best_features: Vec<usize>,
@@ -25,7 +24,7 @@ pub struct Featurizer {
word_clusterer: Option<StaticMapWordClusterer>,
stemmer: Option<StaticMapStemmer>,
entity_utterances_to_feature_names: HashMap<String, Vec<String>>,
builtin_entity_parser: Arc<BuiltinEntityParser>,
builtin_entity_parser: Arc<CachingBuiltinEntityParser>,
language: Language,
}

@@ -35,7 +34,7 @@ impl Featurizer {
let vocabulary = config.tfidf_vectorizer.vocab;
let language = Language::from_str(&*config.language_code)?;
let idf_diag = config.tfidf_vectorizer.idf_diag;
let builtin_entity_parser = BuiltinEntityParser::get(language);
let builtin_entity_parser = BuiltinEntityParserFactory::get(language);
let word_clusterer = config
.config
.word_clusters_name
@@ -100,24 +99,14 @@ impl Featurizer {
&*normalized_stemmed_tokens,
&self.entity_utterances_to_feature_names,
);
let builtin_entities = self.builtin_entity_parser.extract_entities(query, None);
let entities_ranges = builtin_entities
.iter()
.sorted_by_key(|ent| ent.range.start)
.iter()
.map(|ent| ent.range.clone())
.collect();
let builtin_entities = self.builtin_entity_parser.extract_entities(query, None, true);
let builtin_entities_features: Vec<String> = builtin_entities
.iter()
.map(|ent| get_builtin_entity_feature_name(ent.entity_kind, language))
.sorted();
let filtered_query = remove_ranges(query, entities_ranges);
let filtered_query_tokens = tokenize_light(&*filtered_query, language);
let filtered_normalized_stemmed_tokens =
normalize_stem(filtered_query_tokens, self.stemmer);

vec![
filtered_normalized_stemmed_tokens,
normalized_stemmed_tokens,
builtin_entities_features,
entities_features,
word_cluster_features,
@@ -165,23 +154,10 @@ fn normalize_stem<S: Stemmer>(tokens: Vec<String>, opt_stemmer: Option<S>) -> Ve
.unwrap_or_else(|| tokens.iter().map(|t| normalize(t)).collect())
}

fn remove_ranges(text: &str, ranges: Vec<Range<usize>>) -> String {
let mut filtered_text = String::new();
let mut idx = 0;
for range in &ranges {
let substring = substring_with_char_range(text.to_string(), &(idx..range.start));
filtered_text.push_str(&*substring);
idx = range.end;
}
let suffix = substring_with_char_range(text.to_string(), &(idx..text.chars().count()));
filtered_text.push_str(&*suffix);
filtered_text
}

#[cfg(test)]
mod tests {
use super::{get_dataset_entities_features, get_word_cluster_features, normalize_stem,
remove_ranges, Featurizer};
Featurizer};

use configurations::{FeaturizerConfigConfiguration, FeaturizerConfiguration,
TfIdfVectorizerConfiguration};
@@ -327,17 +303,4 @@ mod tests {
];
assert_eq!(entities_features, expected_entities_features)
}

#[test]
fn remove_range_works() {
// Given
let text = "hello world";
let ranges = vec![1..3, 6..9];

// When
let filtered_text = remove_ranges(text, ranges);

// Then
assert_eq!("hlo ld", filtered_text);
}
}
@@ -518,7 +518,7 @@ mod tests {
let ref actual_result = classification_result.unwrap().unwrap();
let expected_result = IntentClassifierResult {
intent_name: "MakeTea".to_string(),
probability: 0.60434574,
probability: 0.6514961,
};

// Then
13 changes: 6 additions & 7 deletions snips-nlu-lib/src/intent_parser/deterministic_intent_parser.rs
@@ -5,6 +5,7 @@ use std::sync::Arc;

use regex::{Regex, RegexBuilder};

use builtin_entity_parsing::{BuiltinEntityParserFactory, CachingBuiltinEntityParser};
use configurations::DeterministicParserConfiguration;
use errors::*;
use intent_parser::{internal_parsing_result, IntentParser, InternalParsingResult};
@@ -15,20 +16,19 @@ use nlu_utils::string::{convert_to_char_range, substring_with_char_range, suffix
use nlu_utils::token::{tokenize, tokenize_light};
use slot_utils::*;
use snips_nlu_ontology::Language;
use snips_nlu_ontology_parsers::BuiltinEntityParser;

pub struct DeterministicIntentParser {
regexes_per_intent: HashMap<String, Vec<Regex>>,
group_names_to_slot_names: HashMap<String, String>,
slot_names_to_entities: HashMap<String, String>,
builtin_entity_parser: Arc<BuiltinEntityParser>,
builtin_entity_parser: Arc<CachingBuiltinEntityParser>,
language: Language,
}

impl DeterministicIntentParser {
pub fn new(configuration: DeterministicParserConfiguration) -> Result<Self> {
let language = Language::from_str(&configuration.language_code)?;
let builtin_entity_parser = BuiltinEntityParser::get(language);
let builtin_entity_parser = BuiltinEntityParserFactory::get(language);
Ok(DeterministicIntentParser {
regexes_per_intent: compile_regexes_per_intent(configuration.patterns)?,
group_names_to_slot_names: configuration.group_names_to_slot_names,
@@ -193,9 +193,9 @@ fn deduplicate_overlapping_slots(

fn replace_builtin_entities(
text: &str,
parser: &BuiltinEntityParser,
parser: &CachingBuiltinEntityParser,
) -> (HashMap<Range<usize>, Range<usize>>, String) {
let builtin_entities = parser.extract_entities(text, None);
let builtin_entities = parser.extract_entities(text, None, true);
if builtin_entities.is_empty() {
return (HashMap::new(), text.to_string());
}
@@ -278,7 +278,6 @@ mod tests {
use configurations::DeterministicParserConfiguration;
use slot_utils::InternalSlot;
use snips_nlu_ontology::{IntentClassifierResult, Language};
use snips_nlu_ontology_parsers::BuiltinEntityParser;
use std::collections::HashMap;
use std::iter::FromIterator;

@@ -520,7 +519,7 @@
fn should_replace_builtin_entities() {
// Given
let text = "Meeting this evening or tomorrow at 11am !";
let parser = BuiltinEntityParser::get(Language::EN);
let parser = BuiltinEntityParserFactory::get(Language::EN);

// When
let (range_mapping, formatted_text) = replace_builtin_entities(text, &*parser);
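What the truncated test above exercises: `replace_builtin_entities` swaps every detected builtin entity for a stable placeholder and returns a mapping from character ranges in the new string back to ranges in the original. A hedged sketch of the expected shape (the exact placeholder spelling, e.g. `%SNIPSDATETIME%`, is an assumption, not taken from this diff):

// Sketch only; output values are illustrative.
let parser = BuiltinEntityParserFactory::get(Language::EN);
let (range_mapping, formatted_text) =
    replace_builtin_entities("Meeting tomorrow at 11am !", &*parser);
// formatted_text would read something like "Meeting %SNIPSDATETIME% !",
// and each key of range_mapping is a char range in formatted_text that
// maps back to the matching char range in the original text.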