From bdb00db7be58269f1e87c364a0463cf9f55d933f Mon Sep 17 00:00:00 2001 From: Thomas Queste Date: Tue, 16 Jul 2019 10:43:27 +0200 Subject: [PATCH] doc: fix code block starting at standardnumber Some code blocks at the end were not rendered correctly on GitHub. Fix: all code blocks have been wrapped in AsciiDoc source code blocks. --- README.adoc | 412 ++++++++++++++++++++++++++++------------------------ 1 file changed, 222 insertions(+), 190 deletions(-) diff --git a/README.adoc b/README.adoc index aa2b1282..37a9e645 100644 --- a/README.adoc +++ b/README.adoc @@ -545,6 +545,8 @@ POST /test/docs/_search?explain ## Baseform Try it out + +[source] ---- GET _analyze { @@ -559,31 +561,35 @@ GET _analyze } ---- - { - "index":{ - "analysis":{ - "filter":{ - "baseform":{ - "type" : "baseform", - "language" : "de" - } - }, - "tokenizer" : { - "baseform" : { - "type" : "standard", - "filter" : [ "baseform", "unique" ] - } - } - } - } - } - +[source] +---- +{ + "index":{ + "analysis":{ + "filter":{ + "baseform":{ + "type" : "baseform", + "language" : "de" + } + }, + "tokenizer" : { + "baseform" : { + "type" : "standard", + "filter" : [ "baseform", "unique" ] + } + } + } + } +} +---- ## WordDelimiterFilter2 Try it out + +[source] ---- GET _analyze { @@ -597,23 +603,26 @@ GET _analyze } ---- - { - "index":{ - "analysis":{ - "filter" : { - "wd" : { - "type" : "worddelimiter2", - "generate_word_parts" : true, - "generate_number_parts" : true, - "catenate_all" : true, - "split_on_case_change" : true, - "split_on_numerics" : true, - "stem_english_possessive" : true - } +[source] +---- +{ + "index":{ + "analysis":{ + "filter" : { + "wd" : { + "type" : "worddelimiter2", + "generate_word_parts" : true, + "generate_number_parts" : true, + "catenate_all" : true, + "split_on_case_change" : true, + "split_on_numerics" : true, + "stem_english_possessive" : true } } } } +} +---- # Decompound @@ -637,6 +646,8 @@ by the ASV toolbox. 
## Decompound examples Try it out + +[source] ---- GET _analyze { @@ -650,52 +661,58 @@ GET _analyze } ---- -In the mapping, use a token filter of type "decompound":: +In the mapping, use a token filter of type "decompound": - { - "index":{ - "analysis":{ - "filter":{ - "decomp":{ - "type" : "decompound" - } - }, - "tokenizer" : { - "decomp" : { - "type" : "standard", - "filter" : [ "decomp" ] - } - } - } - } - } +[source] +---- +{ + "index":{ + "analysis":{ + "filter":{ + "decomp":{ + "type" : "decompound" + } + }, + "tokenizer" : { + "decomp" : { + "type" : "standard", + "filter" : [ "decomp" ] + } + } + } + } +} +---- -"Die Jahresfeier der Rechtsanwaltskanzleien auf dem Donaudampfschiff hat viel Ökosteuer gekostet" will be tokenized into +"Die Jahresfeier der Rechtsanwaltskanzleien auf dem Donaudampfschiff hat viel Ökosteuer gekostet" will be tokenized into "Die", "Die", "Jahresfeier", "Jahr", "feier", "der", "der", "Rechtsanwaltskanzleien", "Recht", "anwalt", "kanzlei", "auf", "auf", "dem", "dem", "Donaudampfschiff", "Donau", "dampf", "schiff", "hat", "hat", "viel", "viel", "Ökosteuer", "Ökosteuer", "gekostet", "gekosten" It is recommended to add the `Unique token filter `_ to skip tokens that occur more than once. 
-Also the Lucene german normalization token filter is provided:: +Also the Lucene german normalization token filter is provided: - { - "index":{ - "analysis":{ - "filter":{ - "umlaut":{ - "type":"german_normalize" - } - }, - "tokenizer" : { - "umlaut" : { - "type":"standard", - "filter" : "umlaut" - } - } - } - } +[source] +---- +{ + "index":{ + "analysis":{ + "filter":{ + "umlaut":{ + "type":"german_normalize" + } + }, + "tokenizer" : { + "umlaut" : { + "type":"standard", + "filter" : "umlaut" + } + } } + } +} +---- -The input "Ein schöner Tag in Köln im Café an der Straßenecke" will be tokenized into +The input "Ein schöner Tag in Köln im Café an der Straßenecke" will be tokenized into "Ein", "schoner", "Tag", "in", "Koln", "im", "Café", "an", "der", "Strassenecke". ## Threshold @@ -704,50 +721,55 @@ The decomposing algorithm knows about a threshold when to assume words as decomp If the threshold is too low, words could silently disappear from being indexed. In this case, you have to adapt the threshold so words do no longer disappear. -The default threshold value is 0.51. You can modify it in the settings:: +The default threshold value is 0.51. You can modify it in the settings: - { - "index" : { - "analysis" : { - "filter" : { - "decomp" : { - "type" : "decompound", - "threshold" : 0.51 - } - }, - "tokenizer" : { - "decomp" : { - "type" : "standard", - "filter" : [ "decomp" ] - } - } - } - } +[source] +---- +{ + "index" : { + "analysis" : { + "filter" : { + "decomp" : { + "type" : "decompound", + "threshold" : 0.51 + } + }, + "tokenizer" : { + "decomp" : { + "type" : "standard", + "filter" : [ "decomp" ] + } + } } - + } +} +---- + ## Subwords - + Sometimes only the decomposed subwords should be indexed. 
For this, you can use the parameter `"subwords_only": true` - { - "index" : { - "analysis" : { - "filter" : { - "decomp" : { - "type" : "decompound", - "subwords_only" : true - } - }, - "tokenizer" : { - "decomp" : { - "type" : "standard", - "filter" : [ "decomp" ] - } - } - } - } +[source] +---- +{ + "index" : { + "analysis" : { + "filter" : { + "decomp" : { + "type" : "decompound", + "subwords_only" : true + } + }, + "tokenizer" : { + "decomp" : { + "type" : "standard", + "filter" : [ "decomp" ] + } + } } - + } +} +---- ## Caching @@ -759,7 +781,8 @@ with the following settings: `cache_size` - sets cache size, default: 100000 `cache_eviction_factor` - sets cache eviction factor, valida values are between 0.00 and 1.00, default: 0.90 -``` +[source] +---- { "settings": { "index": { @@ -805,7 +828,7 @@ with the following settings: } } } -``` +---- ## Exact phrase matches @@ -820,7 +843,8 @@ containing your phrase queries. `use_payload` - if set to true, enable payload creation. Default: false - ``` +[source] +---- { "query": { "exact_phrase": { @@ -835,79 +859,84 @@ containing your phrase queries. } } } -``` +---- # Langdetect - curl -XDELETE 'localhost:9200/test' +[source] +---- +curl -XDELETE 'localhost:9200/test' - curl -XPUT 'localhost:9200/test' +curl -XPUT 'localhost:9200/test' - curl -XPOST 'localhost:9200/test/article/_mapping' -d ' - { - "article" : { - "properties" : { - "content" : { "type" : "langdetect" } - } - } +curl -XPOST 'localhost:9200/test/article/_mapping' -d ' +{ + "article" : { + "properties" : { + "content" : { "type" : "langdetect" } } - ' + } +} +' - curl -XPUT 'localhost:9200/test/article/1' -d ' - { - "title" : "Some title", - "content" : "Oh, say can you see by the dawn`s early light, What so proudly we hailed at the twilight`s last gleaming?" 
- } - ' +curl -XPUT 'localhost:9200/test/article/1' -d ' +{ + "title" : "Some title", + "content" : "Oh, say can you see by the dawn`s early light, What so proudly we hailed at the twilight`s last gleaming?" +} +' - curl -XPUT 'localhost:9200/test/article/2' -d ' - { - "title" : "Ein Titel", - "content" : "Einigkeit und Recht und Freiheit für das deutsche Vaterland!" - } - ' +curl -XPUT 'localhost:9200/test/article/2' -d ' +{ + "title" : "Ein Titel", + "content" : "Einigkeit und Recht und Freiheit für das deutsche Vaterland!" +} +' - curl -XPUT 'localhost:9200/test/article/3' -d ' - { - "title" : "Un titre", - "content" : "Allons enfants de la Patrie, Le jour de gloire est arrivé!" - } - ' +curl -XPUT 'localhost:9200/test/article/3' -d ' +{ + "title" : "Un titre", + "content" : "Allons enfants de la Patrie, Le jour de gloire est arrivé!" +} +' - curl -XGET 'localhost:9200/test/_refresh' +curl -XGET 'localhost:9200/test/_refresh' - curl -XPOST 'localhost:9200/test/_search' -d ' - { - "query" : { - "term" : { - "content" : "en" - } +curl -XPOST 'localhost:9200/test/_search' -d ' +{ + "query" : { + "term" : { + "content" : "en" } - } - ' - curl -XPOST 'localhost:9200/test/_search' -d ' - { - "query" : { - "term" : { - "content" : "de" - } + } +} +' +curl -XPOST 'localhost:9200/test/_search' -d ' +{ + "query" : { + "term" : { + "content" : "de" } - } - ' + } +} +' - curl -XPOST 'localhost:9200/test/_search' -d ' - { - "query" : { - "term" : { - "content" : "fr" - } +curl -XPOST 'localhost:9200/test/_search' -d ' +{ + "query" : { + "term" : { + "content" : "fr" } - } - ' + } +} +' +---- # Standardnumber Try it out + +[source] ---- GET _analyze { @@ -921,24 +950,26 @@ GET _analyze } ---- - { - "index" : { - "analysis" : { - "filter" : { - "standardnumber" : { - "type" : "standardnumber" - } - }, - "analyzer" : { - "standardnumber" : { - "tokenizer" : "whitespace", - "filter" : [ "standardnumber", "unique" ] - } +[source] +---- +{ + "index" : { + "analysis" : { + "filter" 
: { + "standardnumber" : { + "type" : "standardnumber" + } + }, + "analyzer" : { + "standardnumber" : { + "tokenizer" : "whitespace", + "filter" : [ "standardnumber", "unique" ] } } - } - } - + } + } +} +---- - WordDelimiterFilter2: taken from Lucene @@ -961,16 +992,19 @@ GET _analyze ## Crypt mapper - { - "someType" : { - "_source" : { - "enabled": false - }, - "properties" : { - "someField":{ "type" : "crypt", "algo": "SHA-512" } - } +[source] +---- +{ + "someType" : { + "_source" : { + "enabled": false + }, + "properties" : { + "someField":{ "type" : "crypt", "algo": "SHA-512" } } } +} +---- ## Issues @@ -994,8 +1028,6 @@ The base form reduction step (for Norwegian) is described in *Eiken, U.C., Liseth, A.T., Richter, M., Witschel, F. and Biemann, C.: Ord i Dag: Mining Norwegian Daily Newswire. Proceedings of FinTAL, Turku, 2006, Finland* - - # License elasticsearch-plugin-bundle - a compilation of useful plugins for Elasticsearch