所以我根据这个“教程”开始处理 Elasticsearch 中的特殊字符:https://www.elastic.co/guide/en/elasticsearch/guide/current/case-folding.html (Elasticsearch:使用 ICU 插件处理特殊字符)
我为我的 Elasticsearch 1.7.x 安装了 elasticsearch-analysis-icu 插件的 2.7.0 版本。
使用 icu_tokenizer 创建索引 “sonderzeichen” 运行良好(我使用的是 Node.js):
var http = require('http');

// Index settings for "sonderzeichen": a custom analyzer built on the
// ICU tokenizer plus the ICU normalizer filter (both provided by the
// elasticsearch-analysis-icu plugin).
var body = JSON.stringify({
  "settings": {
    "analysis": {
      "analyzer": {
        "my_lowercaser": {
          "tokenizer": "icu_tokenizer",
          "filter": [ "icu_normalizer" ]
        }
      }
    }
  }
});

var options = {
  host: 'localhost',
  path: '/sonderzeichen',
  port: 9200,
  method: "PUT",
  headers: {
    'Content-Type': 'application/json',
    // Content-Length must be the BYTE length of the payload. body.length
    // counts UTF-16 code units, which only coincides with the byte length
    // for pure-ASCII payloads — use Buffer.byteLength to be safe.
    'Content-Length': Buffer.byteLength(body)
  }
};

// Declared with `var` — the original assigned `callback` without any
// declaration, creating an implicit global.
var callback = function(response) {
  var str = '';
  response.on('data', function(chunk){
    str += chunk;
  });
  response.on('end', function(){
    console.log(str);
  });
};

var req = http.request(options, callback);
// Without an 'error' listener a refused connection throws an uncaught
// exception and kills the script; report it instead.
req.on('error', function(err){
  console.error('Request failed:', err.message);
});
req.end(body);
如教程中所述,我使用了以下两个分析器端点:
/_analyze?analyzer=my_lowercaser
和
/sonderzeichen/_analyze?analyzer=my_lowercaser
在 Node.js 中它看起来像这样:
var http = require('http');

// Text to analyze. It contains German umlauts and ß, i.e. characters that
// encode to MORE THAN ONE byte in UTF-8.
var body = decodeURIComponent("Weißkopfseeadler WEISSKOPFSEEADLER äÄöÖüÜßáÁéÉíÍóÓúÚàÀèÈìÌòÒùÙ");

var options = {
  host: 'localhost',
  path: '/_analyze?analyzer=standard',
  port: 9200,
  method: "GET",
  headers: {
    'Content-Type': 'application/json',
    // BUG FIX: the original used body.length, which counts UTF-16 code
    // units, but Content-Length must be the number of BYTES. For this
    // string the byte length is larger, so the request body was silently
    // truncated and part of the text never reached Elasticsearch.
    'Content-Length': Buffer.byteLength(body)
  }
};

// Declared with `var` — the original assigned `callback` without any
// declaration, creating an implicit global.
var callback = function(response) {
  var str = '';
  response.on('data', function(chunk){
    str += chunk;
  });
  response.on('end', function(){
    console.log(str);
  });
};

var req = http.request(options, callback);
// Surface connection errors instead of crashing on an unhandled 'error' event.
req.on('error', function(err){
  console.error('Request failed:', err.message);
});
req.end(body);
都返回完全相同的词元(无论我是否使用 decodeURIComponent),如下:
{
"tokens": [
{
"token": "wei",
"start_offset": 0,
"end_offset": 3,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "kopfseeadler",
"start_offset": 4,
"end_offset": 16,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "weisskopfseeadler",
"start_offset": 17,
"end_offset": 34,
"type": "<ALPHANUM>",
"position": 3
}
]
}
Elasticsearch 似乎仍然无法处理任何特殊字符,那么我哪里做错了?
如果你只需要处理德语,我敢确信 ['asciifolding'](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-asciifolding-tokenfilter.html) 词元过滤器绰绰有余。在使用 ICU 插件之前,您应该先试一下它。 – Val
我试过了:删除索引并重新创建它,将 '"analysis":{"analyzer":{"my_lowercaser":{"tokenizer":"icu_tokenizer","filter":["icu_normalizer"]}}}' 替换为 '"analysis":{"analyzer":{"default":{"tokenizer":"standard","filter":["standard","asciifolding"]}}}',并用 '/sonderzeichen/_analyze?analyzer=default' 进行分析。仍然得到相同的结果 – JTR