
Elasticsearch Analysis

Index-time analysis breaks a block of text into individual terms to prepare for the inverted index, then normalizes those terms into a standard form to make them more searchable. This work is done by an analyzer, which chains three functions together:

  • Character filters: the string is first passed through character filters, whose job is to tidy it up before tokenization; for example, a character filter can strip HTML markup.
  • Tokenizer: the string is then broken into individual terms by a tokenizer; a simple tokenizer might split the text on whitespace or commas.
  • Token filters: finally, each term passes through all the token filters, which can modify terms, remove terms, or add terms.

Elasticsearch ships with many built-in analyzers, and you can also define custom ones.
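
In practice, an analyzer is usually attached to a text field in the mapping, so documents indexed into that field are analyzed automatically. A minimal sketch (the index name my_index and field name title are made up here for illustration; syntax assumes Elasticsearch 7+, without mapping types):

PUT my_index
{
  "mappings": {
    "properties": {
      "title": {
        "type": "text",
        "analyzer": "standard" // applied at index time and, by default, at search time too
      }
    }
  }
}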

Built-in analyzers

GET _analyze
{
  "analyzer": "standard", // the analyzer to use
  "text": "test analyze"  // the text to analyze
}


// analysis result
{
  "tokens": [
    {
      "token": "test",
      "start_offset": 0,
      "end_offset": 4,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "analyze",
      "start_offset": 5,
      "end_offset": 12,
      "type": "<ALPHANUM>",
      "position": 1
    }
  ]
}
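
For comparison, the same request can be run against any other built-in analyzer. A quick sketch with the whitespace analyzer (the sample text is made up): it splits only on whitespace and applies no lowercasing, so "Test-Analyze" should come back as a single, case-preserved token.

GET _analyze
{
  "analyzer": "whitespace",
  "text": "Test-Analyze 123"
}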

Custom analyzers

POST _analyze
{
  "tokenizer": "standard", // tokenizer
  "filter": [              // token filters
    "lowercase"            // lowercase the terms
  ],
  "char_filter": [         // character filters
    "html_strip"           // strip HTML markup
  ],
  "text": "this is my <b>TITLE</b>"
}


// analysis result
{
  "tokens": [
    {
      "token": "this",
      "start_offset": 0,
      "end_offset": 4,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "is",
      "start_offset": 5,
      "end_offset": 7,
      "type": "<ALPHANUM>",
      "position": 1
    },
    {
      "token": "my",
      "start_offset": 8,
      "end_offset": 10,
      "type": "<ALPHANUM>",
      "position": 2
    },
    {
      "token": "title",
      "start_offset": 14,
      "end_offset": 23,
      "type": "<ALPHANUM>",
      "position": 3
    }
  ]
}
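
The ad-hoc combination above can also be registered under a name in the index settings, so mappings and queries can reference it. A minimal sketch (the index name my_index and analyzer name my_html_analyzer are made up here):

PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_html_analyzer": {             // custom analyzer: char filters -> tokenizer -> token filters
          "type": "custom",
          "char_filter": ["html_strip"],
          "tokenizer": "standard",
          "filter": ["lowercase"]
        }
      }
    }
  }
}

// test it by name against that index
GET my_index/_analyze
{
  "analyzer": "my_html_analyzer",
  "text": "this is my <b>TITLE</b>"
}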

You can set explain to output the details of each step of the analysis.

POST _analyze
{
  "tokenizer": "standard",
  "filter": [
    "lowercase"
  ],
  "char_filter": [
    "html_strip"
  ],
  "text": "this is my <b>TITLE</b>",
  "explain": true
}



{
  "detail": {
    "custom_analyzer": true,
    "charfilters": [
      {
        "name": "html_strip",
        "filtered_text": [
          "this is my TITLE"
        ]
      }
    ],
    "tokenizer": {
      "name": "standard",
      "tokens": [
        {
          "token": "this",
          "start_offset": 0,
          "end_offset": 4,
          "type": "<ALPHANUM>",
          "position": 0,
          "bytes": "[74 68 69 73]",
          "positionLength": 1,
          "termFrequency": 1
        },
        {
          "token": "is",
          "start_offset": 5,
          "end_offset": 7,
          "type": "<ALPHANUM>",
          "position": 1,
          "bytes": "[69 73]",
          "positionLength": 1,
          "termFrequency": 1
        },
        {
          "token": "my",
          "start_offset": 8,
          "end_offset": 10,
          "type": "<ALPHANUM>",
          "position": 2,
          "bytes": "[6d 79]",
          "positionLength": 1,
          "termFrequency": 1
        },
        {
          "token": "TITLE",
          "start_offset": 14,
          "end_offset": 23,
          "type": "<ALPHANUM>",
          "position": 3,
          "bytes": "[54 49 54 4c 45]",
          "positionLength": 1,
          "termFrequency": 1
        }
      ]
    },
    "tokenfilters": [
      {
        "name": "lowercase",
        "tokens": [
          {
            "token": "this",
            "start_offset": 0,
            "end_offset": 4,
            "type": "<ALPHANUM>",
            "position": 0,
            "bytes": "[74 68 69 73]",
            "positionLength": 1,
            "termFrequency": 1
          },
          {
            "token": "is",
            "start_offset": 5,
            "end_offset": 7,
            "type": "<ALPHANUM>",
            "position": 1,
            "bytes": "[69 73]",
            "positionLength": 1,
            "termFrequency": 1
          },
          {
            "token": "my",
            "start_offset": 8,
            "end_offset": 10,
            "type": "<ALPHANUM>",
            "position": 2,
            "bytes": "[6d 79]",
            "positionLength": 1,
            "termFrequency": 1
          },
          {
            "token": "title",
            "start_offset": 14,
            "end_offset": 23,
            "type": "<ALPHANUM>",
            "position": 3,
            "bytes": "[74 69 74 6c 65]",
            "positionLength": 1,
            "termFrequency": 1
          }
        ]
      }
    ]
  }
}
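
The explain output can get verbose. As I understand the _analyze API, an attributes parameter can be passed alongside explain to restrict which token attributes are reported; a sketch, assuming the filter behaves as documented:

POST _analyze
{
  "tokenizer": "standard",
  "filter": [
    "lowercase"
  ],
  "char_filter": [
    "html_strip"
  ],
  "text": "this is my <b>TITLE</b>",
  "explain": true,
  "attributes": ["keyword"] // only report the "keyword" token attribute
}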