注意:这是在弹性搜索论坛(https://discuss.elastic.co/t/store-size-1-000-times-the-document-byte-size/74258/4)上交叉发布的。Elasticsearch存储大小是文档字节大小的1000倍
我在store.size上增加了大约1000倍的文档字节大小。我用非常小的文档(小于1kb)绘制了一个非常简单的映射,并且我将映射与Elasticsearch的内部映射进行了比较,它们是相同的,所以它没有出现任何动态映射。到目前为止,我已经摄入了60,437个文档,并且存储大小为19.6Gb(平均每个文档为300kb),但JSON的平均字节大小(String.getBytes()。length)为300-400每个文档的字节数在另一次运行中,文档平均每个文档大约1MB到3MB。
我在M4.2xlarge EC2实例上使用Elasticsearch 5.2。 Elasticsearch主要安装了所有的默认设置,除了我需要做的是通过boostrap检查并绑定到非本地IP。我已经为Elasticsearch分配了16GB(一半的物理内存)。
我曾经运行Elasticsearch 2.x,并且正在摄取FAR更多的字段和大得多的文档,而不仅仅是这些少数几个字段,并且只经历了大约20k /文档,虽然仍然可管理,但它仍然很实用。
如果有人能指出任何可以解决这个问题的东西,我将不胜感激。或者是否有一个我没有看到的ES 5.x配置可以解决这个问题?
下面是我的映射。
{
"settings": {
"index.query.default_field": "tweetText"
},
"mappings": {
"tweet": {
"_all": {
"enabled": false
},
"properties": {
"tweetDate": {
"type": "date",
"format": "EEE MMM dd HH:mm:ss Z YYYY||strict_date_optional_time||epoch_millis"
},
"userId": {
"type": "text",
"index": "not_analyzed"
},
"screenName": {
"type": "text",
"index": "not_analyzed"
},
"tweetText": {
"type": "text"
},
"cleanedText": {
"type": "text"
},
"tweetId": {
"type": "text",
"index": "not_analyzed"
},
"location": {
"type": "geo_point",
"ignore_malformed": true
},
"placeName": {
"type": "keyword",
"doc_values": true,
"eager_global_ordinals": false
},
"placeCountry": {
"type": "keyword",
"doc_values": true,
"eager_global_ordinals": true
},
"placeCountryCode": {
"type": "keyword",
"doc_values": false,
"eager_global_ordinals": false,
"index": false
},
"placeBoundingBox": {
"type": "geo_shape",
"tree": "quadtree",
"precision": "1m"
},
"resolvedUrls": {
"type": "text",
"index": "not_analyzed"
},
"hashtags": {
"type": "text"
},
"mentions": {
"type": "text"
},
"geoInferences": {
"properties": {
"matchedName": {
"type": "text"
},
"asciiName": {
"type": "keyword",
"doc_values": true,
"eager_global_ordinals": false
},
"country": {
"type": "keyword",
"doc_values": true,
"eager_global_ordinals": true
},
"county": {
"type": "text"
},
"countryCode": {
"type": "keyword",
"doc_values": false,
"eager_global_ordinals": false,
"index": false
},
"city": {
"type": "text"
},
"admin1Code": {
"type": "keyword",
"doc_values": false,
"eager_global_ordinals": false,
"index": false
},
"admin2Code": {
"type": "keyword",
"doc_values": false,
"eager_global_ordinals": false,
"index": false
},
"admin3Code": {
"type": "keyword",
"doc_values": false,
"eager_global_ordinals": false,
"index": false
},
"admin4Code": {
"type": "keyword",
"doc_values": false,
"eager_global_ordinals": false,
"index": false
},
"confidence": {
"type": "float",
"doc_values": false,
"ignore_malformed": false,
"index": false
},
"coordinates": {
"type": "geo_point",
"ignore_malformed": true
}
}
},
"temporalInferences": {
"type": "date",
"ignore_malformed": true
}
}
}
}
}
样品文件:
{
"_index": "twitter",
"_type": "tweet",
"_id": "AVoZivLca9LOhnR10_ll",
"_score": null,
"_source": {
"tweetDate": 1486487211000,
"userId": "123456789",
"screenName": "removed",
"tweetText": "RT @wef: America’s dominance is over. By 2030, we'll have a handful of global powers https://www.weforum.org/agenda/2016/11/america-s-dominance-is-over/?utm_content=buffer73cd5&utm_medium=social&utm_source=twitter.com&utm_campaign=buffer #wef17 https://twitter.com/wef/status/828994745200435200/photo/1",
"cleanedText": "RT @wef: America s dominance is over. By 2030, we'll have a handful of global powers https://www.weforum.org/agenda/2016/11/america-s-dominance-is-over/?utm_content=buffer73cd5&utm_medium=social&utm_source=twitter.com&utm_campaign=buffer #wef17 https://twitter.com/wef/status/828994745200435200/photo/1",
"tweetId": "829013568288796672",
"resolvedUrls": [
"https://www.weforum.org/agenda/2016/11/america-s-dominance-is-over/?utm_content=buffer73cd5&utm_medium=social&utm_source=twitter.com&utm_campaign=buffer"
],
"hashtags": [
"wef17"
],
"mentions": [
"wef"
],
"geoInferences": [
{
"matchedName": "America",
"asciiName": "United States",
"country": "United States",
"countryCode": "US",
"coordinates": [
-98.5,
39.76
],
"admin1Code": "00",
"admin2Code": "",
"admin3Code": "",
"admin4Code": "",
"confidence": 1
}
],
"temporalInferences": [
1893474000000
]
},
"fields": {
"temporalInferences": [
1893474000000
],
"tweetDate": [
1486487211000
]
},
"sort": [
1486487211000
]
}
输出从
GET /_cat/indices/twitter?pri&v&h=health,index,pri,rep,docs.count,mt,pri,rep,docs.count,store.size,pri.store.size
health | index | pri | rep | docs.count | mt | pri.mt | store.size | pri.store.size | pri.store.size
yellow | twitter | 5 | 1 | 26860 | 74 | 74 | 10.1gb | 10.1gb | 10.1gb
从输出:
GET /twitter/_stats
{
"_shards": {
"total": 10,
"successful": 5,
"failed": 0
},
"_all": {
"primaries": {
"docs": {
"count": 26860,
"deleted": 0
},
"store": {
"size_in_bytes": 11027965678,
"throttle_time_in_millis": 0
},
"indexing": {
"index_total": 27397,
"index_time_in_millis": 3568991,
"index_current": 1,
"index_failed": 0,
"delete_total": 0,
"delete_time_in_millis": 0,
"delete_current": 0,
"noop_update_total": 0,
"is_throttled": false,
"throttle_time_in_millis": 195961
},
"get": {
"total": 0,
"time_in_millis": 0,
"exists_total": 0,
"exists_time_in_millis": 0,
"missing_total": 0,
"missing_time_in_millis": 0,
"current": 0
},
"search": {
"open_contexts": 0,
"query_total": 55,
"query_time_in_millis": 294,
"query_current": 0,
"fetch_total": 36,
"fetch_time_in_millis": 3209,
"fetch_current": 0,
"scroll_total": 0,
"scroll_time_in_millis": 0,
"scroll_current": 0,
"suggest_total": 0,
"suggest_time_in_millis": 0,
"suggest_current": 0
},
"merges": {
"current": 0,
"current_docs": 0,
"current_size_in_bytes": 0,
"total": 76,
"total_time_in_millis": 350987,
"total_docs": 45409,
"total_size_in_bytes": 4027595474,
"total_stopped_time_in_millis": 0,
"total_throttled_time_in_millis": 48633,
"total_auto_throttle_in_bytes": 82233108
},
"refresh": {
"total": 857,
"total_time_in_millis": 2994887,
"listeners": 0
},
"flush": {
"total": 15,
"total_time_in_millis": 291939
},
"warmer": {
"current": 0,
"total": 876,
"total_time_in_millis": 534
},
"query_cache": {
"memory_size_in_bytes": 0,
"total_count": 0,
"hit_count": 0,
"miss_count": 0,
"cache_size": 0,
"cache_count": 0,
"evictions": 0
},
"fielddata": {
"memory_size_in_bytes": 24808,
"evictions": 0
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 139,
"memory_in_bytes": 186032131,
"terms_memory_in_bytes": 185758725,
"stored_fields_memory_in_bytes": 43976,
"term_vectors_memory_in_bytes": 0,
"norms_memory_in_bytes": 77888,
"points_memory_in_bytes": 714,
"doc_values_memory_in_bytes": 150828,
"index_writer_memory_in_bytes": 1316180948,
"version_map_memory_in_bytes": 42250,
"fixed_bit_set_memory_in_bytes": 0,
"max_unsafe_auto_id_timestamp": -1,
"file_sizes": {
}
},
"translog": {
"operations": 11997,
"size_in_bytes": 5555179
},
"request_cache": {
"memory_size_in_bytes": 0,
"evictions": 0,
"hit_count": 195,
"miss_count": 195
},
"recovery": {
"current_as_source": 0,
"current_as_target": 0,
"throttle_time_in_millis": 0
}
},
"total": {
"docs": {
"count": 26860,
"deleted": 0
},
"store": {
"size_in_bytes": 11027965678,
"throttle_time_in_millis": 0
},
"indexing": {
"index_total": 27397,
"index_time_in_millis": 3568991,
"index_current": 1,
"index_failed": 0,
"delete_total": 0,
"delete_time_in_millis": 0,
"delete_current": 0,
"noop_update_total": 0,
"is_throttled": false,
"throttle_time_in_millis": 195961
},
"get": {
"total": 0,
"time_in_millis": 0,
"exists_total": 0,
"exists_time_in_millis": 0,
"missing_total": 0,
"missing_time_in_millis": 0,
"current": 0
},
"search": {
"open_contexts": 0,
"query_total": 55,
"query_time_in_millis": 294,
"query_current": 0,
"fetch_total": 36,
"fetch_time_in_millis": 3209,
"fetch_current": 0,
"scroll_total": 0,
"scroll_time_in_millis": 0,
"scroll_current": 0,
"suggest_total": 0,
"suggest_time_in_millis": 0,
"suggest_current": 0
},
"merges": {
"current": 0,
"current_docs": 0,
"current_size_in_bytes": 0,
"total": 76,
"total_time_in_millis": 350987,
"total_docs": 45409,
"total_size_in_bytes": 4027595474,
"total_stopped_time_in_millis": 0,
"total_throttled_time_in_millis": 48633,
"total_auto_throttle_in_bytes": 82233108
},
"refresh": {
"total": 857,
"total_time_in_millis": 2994887,
"listeners": 0
},
"flush": {
"total": 15,
"total_time_in_millis": 291939
},
"warmer": {
"current": 0,
"total": 876,
"total_time_in_millis": 534
},
"query_cache": {
"memory_size_in_bytes": 0,
"total_count": 0,
"hit_count": 0,
"miss_count": 0,
"cache_size": 0,
"cache_count": 0,
"evictions": 0
},
"fielddata": {
"memory_size_in_bytes": 24808,
"evictions": 0
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 139,
"memory_in_bytes": 186032131,
"terms_memory_in_bytes": 185758725,
"stored_fields_memory_in_bytes": 43976,
"term_vectors_memory_in_bytes": 0,
"norms_memory_in_bytes": 77888,
"points_memory_in_bytes": 714,
"doc_values_memory_in_bytes": 150828,
"index_writer_memory_in_bytes": 1316180948,
"version_map_memory_in_bytes": 42250,
"fixed_bit_set_memory_in_bytes": 0,
"max_unsafe_auto_id_timestamp": -1,
"file_sizes": {
}
},
"translog": {
"operations": 11997,
"size_in_bytes": 5555179
},
"request_cache": {
"memory_size_in_bytes": 0,
"evictions": 0,
"hit_count": 195,
"miss_count": 195
},
"recovery": {
"current_as_source": 0,
"current_as_target": 0,
"throttle_time_in_millis": 0
}
}
},
"indices": {
"twitter": {
"primaries": {
"docs": {
"count": 26860,
"deleted": 0
},
"store": {
"size_in_bytes": 11027965678,
"throttle_time_in_millis": 0
},
"indexing": {
"index_total": 27397,
"index_time_in_millis": 3568991,
"index_current": 1,
"index_failed": 0,
"delete_total": 0,
"delete_time_in_millis": 0,
"delete_current": 0,
"noop_update_total": 0,
"is_throttled": false,
"throttle_time_in_millis": 195961
},
"get": {
"total": 0,
"time_in_millis": 0,
"exists_total": 0,
"exists_time_in_millis": 0,
"missing_total": 0,
"missing_time_in_millis": 0,
"current": 0
},
"search": {
"open_contexts": 0,
"query_total": 55,
"query_time_in_millis": 294,
"query_current": 0,
"fetch_total": 36,
"fetch_time_in_millis": 3209,
"fetch_current": 0,
"scroll_total": 0,
"scroll_time_in_millis": 0,
"scroll_current": 0,
"suggest_total": 0,
"suggest_time_in_millis": 0,
"suggest_current": 0
},
"merges": {
"current": 0,
"current_docs": 0,
"current_size_in_bytes": 0,
"total": 76,
"total_time_in_millis": 350987,
"total_docs": 45409,
"total_size_in_bytes": 4027595474,
"total_stopped_time_in_millis": 0,
"total_throttled_time_in_millis": 48633,
"total_auto_throttle_in_bytes": 82233108
},
"refresh": {
"total": 857,
"total_time_in_millis": 2994887,
"listeners": 0
},
"flush": {
"total": 15,
"total_time_in_millis": 291939
},
"warmer": {
"current": 0,
"total": 876,
"total_time_in_millis": 534
},
"query_cache": {
"memory_size_in_bytes": 0,
"total_count": 0,
"hit_count": 0,
"miss_count": 0,
"cache_size": 0,
"cache_count": 0,
"evictions": 0
},
"fielddata": {
"memory_size_in_bytes": 24808,
"evictions": 0
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 139,
"memory_in_bytes": 186032131,
"terms_memory_in_bytes": 185758725,
"stored_fields_memory_in_bytes": 43976,
"term_vectors_memory_in_bytes": 0,
"norms_memory_in_bytes": 77888,
"points_memory_in_bytes": 714,
"doc_values_memory_in_bytes": 150828,
"index_writer_memory_in_bytes": 1316180948,
"version_map_memory_in_bytes": 42250,
"fixed_bit_set_memory_in_bytes": 0,
"max_unsafe_auto_id_timestamp": -1,
"file_sizes": {
}
},
"translog": {
"operations": 11997,
"size_in_bytes": 5555179
},
"request_cache": {
"memory_size_in_bytes": 0,
"evictions": 0,
"hit_count": 195,
"miss_count": 195
},
"recovery": {
"current_as_source": 0,
"current_as_target": 0,
"throttle_time_in_millis": 0
}
},
"total": {
"docs": {
"count": 26860,
"deleted": 0
},
"store": {
"size_in_bytes": 11027965678,
"throttle_time_in_millis": 0
},
"indexing": {
"index_total": 27397,
"index_time_in_millis": 3568991,
"index_current": 1,
"index_failed": 0,
"delete_total": 0,
"delete_time_in_millis": 0,
"delete_current": 0,
"noop_update_total": 0,
"is_throttled": false,
"throttle_time_in_millis": 195961
},
"get": {
"total": 0,
"time_in_millis": 0,
"exists_total": 0,
"exists_time_in_millis": 0,
"missing_total": 0,
"missing_time_in_millis": 0,
"current": 0
},
"search": {
"open_contexts": 0,
"query_total": 55,
"query_time_in_millis": 294,
"query_current": 0,
"fetch_total": 36,
"fetch_time_in_millis": 3209,
"fetch_current": 0,
"scroll_total": 0,
"scroll_time_in_millis": 0,
"scroll_current": 0,
"suggest_total": 0,
"suggest_time_in_millis": 0,
"suggest_current": 0
},
"merges": {
"current": 0,
"current_docs": 0,
"current_size_in_bytes": 0,
"total": 76,
"total_time_in_millis": 350987,
"total_docs": 45409,
"total_size_in_bytes": 4027595474,
"total_stopped_time_in_millis": 0,
"total_throttled_time_in_millis": 48633,
"total_auto_throttle_in_bytes": 82233108
},
"refresh": {
"total": 857,
"total_time_in_millis": 2994887,
"listeners": 0
},
"flush": {
"total": 15,
"total_time_in_millis": 291939
},
"warmer": {
"current": 0,
"total": 876,
"total_time_in_millis": 534
},
"query_cache": {
"memory_size_in_bytes": 0,
"total_count": 0,
"hit_count": 0,
"miss_count": 0,
"cache_size": 0,
"cache_count": 0,
"evictions": 0
},
"fielddata": {
"memory_size_in_bytes": 24808,
"evictions": 0
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 139,
"memory_in_bytes": 186032131,
"terms_memory_in_bytes": 185758725,
"stored_fields_memory_in_bytes": 43976,
"term_vectors_memory_in_bytes": 0,
"norms_memory_in_bytes": 77888,
"points_memory_in_bytes": 714,
"doc_values_memory_in_bytes": 150828,
"index_writer_memory_in_bytes": 1316180948,
"version_map_memory_in_bytes": 42250,
"fixed_bit_set_memory_in_bytes": 0,
"max_unsafe_auto_id_timestamp": -1,
"file_sizes": {
}
},
"translog": {
"operations": 11997,
"size_in_bytes": 5555179
},
"request_cache": {
"memory_size_in_bytes": 0,
"evictions": 0,
"hit_count": 195,
"miss_count": 195
},
"recovery": {
"current_as_source": 0,
"current_as_target": 0,
"throttle_time_in_millis": 0
}
}
}
}
}
EDIT 1 我发现了这个问题的根源。看起来这是错误的边框,尽管我不知道为什么。
一旦我从摄取的数据中删除边界框,索引是正常大小(600个文档 - > 550kb),但只要我重新添加边界框(带有全新索引),尺寸为天空(3,593个文档 - > 1.6GB),只有84个包含边框的文档。
下面是边框的JSON:
"placeBoundingBox": {
"type": "polygon",
"coordinates": [
[
[
-71.191421,
42.227797
],
[
-71.191421,
42.399542
],
[
-70.986004,
42.399542
],
[
-70.986004,
42.227797
],
[
-71.191421,
42.227797
]
]
]
}
与边框相关的(从调用get/INDEX_NAME)的映射:
"placeBoundingBox": {
"type": "geo_shape",
"tree": "quadtree",
"precision": "1.0m"
}
证明映射确实INFACT工作并且正在创建一个合适的geo_shape(尽管Kibana不会将其识别为geo_shape),但我运行了以下查询并获得了成功命中:
GET /_search
{
"query": {
"bool": {
"must": {
"match_all": {
}
},
"filter": {
"geo_shape": {
"placeBoundingBox": {
"shape": {
"type": "polygon",
"coordinates": [
[
[
-71.191421,
42.227797
],
[
-71.191421,
42.399542
],
[
-70.986004,
42.399542
],
[
-70.986004,
42.227797
],
[
-71.191421,
42.227797
]
]
]
},
"relation": "within"
}
}
}
}
}
}
我想将边界框保存在内,映射或数据是否有问题? 1.0米太细了吗?
一些问题,以前的用例的碎片/副本呢?因为现在你有5个碎片和5个副本(如果我没有弄错),可能是这个原因,你看到的大小是一个大数字 – Mysterion
我添加副本字段到映射,并将其设置为1,没有变化。 – Brooks
你能把它设置为0吗?它会改变大小吗? – Mysterion