2017-02-08 69 views
0

注意:此问题也已交叉发布在 Elastic 官方论坛(https://discuss.elastic.co/t/store-size-1-000-times-the-document-byte-size/74258/4)上。

Elasticsearch 存储大小是文档字节大小的 1000 倍

我发现 store.size 大约是文档字节大小的 1000 倍。我的映射非常简单,文档也非常小(小于 1kb);我把自己的映射与 Elasticsearch 内部实际使用的映射做了比较,两者完全相同,因此没有发生任何动态映射。到目前为止,我已经摄入了 60,437 个文档,存储大小为 19.6Gb(平均每个文档约 300kb),而 JSON 的平均字节大小(String.getBytes().length)只有每个文档 300-400 字节。在另一次运行中,平均每个文档占用的存储大约为 1MB 到 3MB。

我在 M4.2xlarge EC2 实例上使用 Elasticsearch 5.2。Elasticsearch 基本上使用默认设置安装,只修改了通过 bootstrap 检查并绑定到非本地 IP 所必需的配置。我已经为 Elasticsearch 分配了 16GB(物理内存的一半)。

我以前运行 Elasticsearch 2.x 时,摄取的字段比现在这几个字段多得多,文档也大得多,而每个文档只占用大约 20k 的存储;这虽然仍然偏大,但还在可管理的范围内。

如果有人能指出任何可以解决这个问题的东西,我将不胜感激。或者是否有一个我没有看到的ES 5.x配置可以解决这个问题?

下面是我的映射。

{ 
    "settings": { 
     "index.query.default_field": "tweetText" 
    }, 
    "mappings": { 
     "tweet": { 
      "_all": { 
       "enabled": false 
      }, 
      "properties": { 
       "tweetDate": { 
        "type": "date", 
        "format": "EEE MMM dd HH:mm:ss Z YYYY||strict_date_optional_time||epoch_millis" 
       }, 
       "userId": { 
        "type": "text", 
        "index": "not_analyzed" 
       }, 
       "screenName": { 
        "type": "text", 
        "index": "not_analyzed" 
       }, 
       "tweetText": { 
        "type": "text" 
       }, 
       "cleanedText": { 
        "type": "text" 
       }, 
       "tweetId": { 
        "type": "text", 
        "index": "not_analyzed" 
       }, 
       "location": { 
        "type": "geo_point", 
        "ignore_malformed": true 
       }, 
       "placeName": { 
        "type": "keyword", 
        "doc_values": true, 
        "eager_global_ordinals": false 
       }, 
       "placeCountry": { 
        "type": "keyword", 
        "doc_values": true, 
        "eager_global_ordinals": true 
       }, 
       "placeCountryCode": { 
        "type": "keyword", 
        "doc_values": false, 
        "eager_global_ordinals": false, 
        "index": false 
       }, 
       "placeBoundingBox": { 
        "type": "geo_shape", 
        "tree": "quadtree", 
        "precision": "1m" 
       }, 
       "resolvedUrls": { 
        "type": "text", 
        "index": "not_analyzed" 
       }, 
       "hashtags": { 
        "type": "text" 
       }, 
       "mentions": { 
        "type": "text" 
       }, 
       "geoInferences": { 
        "properties": { 
         "matchedName": { 
          "type": "text" 
         }, 
         "asciiName": { 
          "type": "keyword", 
          "doc_values": true, 
          "eager_global_ordinals": false 
         }, 
         "country": { 
          "type": "keyword", 
          "doc_values": true, 
          "eager_global_ordinals": true 
         }, 
         "county": { 
          "type": "text" 
         }, 
         "countryCode": { 
          "type": "keyword", 
          "doc_values": false, 
          "eager_global_ordinals": false, 
          "index": false 
         }, 
         "city": { 
          "type": "text" 
         }, 
         "admin1Code": { 
          "type": "keyword", 
          "doc_values": false, 
          "eager_global_ordinals": false, 
          "index": false 
         }, 
         "admin2Code": { 
          "type": "keyword", 
          "doc_values": false, 
          "eager_global_ordinals": false, 
          "index": false 
         }, 
         "admin3Code": { 
          "type": "keyword", 
          "doc_values": false, 
          "eager_global_ordinals": false, 
          "index": false 
         }, 
         "admin4Code": { 
          "type": "keyword", 
          "doc_values": false, 
          "eager_global_ordinals": false, 
          "index": false 
         }, 
         "confidence": { 
          "type": "float", 
          "doc_values": false, 
          "ignore_malformed": false, 
          "index": false 
         }, 
         "coordinates": { 
          "type": "geo_point", 
          "ignore_malformed": true 
         } 
        } 
       }, 
       "temporalInferences": { 
        "type": "date", 
        "ignore_malformed": true 
       } 
      } 
     } 
    } 
} 

示例文档:

{ 
    "_index": "twitter", 
    "_type": "tweet", 
    "_id": "AVoZivLca9LOhnR10_ll", 
    "_score": null, 
    "_source": { 
    "tweetDate": 1486487211000, 
    "userId": "123456789", 
    "screenName": "removed", 
    "tweetText": "RT @wef: America’s dominance is over. By 2030, we'll have a handful of global powers https://www.weforum.org/agenda/2016/11/america-s-dominance-is-over/?utm_content=buffer73cd5&utm_medium=social&utm_source=twitter.com&utm_campaign=buffer #wef17 https://twitter.com/wef/status/828994745200435200/photo/1", 
    "cleanedText": "RT @wef: America s dominance is over. By 2030, we'll have a handful of global powers https://www.weforum.org/agenda/2016/11/america-s-dominance-is-over/?utm_content=buffer73cd5&utm_medium=social&utm_source=twitter.com&utm_campaign=buffer #wef17 https://twitter.com/wef/status/828994745200435200/photo/1", 
    "tweetId": "829013568288796672", 
    "resolvedUrls": [ 
     "https://www.weforum.org/agenda/2016/11/america-s-dominance-is-over/?utm_content=buffer73cd5&utm_medium=social&utm_source=twitter.com&utm_campaign=buffer" 
    ], 
    "hashtags": [ 
     "wef17" 
    ], 
    "mentions": [ 
     "wef" 
    ], 
    "geoInferences": [ 
     { 
     "matchedName": "America", 
     "asciiName": "United States", 
     "country": "United States", 
     "countryCode": "US", 
     "coordinates": [ 
      -98.5, 
      39.76 
     ], 
     "admin1Code": "00", 
     "admin2Code": "", 
     "admin3Code": "", 
     "admin4Code": "", 
     "confidence": 1 
     } 
    ], 
    "temporalInferences": [ 
     1893474000000 
    ] 
    }, 
    "fields": { 
    "temporalInferences": [ 
     1893474000000 
    ], 
    "tweetDate": [ 
     1486487211000 
    ] 
    }, 
    "sort": [ 
    1486487211000 
    ] 
} 

以下请求的输出:

GET /_cat/indices/twitter?pri&v&h=health,index,pri,rep,docs.count,mt,pri,rep,docs.count,store.size,pri.store.size 

health | index | pri | rep | docs.count | mt | pri.mt | store.size | pri.store.size | pri.store.size 
yellow | twitter | 5 | 1 | 26860 | 74 | 74 | 10.1gb | 10.1gb | 10.1gb 

以下请求的输出:

GET /twitter/_stats 

{ 
    "_shards": { 
    "total": 10, 
    "successful": 5, 
    "failed": 0 
    }, 
    "_all": { 
    "primaries": { 
     "docs": { 
     "count": 26860, 
     "deleted": 0 
     }, 
     "store": { 
     "size_in_bytes": 11027965678, 
     "throttle_time_in_millis": 0 
     }, 
     "indexing": { 
     "index_total": 27397, 
     "index_time_in_millis": 3568991, 
     "index_current": 1, 
     "index_failed": 0, 
     "delete_total": 0, 
     "delete_time_in_millis": 0, 
     "delete_current": 0, 
     "noop_update_total": 0, 
     "is_throttled": false, 
     "throttle_time_in_millis": 195961 
     }, 
     "get": { 
     "total": 0, 
     "time_in_millis": 0, 
     "exists_total": 0, 
     "exists_time_in_millis": 0, 
     "missing_total": 0, 
     "missing_time_in_millis": 0, 
     "current": 0 
     }, 
     "search": { 
     "open_contexts": 0, 
     "query_total": 55, 
     "query_time_in_millis": 294, 
     "query_current": 0, 
     "fetch_total": 36, 
     "fetch_time_in_millis": 3209, 
     "fetch_current": 0, 
     "scroll_total": 0, 
     "scroll_time_in_millis": 0, 
     "scroll_current": 0, 
     "suggest_total": 0, 
     "suggest_time_in_millis": 0, 
     "suggest_current": 0 
     }, 
     "merges": { 
     "current": 0, 
     "current_docs": 0, 
     "current_size_in_bytes": 0, 
     "total": 76, 
     "total_time_in_millis": 350987, 
     "total_docs": 45409, 
     "total_size_in_bytes": 4027595474, 
     "total_stopped_time_in_millis": 0, 
     "total_throttled_time_in_millis": 48633, 
     "total_auto_throttle_in_bytes": 82233108 
     }, 
     "refresh": { 
     "total": 857, 
     "total_time_in_millis": 2994887, 
     "listeners": 0 
     }, 
     "flush": { 
     "total": 15, 
     "total_time_in_millis": 291939 
     }, 
     "warmer": { 
     "current": 0, 
     "total": 876, 
     "total_time_in_millis": 534 
     }, 
     "query_cache": { 
     "memory_size_in_bytes": 0, 
     "total_count": 0, 
     "hit_count": 0, 
     "miss_count": 0, 
     "cache_size": 0, 
     "cache_count": 0, 
     "evictions": 0 
     }, 
     "fielddata": { 
     "memory_size_in_bytes": 24808, 
     "evictions": 0 
     }, 
     "completion": { 
     "size_in_bytes": 0 
     }, 
     "segments": { 
     "count": 139, 
     "memory_in_bytes": 186032131, 
     "terms_memory_in_bytes": 185758725, 
     "stored_fields_memory_in_bytes": 43976, 
     "term_vectors_memory_in_bytes": 0, 
     "norms_memory_in_bytes": 77888, 
     "points_memory_in_bytes": 714, 
     "doc_values_memory_in_bytes": 150828, 
     "index_writer_memory_in_bytes": 1316180948, 
     "version_map_memory_in_bytes": 42250, 
     "fixed_bit_set_memory_in_bytes": 0, 
     "max_unsafe_auto_id_timestamp": -1, 
     "file_sizes": { 

     } 
     }, 
     "translog": { 
     "operations": 11997, 
     "size_in_bytes": 5555179 
     }, 
     "request_cache": { 
     "memory_size_in_bytes": 0, 
     "evictions": 0, 
     "hit_count": 195, 
     "miss_count": 195 
     }, 
     "recovery": { 
     "current_as_source": 0, 
     "current_as_target": 0, 
     "throttle_time_in_millis": 0 
     } 
    }, 
    "total": { 
     "docs": { 
     "count": 26860, 
     "deleted": 0 
     }, 
     "store": { 
     "size_in_bytes": 11027965678, 
     "throttle_time_in_millis": 0 
     }, 
     "indexing": { 
     "index_total": 27397, 
     "index_time_in_millis": 3568991, 
     "index_current": 1, 
     "index_failed": 0, 
     "delete_total": 0, 
     "delete_time_in_millis": 0, 
     "delete_current": 0, 
     "noop_update_total": 0, 
     "is_throttled": false, 
     "throttle_time_in_millis": 195961 
     }, 
     "get": { 
     "total": 0, 
     "time_in_millis": 0, 
     "exists_total": 0, 
     "exists_time_in_millis": 0, 
     "missing_total": 0, 
     "missing_time_in_millis": 0, 
     "current": 0 
     }, 
     "search": { 
     "open_contexts": 0, 
     "query_total": 55, 
     "query_time_in_millis": 294, 
     "query_current": 0, 
     "fetch_total": 36, 
     "fetch_time_in_millis": 3209, 
     "fetch_current": 0, 
     "scroll_total": 0, 
     "scroll_time_in_millis": 0, 
     "scroll_current": 0, 
     "suggest_total": 0, 
     "suggest_time_in_millis": 0, 
     "suggest_current": 0 
     }, 
     "merges": { 
     "current": 0, 
     "current_docs": 0, 
     "current_size_in_bytes": 0, 
     "total": 76, 
     "total_time_in_millis": 350987, 
     "total_docs": 45409, 
     "total_size_in_bytes": 4027595474, 
     "total_stopped_time_in_millis": 0, 
     "total_throttled_time_in_millis": 48633, 
     "total_auto_throttle_in_bytes": 82233108 
     }, 
     "refresh": { 
     "total": 857, 
     "total_time_in_millis": 2994887, 
     "listeners": 0 
     }, 
     "flush": { 
     "total": 15, 
     "total_time_in_millis": 291939 
     }, 
     "warmer": { 
     "current": 0, 
     "total": 876, 
     "total_time_in_millis": 534 
     }, 
     "query_cache": { 
     "memory_size_in_bytes": 0, 
     "total_count": 0, 
     "hit_count": 0, 
     "miss_count": 0, 
     "cache_size": 0, 
     "cache_count": 0, 
     "evictions": 0 
     }, 
     "fielddata": { 
     "memory_size_in_bytes": 24808, 
     "evictions": 0 
     }, 
     "completion": { 
     "size_in_bytes": 0 
     }, 
     "segments": { 
     "count": 139, 
     "memory_in_bytes": 186032131, 
     "terms_memory_in_bytes": 185758725, 
     "stored_fields_memory_in_bytes": 43976, 
     "term_vectors_memory_in_bytes": 0, 
     "norms_memory_in_bytes": 77888, 
     "points_memory_in_bytes": 714, 
     "doc_values_memory_in_bytes": 150828, 
     "index_writer_memory_in_bytes": 1316180948, 
     "version_map_memory_in_bytes": 42250, 
     "fixed_bit_set_memory_in_bytes": 0, 
     "max_unsafe_auto_id_timestamp": -1, 
     "file_sizes": { 

     } 
     }, 
     "translog": { 
     "operations": 11997, 
     "size_in_bytes": 5555179 
     }, 
     "request_cache": { 
     "memory_size_in_bytes": 0, 
     "evictions": 0, 
     "hit_count": 195, 
     "miss_count": 195 
     }, 
     "recovery": { 
     "current_as_source": 0, 
     "current_as_target": 0, 
     "throttle_time_in_millis": 0 
     } 
    } 
    }, 
    "indices": { 
    "twitter": { 
     "primaries": { 
     "docs": { 
      "count": 26860, 
      "deleted": 0 
     }, 
     "store": { 
      "size_in_bytes": 11027965678, 
      "throttle_time_in_millis": 0 
     }, 
     "indexing": { 
      "index_total": 27397, 
      "index_time_in_millis": 3568991, 
      "index_current": 1, 
      "index_failed": 0, 
      "delete_total": 0, 
      "delete_time_in_millis": 0, 
      "delete_current": 0, 
      "noop_update_total": 0, 
      "is_throttled": false, 
      "throttle_time_in_millis": 195961 
     }, 
     "get": { 
      "total": 0, 
      "time_in_millis": 0, 
      "exists_total": 0, 
      "exists_time_in_millis": 0, 
      "missing_total": 0, 
      "missing_time_in_millis": 0, 
      "current": 0 
     }, 
     "search": { 
      "open_contexts": 0, 
      "query_total": 55, 
      "query_time_in_millis": 294, 
      "query_current": 0, 
      "fetch_total": 36, 
      "fetch_time_in_millis": 3209, 
      "fetch_current": 0, 
      "scroll_total": 0, 
      "scroll_time_in_millis": 0, 
      "scroll_current": 0, 
      "suggest_total": 0, 
      "suggest_time_in_millis": 0, 
      "suggest_current": 0 
     }, 
     "merges": { 
      "current": 0, 
      "current_docs": 0, 
      "current_size_in_bytes": 0, 
      "total": 76, 
      "total_time_in_millis": 350987, 
      "total_docs": 45409, 
      "total_size_in_bytes": 4027595474, 
      "total_stopped_time_in_millis": 0, 
      "total_throttled_time_in_millis": 48633, 
      "total_auto_throttle_in_bytes": 82233108 
     }, 
     "refresh": { 
      "total": 857, 
      "total_time_in_millis": 2994887, 
      "listeners": 0 
     }, 
     "flush": { 
      "total": 15, 
      "total_time_in_millis": 291939 
     }, 
     "warmer": { 
      "current": 0, 
      "total": 876, 
      "total_time_in_millis": 534 
     }, 
     "query_cache": { 
      "memory_size_in_bytes": 0, 
      "total_count": 0, 
      "hit_count": 0, 
      "miss_count": 0, 
      "cache_size": 0, 
      "cache_count": 0, 
      "evictions": 0 
     }, 
     "fielddata": { 
      "memory_size_in_bytes": 24808, 
      "evictions": 0 
     }, 
     "completion": { 
      "size_in_bytes": 0 
     }, 
     "segments": { 
      "count": 139, 
      "memory_in_bytes": 186032131, 
      "terms_memory_in_bytes": 185758725, 
      "stored_fields_memory_in_bytes": 43976, 
      "term_vectors_memory_in_bytes": 0, 
      "norms_memory_in_bytes": 77888, 
      "points_memory_in_bytes": 714, 
      "doc_values_memory_in_bytes": 150828, 
      "index_writer_memory_in_bytes": 1316180948, 
      "version_map_memory_in_bytes": 42250, 
      "fixed_bit_set_memory_in_bytes": 0, 
      "max_unsafe_auto_id_timestamp": -1, 
      "file_sizes": { 

      } 
     }, 
     "translog": { 
      "operations": 11997, 
      "size_in_bytes": 5555179 
     }, 
     "request_cache": { 
      "memory_size_in_bytes": 0, 
      "evictions": 0, 
      "hit_count": 195, 
      "miss_count": 195 
     }, 
     "recovery": { 
      "current_as_source": 0, 
      "current_as_target": 0, 
      "throttle_time_in_millis": 0 
     } 
     }, 
     "total": { 
     "docs": { 
      "count": 26860, 
      "deleted": 0 
     }, 
     "store": { 
      "size_in_bytes": 11027965678, 
      "throttle_time_in_millis": 0 
     }, 
     "indexing": { 
      "index_total": 27397, 
      "index_time_in_millis": 3568991, 
      "index_current": 1, 
      "index_failed": 0, 
      "delete_total": 0, 
      "delete_time_in_millis": 0, 
      "delete_current": 0, 
      "noop_update_total": 0, 
      "is_throttled": false, 
      "throttle_time_in_millis": 195961 
     }, 
     "get": { 
      "total": 0, 
      "time_in_millis": 0, 
      "exists_total": 0, 
      "exists_time_in_millis": 0, 
      "missing_total": 0, 
      "missing_time_in_millis": 0, 
      "current": 0 
     }, 
     "search": { 
      "open_contexts": 0, 
      "query_total": 55, 
      "query_time_in_millis": 294, 
      "query_current": 0, 
      "fetch_total": 36, 
      "fetch_time_in_millis": 3209, 
      "fetch_current": 0, 
      "scroll_total": 0, 
      "scroll_time_in_millis": 0, 
      "scroll_current": 0, 
      "suggest_total": 0, 
      "suggest_time_in_millis": 0, 
      "suggest_current": 0 
     }, 
     "merges": { 
      "current": 0, 
      "current_docs": 0, 
      "current_size_in_bytes": 0, 
      "total": 76, 
      "total_time_in_millis": 350987, 
      "total_docs": 45409, 
      "total_size_in_bytes": 4027595474, 
      "total_stopped_time_in_millis": 0, 
      "total_throttled_time_in_millis": 48633, 
      "total_auto_throttle_in_bytes": 82233108 
     }, 
     "refresh": { 
      "total": 857, 
      "total_time_in_millis": 2994887, 
      "listeners": 0 
     }, 
     "flush": { 
      "total": 15, 
      "total_time_in_millis": 291939 
     }, 
     "warmer": { 
      "current": 0, 
      "total": 876, 
      "total_time_in_millis": 534 
     }, 
     "query_cache": { 
      "memory_size_in_bytes": 0, 
      "total_count": 0, 
      "hit_count": 0, 
      "miss_count": 0, 
      "cache_size": 0, 
      "cache_count": 0, 
      "evictions": 0 
     }, 
     "fielddata": { 
      "memory_size_in_bytes": 24808, 
      "evictions": 0 
     }, 
     "completion": { 
      "size_in_bytes": 0 
     }, 
     "segments": { 
      "count": 139, 
      "memory_in_bytes": 186032131, 
      "terms_memory_in_bytes": 185758725, 
      "stored_fields_memory_in_bytes": 43976, 
      "term_vectors_memory_in_bytes": 0, 
      "norms_memory_in_bytes": 77888, 
      "points_memory_in_bytes": 714, 
      "doc_values_memory_in_bytes": 150828, 
      "index_writer_memory_in_bytes": 1316180948, 
      "version_map_memory_in_bytes": 42250, 
      "fixed_bit_set_memory_in_bytes": 0, 
      "max_unsafe_auto_id_timestamp": -1, 
      "file_sizes": { 

      } 
     }, 
     "translog": { 
      "operations": 11997, 
      "size_in_bytes": 5555179 
     }, 
     "request_cache": { 
      "memory_size_in_bytes": 0, 
      "evictions": 0, 
      "hit_count": 195, 
      "miss_count": 195 
     }, 
     "recovery": { 
      "current_as_source": 0, 
      "current_as_target": 0, 
      "throttle_time_in_millis": 0 
     } 
     } 
    } 
    } 
} 

EDIT 1:我找到了这个问题的根源。看起来罪魁祸首是边界框(placeBoundingBox),尽管我不知道为什么。

一旦我从摄取的数据中删除边界框,索引就是正常大小(600 个文档 -> 550kb);但只要我重新加入边界框(使用全新的索引),大小就会暴涨(3,593 个文档 -> 1.6GB),而其中只有 84 个文档包含边界框。

下面是边界框的 JSON:

"placeBoundingBox": { 
    "type": "polygon", 
    "coordinates": [ 
     [ 
     [ 
      -71.191421, 
      42.227797 
     ], 
     [ 
      -71.191421, 
      42.399542 
     ], 
     [ 
      -70.986004, 
      42.399542 
     ], 
     [ 
      -70.986004, 
      42.227797 
     ], 
     [ 
      -71.191421, 
      42.227797 
     ] 
     ] 
    ] 
    } 

与边界框相关的映射(通过调用 GET /INDEX_NAME 获得):

"placeBoundingBox": { 
    "type": "geo_shape", 
    "tree": "quadtree", 
    "precision": "1.0m" 
    } 

为了证明该映射确实有效,并且确实创建了正确的 geo_shape(尽管 Kibana 不会将其识别为 geo_shape),我运行了以下查询并成功得到了命中结果:

GET /_search 
{ 
    "query": { 
    "bool": { 
     "must": { 
     "match_all": { 

     } 
     }, 
     "filter": { 
     "geo_shape": { 
      "placeBoundingBox": { 
      "shape": { 
       "type": "polygon", 
       "coordinates": [ 
       [ 
        [ 
        -71.191421, 
        42.227797 
        ], 
        [ 
        -71.191421, 
        42.399542 
        ], 
        [ 
        -70.986004, 
        42.399542 
        ], 
        [ 
        -70.986004, 
        42.227797 
        ], 
        [ 
        -71.191421, 
        42.227797 
        ] 
       ] 
       ] 
      }, 
      "relation": "within" 
      } 
     } 
     } 
    } 
    } 
} 

我希望保留边界框。是映射有问题,还是数据有问题?1.0m 的精度是不是太细了?

+0

有几个问题:你之前用例的分片/副本数是多少?因为现在你有 5 个分片和 5 个副本(如果我没有弄错的话),这可能就是你看到的大小如此之大的原因 – Mysterion

+0

我添加副本字段到映射,并将其设置为1,没有变化。 – Brooks

+0

你能把它设置为0吗?它会改变大小吗? – Mysterion

回答

0

问题出在映射的精度(precision)上,而且只是一个笔误(我们的 Elasticsearch 2.x 索引使用的精度是 1km)。一个小小的字母造成了天壤之别……

1 米("1m")的精度会产生一个非常臃肿的索引。

从映射中完全删除 "precision" 字段后,精度会默认为 50 米,索引的大小也就恢复正常了。