Elasticsearch 节点运维的那些套路,这篇是易懂的
一、节点相关命令
1、查看节点基本信息
GET /_nodes/<node_id>
GET /_nodes/<node_id>/process
返回 Response:
{ "_nodes" : { "total" : 1, "successful" : 1, "failed" : 0 }, "cluster_name" : "es-xxxx", "nodes" : { "fdaOV16OQPq6-AUVihmq4A" : { "name" : "1626803143000145632", "transport_address" : "xx.0.96.22:9300", "host" : "xx.0.96.22", "ip" : "xx.0.96.22", "version" : "7.10.1", "build_flavor" : "default", "build_type" : "tar", "build_hash" : "119c2106d7bd8d206aa3b65dc43c87b8aa590b2b", "roles" : [ "master", "ml", "remote_cluster_client" ], "attributes" : { "ml.machine_memory" : "16478932992", "rack" : "cvm_4_200003", "xpack.installed" : "true", "set" : "200003", "transform.node" : "false", "ip" : "xx.20.58.221", "temperature" : "hot", "ml.max_open_jobs" : "20", "region" : "4" }, "process" : { "refresh_interval_in_millis" : 1000, "id" : 24918, "mlockall" : false } } }}
2、查看节点统计信息
GET /_nodes/stats
GET /_nodes/<node_id>/stats
GET /_nodes/stats/<metric>
总之,通过该 API,我们能够全方位获取到节点维度相关的各种指标信息。对于我们排查集群问题非常有帮助,我们还了解到腾讯云 ES 的部分大客户,通过定期去请求该API,将返回信息输出到对应的监控系统,来自己做更加细粒度的集群监控。
该API默认是返回节点所有的统计指标信息,如果我们需要查看部分指标或者特定指标统计信息,也可以在 API 中进行指定,如我们想查看特定节点的 JVM 使用情况:
GET /_nodes/1626803143000145632/stats/jvm
返回 Response 如下:
{ "_nodes" : { "total" : 1, "successful" : 1, "failed" : 0 }, "cluster_name" : "es-xxx", "nodes" : { "fdaOV16OQPq6-AUVihmq4A" : { "timestamp" : 1639878427713, "name" : "1626803143000145632", "transport_address" : "xx.0.96.22:9300", "host" : "xx.0.96.22", "ip" : "xx.0.96.22:9300", "roles" : [ "master", "ml", "remote_cluster_client" ], "attributes" : { "ml.machine_memory" : "16478932992", "rack" : "cvm_4_200003", "xpack.installed" : "true", "set" : "200003", "transform.node" : "false", "ip" : "xx.20.58.221", "temperature" : "hot", "ml.max_open_jobs" : "20", "region" : "4" }, "jvm" : { "timestamp" : 1639878427713, "uptime_in_millis" : 13037976328, "mem" : { "heap_used_in_bytes" : 214161760, "heap_used_percent" : 2, "heap_committed_in_bytes" : 8555069440, "heap_max_in_bytes" : 8555069440, "non_heap_used_in_bytes" : 178981224, "non_heap_committed_in_bytes" : 195989504, "pools" : { "young" : { "used_in_bytes" : 104047328, "max_in_bytes" : 279183360, "peak_used_in_bytes" : 279183360, "peak_max_in_bytes" : 279183360 }, "survivor" : { "used_in_bytes" : 545616, "max_in_bytes" : 34865152, "peak_used_in_bytes" : 34865144, "peak_max_in_bytes" : 34865152 }, "old" : { "used_in_bytes" : 109568816, "max_in_bytes" : 8241020928, "peak_used_in_bytes" : 127845408, "peak_max_in_bytes" : 8241020928 } } }, "threads" : { "count" : 41, "peak_count" : 43 }, "gc" : { "collectors" : { "young" : { "collection_count" : 4718, "collection_time_in_millis" : 162825 }, "old" : { "collection_count" : 3, "collection_time_in_millis" : 359 } } }, "buffer_pools" : { "direct" : { "count" : 16, "used_in_bytes" : 4300808, "total_capacity_in_bytes" : 4300807 }, "mapped" : { "count" : 0, "used_in_bytes" : 0, "total_capacity_in_bytes" : 0 } }, "classes" : { "current_loaded_count" : 21255, "total_loaded_count" : 21361, "total_unloaded_count" : 106 } } } }}
GET /_nodes/1626803143000145632/stats/indices/merge
返回 Response 如下:
{ "_nodes" : { "total" : 1, "successful" : 1, "failed" : 0 }, "cluster_name" : "es-xxx", "nodes" : { "fdaOV16OQPq6-AUVihmq4A" : { "timestamp" : 1639878633590, "name" : "1626803143000145632", "transport_address" : "xx.0.96.22:9300", "host" : "xx.0.96.22", "ip" : "xx.0.96.22:9300", "roles" : [ "master", "ml", "remote_cluster_client" ], "attributes" : { "ml.machine_memory" : "16478932992", "rack" : "cvm_4_200003", "xpack.installed" : "true", "set" : "200003", "transform.node" : "false", "ip" : "xx.20.58.221", "temperature" : "hot", "ml.max_open_jobs" : "20", "region" : "4" }, "indices" : { "merges" : { "current" : 0, "current_docs" : 0, "current_size_in_bytes" : 0, "total" : 0, "total_time_in_millis" : 0, "total_docs" : 0, "total_size_in_bytes" : 0, "total_stopped_time_in_millis" : 0, "total_throttled_time_in_millis" : 0, "total_auto_throttle_in_bytes" : 0 } } } }}
以及查看节点索引 segment 和 translog 统计信息:
GET /_nodes/1626803143000145632/stats/indices/segments,translog
{ "_nodes" : { "total" : 1, "successful" : 1, "failed" : 0 }, "cluster_name" : "es-xxxx", "nodes" : { "fdaOV16OQPq6-AUVihmq4A" : { "timestamp" : 1639878746911, "name" : "1626803143000145632", "transport_address" : "xx.0.96.22:9300", "host" : "xx.0.96.22", "ip" : "xx.0.96.22:9300", "roles" : [ "master", "ml", "remote_cluster_client" ], "attributes" : { "ml.machine_memory" : "16478932992", "rack" : "cvm_4_200003", "xpack.installed" : "true", "set" : "200003", "transform.node" : "false", "ip" : "xx.20.58.221", "temperature" : "hot", "ml.max_open_jobs" : "20", "region" : "4" }, "indices" : { "segments" : { "count" : 0, "memory_in_bytes" : 0, "terms_memory_in_bytes" : 0, "stored_fields_memory_in_bytes" : 0, "term_vectors_memory_in_bytes" : 0, "norms_memory_in_bytes" : 0, "points_memory_in_bytes" : 0, "doc_values_memory_in_bytes" : 0, "index_writer_memory_in_bytes" : 0, "version_map_memory_in_bytes" : 0, "fixed_bit_set_memory_in_bytes" : 0, "max_unsafe_auto_id_timestamp" : -9223372036854775808, "file_sizes" : { } }, "translog" : { "operations" : 0, "size_in_bytes" : 0, "uncommitted_operations" : 0, "uncommitted_size_in_bytes" : 0, "earliest_last_modified_age" : 0 } } } }}
也可以通过该API来查看每个节点上所分配的索引存储信息:
GET /_nodes/stats/indices/store
除了通过 GET _nodes/stats API 来查看节点的统计信息,ES 官方文档中还提供了另外一个 API,也可以获取到基本的统计信息:
GET /_cat/nodes
返回 Response:
ip heap.percent ram.percent cpu load_1m load_5m load_15m node.role master name
xx.0.96.9 30 70 0 0.01 0.05 0.05 cdhilrstw - 1626803143000145432
xx.0.96.24 2 99 1 0.11 0.09 0.07 lmr - 1626803143000145832
xx.0.96.49 14 99 1 0.87 0.24 0.12 lmr * 1626803143000145732
xx.0.96.13 14 70 0 0.16 0.08 0.06 cdhilrstw - 1626803143000145532
xx.0.96.22 3 99 2 0.13 0.12 0.13 lmr - 1626803143000145632
xx.0.96.20 cdhilrstw - 1626803143000145332
GET _cat/nodes?h=name,segments.memory,segments.index_writer_memory,heap.percent,fielddata.memory_size,query_cache.memory_size,request_cache.memory_size&v
name segments.memory segments.index_writer_memory heap.percent fielddata.memory_size query_cache.memory_size
1626803143000145832 0b 0b 3 0b 0b
1626803143000145532 15.9mb 58mb 17 3.7kb 3.9kb
1626803143000145332
1626803143000145432 15.7mb 50.7mb 28 3.4kb 10.2kb
1626803143000145732 0b 0b 13 0b 0b
1626803143000145632 0b 0b 3 0b 0b
3、查看节点线程池占用情况
GET /_cat/thread_pool
图1. 集群出现查询拒绝
图2. 集群节点查询队列被打满
GET /_cat/thread_pool/search,write?v
返回 Response:
node_name name active queue rejected
1626803143000145832 search 0 0 0
1626803143000145832 write
1626803143000145532 search 0 0 0
1626803143000145532 write
1626803143000145332 search 0 0 0
1626803143000145332 write
1626803143000145432 search 0 0 0
1626803143000145432 write
1626803143000145732 search 0 0 0
1626803143000145732 write
1626803143000145632 search 0 0 0
1626803143000145632 write
如果能从如上的返回中看到 queue 值和 rejected 值比较高,就说明该节点的读写处理能力快到瓶颈了,此时应该结合 CPU 使用率来综合评估。以我们的经验来看,读写拒绝通常是由于 CPU 使用率高引起,CPU 使用率高会导致节点读写请求处理不过来,从而导致查询或 bulk 队列被打满而出现拒绝。而读写熔断通常是由于 JVM 使用率高引起。因此这里面需要针对不同的指标来进行分析。
4、查看节点热线程
GET /_nodes/hot_threads
GET /_nodes/<node_id>/hot_threads
返回 Response:
::: {1626803143000145832}{vT4YRHWdRweoouLn2fGu0g}{Pq2mklvJTvCMPlhN7OY_KQ}{xx.0.96.24}{xx.0.96.24:9300}{lmr}{ml.machine_memory=16478932992, rack=cvm_4_200003, xpack.installed=true, set=200003, transform.node=false, ip=9.20.59.20, temperature=hot, ml.max_open_jobs=20, region=4} Hot threads at 2021-12-19T02:21:12.334Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:
::: {1626803143000145532}{CaQnhaYpQw6vbabGwKaPTw}{X_yKVAz9RHCOUwnMEXKLvg}{xx.0.96.13}{xx.0.96.13:9300}{cdhilrstw}{ml.machine_memory=50299387904, rack=cvm_4_200003, xpack.installed=true, set=200003, transform.node=true, ip=9.20.57.70, temperature=hot, ml.max_open_jobs=20, region=4} Hot threads at 2021-12-19T02:21:12.335Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:
::: {1626803143000145332}{yvtpqeypTke6aFxxwYSjjA}{ZQkJVy7zQGOOY5_hAP3z_w}{xx.0.96.20}{xx.0.96.20:9300}{cdhilrstw}{ml.machine_memory=50299125760, rack=cvm_4_200003, xpack.installed=true, set=200003, transform.node=true, ip=9.20.53.190, temperature=hot, ml.max_open_jobs=20, region=4} Hot threads at 2021-12-19T02:21:12.337Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:
::: {1626803143000145432}{hz6BqoupSuOUuWykrX5c2g}{VoANc_CJQWylc2rxT6NJTg}{xx.0.96.9}{xx.0.96.9:9300}{cdhilrstw}{ml.machine_memory=50299387904, rack=cvm_4_200003, xpack.installed=true, set=200003, transform.node=true, ip=9.20.56.176, temperature=hot, ml.max_open_jobs=20, region=4} Hot threads at 2021-12-19T02:21:12.334Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:
::: {1626803143000145732}{AexOqq25T7SRf6tzMYzO1Q}{t21NHAcyQwiJKEtV85Okzg}{xx.0.96.49}{xx.0.96.49:9300}{lmr}{ml.machine_memory=16478932992, rack=cvm_4_200003, xpack.installed=true, set=200003, transform.node=false, ip=9.20.58.203, temperature=hot, ml.max_open_jobs=20, region=4} Hot threads at 2021-12-19T02:21:12.335Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:
::: {1626803143000145632}{fdaOV16OQPq6-AUVihmq4A}{pdyApXIKQbivHe57M5UCIA}{xx.0.96.22}{xx.0.96.22:9300}{lmr}{ml.machine_memory=16478932992, rack=cvm_4_200003, xpack.installed=true, set=200003, transform.node=false, ip=9.20.58.221, temperature=hot, ml.max_open_jobs=20, region=4} Hot threads at 2021-12-19T02:21:12.335Z, interval=500ms, busiestThreads=3, ignoreIdleThreads=true:
二、节点常用命令总结
来源:https://cloud.tencent.com/developer/article/1921434
相关文章