K8s CrashLoopBackOff排障记录 191114
来自三线的随记
场景:
部署 elasticsearch:6.8.2 + k8s + statefulsets/deployments
[root@dce-con01 ~]# kubectl -n dmp get pods elasticsearch-2
NAME              READY   STATUS             RESTARTS   AGE
elasticsearch-2   0/1     CrashLoopBackOff   7          15
纠结了很久,一开始觉得是应用或者镜像的问题,瞎乱调
后来调label,调resource,都没用
pod日志还就只有一条warning
[root@dce-con01 ~]# kubectl -n dmp logs elasticsearch-2
OpenJDK 64-Bit Server VM warning: Option UseConcMarkSweepGC was deprecated in version 9.0 and will likely be removed in a future release.
我人傻了。。。本着一定要把错误复现的牛角尖劲儿,居然还去对比另一个能够运行的 yaml 文件,把其中的参数一条条替换到不能运行的 yaml 文件中,apply -> 再看结果
差点我就歇菜了
然后觉得是不是UI界面不靠谱了(一开始一直在dashboard上面只看到 crash, exited, restart 什么鬼的),看节点 systemctl status 和 journalctl 也没什么大问题
然后无意中一直执行 kubectl -n dmp get pods elasticsearch-2
等等???好像有什么不对的东西闪过了?
[root@dce-con01 ~]# kubectl -n dmp get pods elasticsearch-0
NAME              READY   STATUS    RESTARTS   AGE
elasticsearch-0   0/1     Running   1          10s
[root@dce-con01 ~]# kubectl -n dmp get pods elasticsearch-0
NAME              READY   STATUS    RESTARTS   AGE
elasticsearch-0   0/1     Running   1          11s
[root@dce-con01 ~]# kubectl -n dmp get pods elasticsearch-0
NAME              READY   STATUS      RESTARTS   AGE
elasticsearch-0   0/1     OOMKilled   1          12s
[root@dce-con01 ~]# kubectl -n dmp get pods elasticsearch-0
NAME              READY   STATUS      RESTARTS   AGE
elasticsearch-0   0/1     OOMKilled   1          13s
[root@dce-con01 ~]# kubectl -n dmp get pods elasticsearch-0
NAME              READY   STATUS      RESTARTS   AGE
elasticsearch-0   0/1     OOMKilled   1          14s
[root@dce-con01 ~]# kubectl -n dmp get pods elasticsearch-0
NAME              READY   STATUS      RESTARTS   AGE
elasticsearch-0   0/1     OOMKilled   1          15s
[root@dce-con01 ~]# kubectl -n dmp get pods elasticsearch-0
NAME              READY   STATUS      RESTARTS   AGE
elasticsearch-0   0/1     OOMKilled   1          16s
[root@dce-con01 ~]# kubectl -n dmp get pods elasticsearch-0
NAME              READY   STATUS      RESTARTS   AGE
elasticsearch-0   0/1     OOMKilled   1          18s
[root@dce-con01 ~]# kubectl -n dmp get pods elasticsearch-0
NAME              READY   STATUS             RESTARTS   AGE
elasticsearch-0   0/1     CrashLoopBackOff   1          19s
[root@dce-con01 ~]# kubectl -n dmp get pods elasticsearch-0
NAME              READY   STATUS             RESTARTS   AGE
elasticsearch-0   0/1     CrashLoopBackOff   1          20s
[root@dce-con01 ~]# kubectl -n dmp get pods elasticsearch-0
NAME              READY   STATUS             RESTARTS   AGE
elasticsearch-0   0/1     CrashLoopBackOff   1          22s
[root@dce-con01 ~]# kubectl -n dmp get pods elasticsearch-0
NAME              READY   STATUS             RESTARTS   AGE
elasticsearch-0   0/1     CrashLoopBackOff   1          23s
[root@dce-con01 ~]# kubectl -n dmp get pods elasticsearch-0
NAME              READY   STATUS             RESTARTS   AGE
elasticsearch-0   0/1     CrashLoopBackOff   1          24s
[root@dce-con01 ~]# kubectl -n dmp get pods elasticsearch-0
NAME              READY   STATUS             RESTARTS   AGE
elasticsearch-0   0/1     CrashLoopBackOff   1          25s
????OOMKilled?EXM?
[root@dce-con01 ~]# kubectl -n dmp describe pods elasticsearch-0
Name: elasticsearch-0
Namespace: dmp
Node: dce-node03/192.168.110.55
Start Time: Thu, 14 Nov 2019 14:10:54 +0800
Labels: app=elasticsearch
controller-revision-hash=elasticsearch-d88c99697
statefulset.kubernetes.io/pod-name=elasticsearch-0
Annotations: <none>
Status: Running
IP: 172.28.54.84
Controlled By: StatefulSet/elasticsearch
Init Containers:
chown:
Container ID: docker://0c38af8dc0ae715d39704287037234b491621e767095271d7d80d7554441686a
Image: 192.168.110.50/elastic/elasticsearch:6.8.2
Image ID: docker-pullable://192.168.110.50/elastic/elasticsearch@sha256:64c67fba27ddd3f2e817e5ba84a23cceb0c576ea545d8bbb9926a58937dc3c7c
Port: <none>
Host Port: <none>
Command:
/bin/bash
-c
set -e; set -x; chown elasticsearch:elasticsearch /usr/share/elasticsearch/data; for datadir in $(find /usr/share/elasticsearch/data -mindepth 1 -maxdepth 1 -not -name ".snapshot"); do
chown -R elasticsearch:elasticsearch $datadir;
done; chown elasticsearch:elasticsearch /usr/share/elasticsearch/logs; for logfile in $(find /usr/share/elasticsearch/logs -mindepth 1 -maxdepth 1 -not -name ".snapshot"); do
chown -R elasticsearch:elasticsearch $logfile;
done
State: Terminated
Reason: Completed
Exit Code: 0
Started: Thu, 14 Nov 2019 14:10:56 +0800
Finished: Thu, 14 Nov 2019 14:10:57 +0800
Ready: True
Restart Count: 0
Environment: <none>
Mounts:
/var/run/secrets/kubernetes.io/serviceaccount from default-token-2n968 (ro)
init-sysctl:
Container ID: docker://31fc7f90ea1b135eb197a9959dffc01333094f46233a209e45c1964704b790a3
Image: 192.168.110.50/elastic/elasticsearch:6.8.2
Image ID: docker-pullable://192.168.110.50/elastic/elasticsearch@sha256:64c67fba27ddd3f2e817e5ba84a23cceb0c576ea545d8bbb9926a58937dc3c7c
Port: <none>
Host Port: <none>
Command:
sysctl
-w
vm.max_map_count=262144
State: Terminated
Reason: Completed
Exit Code: 0
Started: Thu, 14 Nov 2019 14:10:58 +0800
Finished: Thu, 14 Nov 2019 14:10:58 +0800
Ready: True
Restart Count: 0
Environment: <none>
Mounts:
/var/run/secrets/kubernetes.io/serviceaccount from default-token-2n968 (ro)
Containers:
elasticsearch:
Container ID: docker://5db5657b05514666514db99ded724d182b1c9017f5d4b5e5343397d55fe50086
Image: 192.168.110.50/elastic/elasticsearch:6.8.2
Image ID: docker-pullable://192.168.110.50/elastic/elasticsearch@sha256:64c67fba27ddd3f2e817e5ba84a23cceb0c576ea545d8bbb9926a58937dc3c7c
Ports: 9200/TCP, 9300/TCP
Host Ports: 0/TCP, 0/TCP
State: Terminated
Reason: OOMKilled
Exit Code: 137
Started: Thu, 14 Nov 2019 14:14:21 +0800
Finished: Thu, 14 Nov 2019 14:14:23 +0800
Last State: Terminated
Reason: OOMKilled
Exit Code: 137
Started: Thu, 14 Nov 2019 14:12:51 +0800
Finished: Thu, 14 Nov 2019 14:12:53 +0800
Ready: False
Restart Count: 5
Limits:
cpu: 1
memory: 4Gi
Requests:
cpu: 1
memory: 4Gi
Liveness: http-get http://:9200/_cluster/health%3Flocal=true delay=60s timeout=5s period=20s #success=1 #failure=3
Readiness: http-get http://:9200/_cluster/health%3Flocal=true delay=60s timeout=5s period=20s #success=1 #failure=3
Environment:
ES_JAVA_OPTS: -Xms4g -Xmx4g
cluster.name: es
node.name: ${HOSTNAME}
bootstrap.memory_lock: false
discovery.zen.ping.unicast.hosts: elasticsearch-discovery
discovery.zen.minimum_master_nodes: 2
discovery.zen.ping_timeout: 5s
node.master: true
node.data: true
node.ingest: true
Mounts:
/var/run/secrets/kubernetes.io/serviceaccount from default-token-2n968 (ro)
Conditions:
Type Status
Initialized True
Ready False
ContainersReady False
PodScheduled True
Volumes:
default-token-2n968:
Type: Secret (a volume populated by a Secret)
SecretName: default-token-2n968
Optional: false
QoS Class: Guaranteed
Node-Selectors: <none>
Tolerations: node.kubernetes.io/not-ready:NoExecute for 300s
node.kubernetes.io/unreachable:NoExecute for 300s
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Scheduled 3m33s default-scheduler Successfully assigned dmp/elasticsearch-0 to dce-node03
Normal Pulled 3m31s kubelet, dce-node03 Container image "192.168.110.50/elastic/elasticsearch:6.8.2" already present on machine
Normal Created 3m31s kubelet, dce-node03 Created container chown
Normal Started 3m31s kubelet, dce-node03 Started container chown
Normal Pulled 3m30s kubelet, dce-node03 Container image "192.168.110.50/elastic/elasticsearch:6.8.2" already present on machine
Normal Created 3m29s kubelet, dce-node03 Created container init-sysctl
Normal Started 3m29s kubelet, dce-node03 Started container init-sysctl
Normal Created 3m3s (x3 over 3m28s) kubelet, dce-node03 Created container elasticsearch
Normal Started 3m3s (x3 over 3m28s) kubelet, dce-node03 Started container elasticsearch
Warning BackOff 2m43s (x5 over 3m21s) kubelet, dce-node03 Back-off restarting failed container
Normal Pulling 2m28s (x4 over 3m28s) kubelet, dce-node03 Pulling image "192.168.110.50/elastic/elasticsearch:6.8.2"
Normal Pulled 2m28s (x4 over 3m28s) kubelet, dce-node03 Successfully pulled image "192.168.110.50/elastic/elasticsearch:6.8.2"
[root@dce-con01 ~]#
如describe所示,它真的几乎就是一闪而过了(容器从 Started 到 Finished 只有 2 秒),我滴龟龟
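然后把Environment里面的java相关变量 -Xms4g -Xmx4g 减一半就好了。原因:容器的 memory limit 只有 4Gi,而 JVM 堆本身就设成了 4g,再加上堆外内存(元空间、线程栈、direct buffer 等),进程占用的总内存必然超过 limit,于是一启动就被 OOMKilled(Exit Code 137)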
这里面有关JVM相关知识,先mark一下吧,待啃
【我太菜了】