You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
[root@master dlrover]# kubectl -n dlrover logs elasticjob-fine-tuning-llama2-dlrover-master
[2024-03-19 08:38:52,474] [INFO] [args.py:65:print_args] port = 50001
[2024-03-19 08:38:52,474] [INFO] [args.py:65:print_args] node_num = 1
[2024-03-19 08:38:52,474] [INFO] [args.py:65:print_args] job_name = fine-tuning-llama2
[2024-03-19 08:38:52,474] [INFO] [args.py:65:print_args] namespace = dlrover
[2024-03-19 08:38:52,474] [INFO] [args.py:65:print_args] platform = pyk8s
[2024-03-19 08:38:52,474] [INFO] [factory.py:34:new_job_args] New pyk8s JobParameters
[2024-03-19 08:38:52,808] [INFO] [kubernetes.py:136:init] Load the incluster config.
[2024-03-19 08:39:07,849] [ERROR] [kubernetes.py:113:wrapper] Fail to execute get_custom_resource: (403)
Reason: Forbidden
HTTP response headers: HTTPHeaderDict({'Cache-Control': 'no-cache, private', 'Content-Type': 'application/json', 'X-Content-Type-Options': 'nosniff', 'X-Kubernetes-Pf-Flowschema-Uid': 'f6b32834-c435-4ac5-ba60-c530b33bf5e5', 'X-Kubernetes-Pf-Prioritylevel-Uid': '48976338-b5fb-422d-9927-a7eb323d4ac0', 'Date': 'Tue, 19 Mar 2024 08:39:10 GMT', 'Content-Length': '428'})
HTTP response body: {"kind":"Status","apiVersion":"v1","metadata":{},"status":"Failure","message":"elasticjobs.elastic.iml.github.io "fine-tuning-llama2" is forbidden: User "system:serviceaccount:dlrover:default" cannot get resource "elasticjobs" in API group "elastic.iml.github.io" in the namespace "dlrover"","reason":"Forbidden","details":{"name":"fine-tuning-llama2","group":"elastic.iml.github.io","kind":"elasticjobs"},"code":403}
[2024-03-19 08:39:27,892] [ERROR] [kubernetes.py:113:wrapper] Fail to execute get_custom_resource: (403)
Reason: Forbidden
HTTP response headers: HTTPHeaderDict({'Cache-Control': 'no-cache, private', 'Content-Type': 'application/json', 'X-Content-Type-Options': 'nosniff', 'X-Kubernetes-Pf-Flowschema-Uid': 'f6b32834-c435-4ac5-ba60-c530b33bf5e5', 'X-Kubernetes-Pf-Prioritylevel-Uid': '48976338-b5fb-422d-9927-a7eb323d4ac0', 'Date': 'Tue, 19 Mar 2024 08:39:30 GMT', 'Content-Length': '428'})
HTTP response body: {"kind":"Status","apiVersion":"v1","metadata":{},"status":"Failure","message":"elasticjobs.elastic.iml.github.io "fine-tuning-llama2" is forbidden: User "system:serviceaccount:dlrover:default" cannot get resource "elasticjobs" in API group "elastic.iml.github.io" in the namespace "dlrover"","reason":"Forbidden","details":{"name":"fine-tuning-llama2","group":"elastic.iml.github.io","kind":"elasticjobs"},"code":403}
[2024-03-19 08:39:47,929] [ERROR] [kubernetes.py:113:wrapper] Fail to execute get_custom_resource: (403)
Reason: Forbidden
HTTP response headers: HTTPHeaderDict({'Cache-Control': 'no-cache, private', 'Content-Type': 'application/json', 'X-Content-Type-Options': 'nosniff', 'X-Kubernetes-Pf-Flowschema-Uid': 'f6b32834-c435-4ac5-ba60-c530b33bf5e5', 'X-Kubernetes-Pf-Prioritylevel-Uid': '48976338-b5fb-422d-9927-a7eb323d4ac0', 'Date': 'Tue, 19 Mar 2024 08:39:51 GMT', 'Content-Length': '428'})
HTTP response body: {"kind":"Status","apiVersion":"v1","metadata":{},"status":"Failure","message":"elasticjobs.elastic.iml.github.io "fine-tuning-llama2" is forbidden: User "system:serviceaccount:dlrover:default" cannot get resource "elasticjobs" in API group "elastic.iml.github.io" in the namespace "dlrover"","reason":"Forbidden","details":{"name":"fine-tuning-llama2","group":"elastic.iml.github.io","kind":"elasticjobs"},"code":403}
Traceback (most recent call last):
File "/usr/local/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/local/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.8/site-packages/dlrover/python/master/main.py", line 64, in <module>
os._exit(main())
File "/usr/local/lib/python3.8/site-packages/dlrover/python/master/main.py", line 59, in main
exit_code = run(args)
File "/usr/local/lib/python3.8/site-packages/dlrover/python/master/main.py", line 39, in run
job_args.initilize()
File "/usr/local/lib/python3.8/site-packages/dlrover/python/scheduler/kubernetes.py", line 387, in initilize
job = self._retry_to_get_job(k8s_client)
File "/usr/local/lib/python3.8/site-packages/dlrover/python/scheduler/kubernetes.py", line 463, in _retry_to_get_job
raise ValueError("Cannot get the training job %s" % self.job_name)
ValueError: Cannot get the training job fine-tuning-llama2
[root@master dlrover]#
[root@master dlrover]# kubectl -n dlrover get deployment
NAME READY UP-TO-DATE AVAILABLE AGE
dlrover-controller-manager 1/1 1 1 25m
[root@master dlrover]#
[root@master dlrover]# kubectl -n dlrover get crd elasticjobs.elastic.iml.github.io
NAME CREATED AT
elasticjobs.elastic.iml.github.io 2024-03-19T08:24:29Z
[root@master dlrover]#
The text was updated successfully, but these errors were encountered:
You should execute `kubectl -n dlrover apply -f dlrover/go/operator/config/manifests/bases/default-role.yaml` to grant the DLRover master permission to access CRDs.
执行 examples/pytorch/llama2/elastic_job.yaml 中的 demo 时，运行报错了：
kubectl -n dlrover apply -f examples/pytorch/llama2/elastic_job.yaml
[root@master dlrover]# kubectl -n dlrover logs elasticjob-fine-tuning-llama2-dlrover-master
[2024-03-19 08:38:52,474] [INFO] [args.py:65:print_args] port = 50001
[2024-03-19 08:38:52,474] [INFO] [args.py:65:print_args] node_num = 1
[2024-03-19 08:38:52,474] [INFO] [args.py:65:print_args] job_name = fine-tuning-llama2
[2024-03-19 08:38:52,474] [INFO] [args.py:65:print_args] namespace = dlrover
[2024-03-19 08:38:52,474] [INFO] [args.py:65:print_args] platform = pyk8s
[2024-03-19 08:38:52,474] [INFO] [factory.py:34:new_job_args] New pyk8s JobParameters
[2024-03-19 08:38:52,808] [INFO] [kubernetes.py:136:init] Load the incluster config.
[2024-03-19 08:39:07,849] [ERROR] [kubernetes.py:113:wrapper] Fail to execute get_custom_resource: (403)
Reason: Forbidden
HTTP response headers: HTTPHeaderDict({'Cache-Control': 'no-cache, private', 'Content-Type': 'application/json', 'X-Content-Type-Options': 'nosniff', 'X-Kubernetes-Pf-Flowschema-Uid': 'f6b32834-c435-4ac5-ba60-c530b33bf5e5', 'X-Kubernetes-Pf-Prioritylevel-Uid': '48976338-b5fb-422d-9927-a7eb323d4ac0', 'Date': 'Tue, 19 Mar 2024 08:39:10 GMT', 'Content-Length': '428'})
HTTP response body: {"kind":"Status","apiVersion":"v1","metadata":{},"status":"Failure","message":"elasticjobs.elastic.iml.github.io "fine-tuning-llama2" is forbidden: User "system:serviceaccount:dlrover:default" cannot get resource "elasticjobs" in API group "elastic.iml.github.io" in the namespace "dlrover"","reason":"Forbidden","details":{"name":"fine-tuning-llama2","group":"elastic.iml.github.io","kind":"elasticjobs"},"code":403}
[2024-03-19 08:39:27,892] [ERROR] [kubernetes.py:113:wrapper] Fail to execute get_custom_resource: (403)
Reason: Forbidden
HTTP response headers: HTTPHeaderDict({'Cache-Control': 'no-cache, private', 'Content-Type': 'application/json', 'X-Content-Type-Options': 'nosniff', 'X-Kubernetes-Pf-Flowschema-Uid': 'f6b32834-c435-4ac5-ba60-c530b33bf5e5', 'X-Kubernetes-Pf-Prioritylevel-Uid': '48976338-b5fb-422d-9927-a7eb323d4ac0', 'Date': 'Tue, 19 Mar 2024 08:39:30 GMT', 'Content-Length': '428'})
HTTP response body: {"kind":"Status","apiVersion":"v1","metadata":{},"status":"Failure","message":"elasticjobs.elastic.iml.github.io "fine-tuning-llama2" is forbidden: User "system:serviceaccount:dlrover:default" cannot get resource "elasticjobs" in API group "elastic.iml.github.io" in the namespace "dlrover"","reason":"Forbidden","details":{"name":"fine-tuning-llama2","group":"elastic.iml.github.io","kind":"elasticjobs"},"code":403}
[2024-03-19 08:39:47,929] [ERROR] [kubernetes.py:113:wrapper] Fail to execute get_custom_resource: (403)
Reason: Forbidden
HTTP response headers: HTTPHeaderDict({'Cache-Control': 'no-cache, private', 'Content-Type': 'application/json', 'X-Content-Type-Options': 'nosniff', 'X-Kubernetes-Pf-Flowschema-Uid': 'f6b32834-c435-4ac5-ba60-c530b33bf5e5', 'X-Kubernetes-Pf-Prioritylevel-Uid': '48976338-b5fb-422d-9927-a7eb323d4ac0', 'Date': 'Tue, 19 Mar 2024 08:39:51 GMT', 'Content-Length': '428'})
HTTP response body: {"kind":"Status","apiVersion":"v1","metadata":{},"status":"Failure","message":"elasticjobs.elastic.iml.github.io "fine-tuning-llama2" is forbidden: User "system:serviceaccount:dlrover:default" cannot get resource "elasticjobs" in API group "elastic.iml.github.io" in the namespace "dlrover"","reason":"Forbidden","details":{"name":"fine-tuning-llama2","group":"elastic.iml.github.io","kind":"elasticjobs"},"code":403}
Traceback (most recent call last):
File "/usr/local/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/local/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.8/site-packages/dlrover/python/master/main.py", line 64, in <module>
os._exit(main())
File "/usr/local/lib/python3.8/site-packages/dlrover/python/master/main.py", line 59, in main
exit_code = run(args)
File "/usr/local/lib/python3.8/site-packages/dlrover/python/master/main.py", line 39, in run
job_args.initilize()
File "/usr/local/lib/python3.8/site-packages/dlrover/python/scheduler/kubernetes.py", line 387, in initilize
job = self._retry_to_get_job(k8s_client)
File "/usr/local/lib/python3.8/site-packages/dlrover/python/scheduler/kubernetes.py", line 463, in _retry_to_get_job
raise ValueError("Cannot get the training job %s" % self.job_name)
ValueError: Cannot get the training job fine-tuning-llama2
[root@master dlrover]#
[root@master dlrover]# kubectl -n dlrover get deployment
NAME READY UP-TO-DATE AVAILABLE AGE
dlrover-controller-manager 1/1 1 1 25m
[root@master dlrover]#
[root@master dlrover]# kubectl -n dlrover get crd elasticjobs.elastic.iml.github.io
NAME CREATED AT
elasticjobs.elastic.iml.github.io 2024-03-19T08:24:29Z
[root@master dlrover]#
The text was updated successfully, but these errors were encountered: