---
# PyTorchJob "fsdp": elastic 2-node Llama-2-7b FSDP fine-tune using
# torchrun (etcd rendezvous), EFA networking, and 8 GPUs per worker.
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
  name: fsdp
spec:
  elasticPolicy:
    rdzvBackend: etcd
    rdzvHost: etcd
    rdzvPort: 2379
    minReplicas: 1
    maxReplicas: 96
    maxRestarts: 100
    # metrics:
    #   - type: Resource
    #     resource:
    #       name: cpu
    #       target:
    #         type: Utilization
    #         averageUtilization: 80
  pytorchReplicaSpecs:
    Worker:
      replicas: 2
      restartPolicy: OnFailure
      template:
        metadata:
          labels:
            app: fsdp
            nvidia-devtools-sidecar-injector: enabled
        spec:
          volumes:
            # Host /dev/shm is mounted so NCCL/PyTorch dataloader shared
            # memory is not capped by the default 64Mi container shm.
            - name: shmem
              # emptyDir:
              #   medium: Memory
              hostPath:
                path: /dev/shm
          # nodeSelector:
          #   node.kubernetes.io/instance-type: "p5.48xlarge"
          containers:
            - name: pytorch
              image: 159553542841.dkr.ecr.us-west-2.amazonaws.com/fsdp:llama2-efa-main-02-13
              imagePullPolicy: Always
              resources:
                requests:
                  # 8 GPUs per node, matching --nproc_per_node=8 in the
                  # torchrun command below. (The original manifest left this
                  # value empty, which parses as YAML null and is rejected
                  # by the Kubernetes API server.)
                  nvidia.com/gpu: 8
                  vpc.amazonaws.com/efa: 4
                limits:
                  nvidia.com/gpu: 8
                  vpc.amazonaws.com/efa: 4
              env:
                # for P5 FI_* should be commented out
                # - name: LOGLEVEL
                #   value: "DEBUG"
                - name: FI_PROVIDER
                  value: efa
                - name: FI_EFA_USE_DEVICE_RDMA
                  value: "1"
                - name: FI_EFA_FORK_SAFE
                  value: "1"
                - name: FI_LOG_LEVEL
                  value: "1"
                - name: FI_EFA_ENABLE_SHM_TRANSFER
                  value: "1"
                # - name: NCCL_DEBUG
                #   value: "INFO"
                # NOTE(review): recent PyTorch renames this to
                # TORCH_NCCL_ASYNC_ERROR_HANDLING; the old name still works
                # but emits a deprecation warning — confirm for your version.
                - name: NCCL_ASYNC_ERROR_HANDLING
                  value: "1"
                # - name: NCCL_IGNORE_DISABLED_P2P
                #   value: "1"
                # SECURITY: the original manifest committed a live Hugging
                # Face token ("hf_...") in plain text here. That token must be
                # rotated, and the value sourced from a Secret instead:
                #   kubectl create secret generic hf-token \
                #     --from-literal=token=<new-token>
                - name: HF_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: hf-token
                      key: token
              command:
                - bash
                - -c
                - "torchrun --nproc_per_node=8 --nnodes=2 examples/finetuning.py --num_epochs=1 --batch_size_training=3 --enable_fsdp --pure_bf16 --model_name meta-llama/Llama-2-7b-hf --output_dir ."
              volumeMounts:
                - name: shmem
                  mountPath: /dev/shm
# Dockerfile.llama2-efa
# Builds the fsdp training image: CUDA 12.2 base + AWS EFA userspace stack
# (libfabric, NCCL, aws-ofi-nccl plugin) + llama-recipes for FSDP fine-tuning.
# NOTE: the original paste included a shell prompt line and GitHub UI residue
# ("0 commit comments") which are not Dockerfile syntax and have been removed.
FROM nvidia/cuda:12.2.2-devel-ubuntu22.04

ARG EFA_INSTALLER_VERSION=1.29.1
ARG AWS_OFI_NCCL_VERSION=v1.7.3-aws
ARG NCCL_TESTS_VERSION=master
ARG NCCL_VERSION=2.18.5

RUN apt-get update -y
# Remove the base image's InfiniBand/NCCL packages so the EFA installer and
# the locally-built NCCL below do not conflict with them.
RUN apt-get remove -y --allow-change-held-packages \
    libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 libnccl2 libnccl-dev

# Drop the bundled HPC-X MPI; the EFA installer provides Open MPI instead.
RUN rm -rf /opt/hpcx \
    && rm -rf /usr/local/mpi \
    && rm -f /etc/ld.so.conf.d/hpcx.conf \
    && ldconfig
ENV OPAL_PREFIX=

RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
    git \
    gcc \
    vim \
    kmod \
    openssh-client \
    openssh-server \
    build-essential \
    curl \
    autoconf \
    libtool \
    gdb \
    automake \
    python3-distutils \
    cmake \
    apt-utils \
    devscripts \
    debhelper \
    libsubunit-dev \
    check \
    pkg-config

# Passwordless, host-key-check-free SSH between pods (multi-node launchers).
RUN mkdir -p /var/run/sshd
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
    echo "    UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/lib:$LD_LIBRARY_PATH
ENV PATH /opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH

RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \
    && python3 /tmp/get-pip.py \
    && pip3 install awscli pynvml

#################################################
## Install NVIDIA GDRCopy
#RUN git clone https://github.com/NVIDIA/gdrcopy.git /opt/gdrcopy \
#    && cd /opt/gdrcopy \
#    && make lib_install install \
#    && cd /opt/gdrcopy/tests \
#    && make \
#    && mv copylat copybw sanity apiperf /usr/bin/

#################################################
## Install EFA installer
# --skip-kmod / --skip-limit-conf: the kernel module and ulimits come from the
# host; only the userspace libfabric stack is needed inside the container.
RUN cd $HOME \
    && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && cd aws-efa-installer \
    && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
    && rm -rf $HOME/aws-efa-installer

###################################################
## Install NCCL
# Built for A100 (sm_80), A10/A40 (sm_86) and H100 (sm_90).
RUN git clone https://github.com/NVIDIA/nccl -b v${NCCL_VERSION}-1 /opt/nccl \
    && cd /opt/nccl \
    && make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \
    NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90"

###################################################
## Install AWS-OFI-NCCL plugin
RUN apt-get install libtool autoconf cmake nasm unzip pigz parallel nfs-common build-essential hwloc libhwloc-dev libjemalloc2 libnuma-dev numactl libjemalloc-dev preload htop iftop liblapack-dev libgfortran5 ipcalc wget curl devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms -y
RUN export OPAL_PREFIX="" \
    && git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
    && cd /opt/aws-ofi-nccl \
    && git checkout ${AWS_OFI_NCCL_VERSION} \
    && ./autogen.sh \
    && ./configure --prefix=/opt/aws-ofi-nccl/install \
        --with-libfabric=/opt/amazon/efa/ \
        --with-cuda=/usr/local/cuda \
        --with-nccl=/opt/nccl/build \
        --with-mpi=/opt/amazon/openmpi/ \
    && make -j $(nproc) && make install
###################################################
## Install fsdp

RUN mkdir -p /workspace/

WORKDIR /workspace

#RUN git clone -b flop_counter https://github.com/facebookresearch/llama-recipes.git
#RUN git clone -b flop_counter_gc https://github.com/facebookresearch/llama-recipes.git
RUN git clone https://github.com/facebookresearch/llama-recipes.git

WORKDIR /workspace/llama-recipes

RUN pip3 install -U pip setuptools

# Pinned before requirements.txt so they are not resolved to newer versions.
RUN pip3 install fsspec==2023.1.0
RUN pip3 install huggingface_hub==0.17.0
RUN pip3 install -r requirements.txt

RUN pip3 install -e .

RUN pip3 install tabulate

RUN pip3 install protobuf

# Required by torchrun's etcd rendezvous backend (see the PyTorchJob manifest).
RUN pip3 install python-etcd

#RUN pip3 uninstall -y torch
#RUN pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121

ENV PYTHONPATH="${PYTHONPATH}:/workspace/llama-recipes/src"