Skip to content

Commit 44e448e

Browse files
authored
Nsight (#343)
* Added NCCL Nsight files Signed-off-by: Ankur Srivastava <awsankur@amazon.com> * Added NCCL Nsight files Signed-off-by: Ankur Srivastava <awsankur@amazon.com> * Updateed nccl and readme and fsdp Signed-off-by: Ankur Srivastava <awsankur@amazon.com> * Updated NCCL Signed-off-by: Ankur Srivastava <awsankur@amazon.com> * Added almost everything Signed-off-by: Ankur Srivastava <awsankur@amazon.com> * Removed token Signed-off-by: Ankur Srivastava <awsankur@amazon.com> --------- Signed-off-by: Ankur Srivastava <awsankur@amazon.com>
1 parent 765e627 commit 44e448e

22 files changed

+1269
-19
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
# Post-process an Nsight Systems capture (${NSIGHT_REPORT_NAME}.nsys-rep) into
# summary and pacing reports, using the nsys binary staged on the FSx mount.
# Recipe list: https://docs.nvidia.com/nsight-systems/UserGuide/index.html

# CUDA API call summary and synchronization-time summary.
/fsx/nsight-efa/target-linux-x64/nsys recipe cuda_api_sum --input ${NSIGHT_REPORT_NAME}.nsys-rep

/fsx/nsight-efa/target-linux-x64/nsys recipe cuda_api_sync --input ${NSIGHT_REPORT_NAME}.nsys-rep

# Per-iteration pacing for the two NCCL device kernels driving FSDP collectives
# (reduce-scatter for gradient sharding, all-gather for parameter resharding).
/fsx/nsight-efa/target-linux-x64/nsys recipe cuda_gpu_kern_pace --input ${NSIGHT_REPORT_NAME}.nsys-rep --name ncclDevKernel_ReduceScatter_Sum_f32_RING_LL

/fsx/nsight-efa/target-linux-x64/nsys recipe cuda_gpu_kern_pace --input ${NSIGHT_REPORT_NAME}.nsys-rep --name ncclDevKernel_AllGather_RING_LL

# GPU kernel time summary, memory-transfer size/time summaries, utilization map.
/fsx/nsight-efa/target-linux-x64/nsys recipe cuda_gpu_kern_sum --input ${NSIGHT_REPORT_NAME}.nsys-rep

/fsx/nsight-efa/target-linux-x64/nsys recipe cuda_gpu_mem_size_sum --input ${NSIGHT_REPORT_NAME}.nsys-rep

/fsx/nsight-efa/target-linux-x64/nsys recipe cuda_gpu_mem_time_sum --input ${NSIGHT_REPORT_NAME}.nsys-rep

/fsx/nsight-efa/target-linux-x64/nsys recipe cuda_gpu_time_util_map --input ${NSIGHT_REPORT_NAME}.nsys-rep

# NCCL-specific summaries.
/fsx/nsight-efa/target-linux-x64/nsys recipe nccl_sum --input ${NSIGHT_REPORT_NAME}.nsys-rep

/fsx/nsight-efa/target-linux-x64/nsys recipe nccl_gpu_time_util_map --input ${NSIGHT_REPORT_NAME}.nsys-rep
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
apiVersion: "kubeflow.org/v1"
2+
kind: PyTorchJob
3+
metadata:
4+
name: fsdp
5+
spec:
6+
elasticPolicy:
7+
rdzvBackend: etcd
8+
rdzvHost: etcd
9+
rdzvPort: 2379
10+
minReplicas: 1
11+
maxReplicas: 96
12+
maxRestarts: 100
13+
#metrics:
14+
# - type: Resource
15+
# resource:
16+
# name: cpu
17+
# target:
18+
# type: Utilization
19+
# averageUtilization: 80
20+
pytorchReplicaSpecs:
21+
Worker:
22+
replicas: 2
23+
restartPolicy: OnFailure
24+
template:
25+
metadata:
26+
labels:
27+
app: fsdp
28+
nvidia-devtools-sidecar-injector: enabled
29+
spec:
30+
volumes:
31+
- name: shmem
32+
#emptyDir:
33+
# medium: Memory
34+
hostPath:
35+
path: /dev/shm
36+
#nodeSelector:
37+
# node.kubernetes.io/instance-type: "p5.48xlarge"
38+
containers:
39+
- name: pytorch
40+
image: 159553542841.dkr.ecr.us-west-2.amazonaws.com/fsdp:llama2-efa-main-02-13
41+
imagePullPolicy: Always
42+
resources:
43+
requests:
44+
nvidia.com/gpu:
45+
vpc.amazonaws.com/efa: 4
46+
limits:
47+
nvidia.com/gpu:
48+
vpc.amazonaws.com/efa: 4
49+
env:
50+
# for P5 FI_* should be commented out
51+
#- name: LOGLEVEL
52+
# value: "DEBUG"
53+
- name: FI_PROVIDER
54+
value: efa
55+
- name: FI_EFA_USE_DEVICE_RDMA
56+
value: "1"
57+
- name: FI_EFA_FORK_SAFE
58+
value: "1"
59+
- name: FI_LOG_LEVEL
60+
value: "1"
61+
- name: FI_EFA_ENABLE_SHM_TRANSFER
62+
value: "1"
63+
#- name: NCCL_DEBUG
64+
# value: "INFO"
65+
- name: NCCL_ASYNC_ERROR_HANDLING
66+
value: "1"
67+
#- name: NCCL_IGNORE_DISABLED_P2P
68+
# value: "1"
69+
- name: HF_TOKEN
70+
              value: <HF_token>  # NOTE(review): a real Hugging Face token was committed on this line — revoke it immediately and inject the token via a Kubernetes Secret (secretKeyRef), never a literal value
71+
command:
72+
- bash
73+
- -c
74+
- "torchrun --nproc_per_node=8 --nnodes=2 examples/finetuning.py --num_epochs=1 --batch_size_training=3 --enable_fsdp --pure_bf16 --model_name meta-llama/Llama-2-7b-hf --output_dir ."
75+
volumeMounts:
76+
- name: shmem
77+
mountPath: /dev/shm
78+
# NOTE(review): terminal-prompt residue ("root@…# cat Dockerfile.llama2-efa") — everything below was accidentally pasted into this manifest and belongs in its own Dockerfile.llama2-efa file
79+
FROM nvidia/cuda:12.2.2-devel-ubuntu22.04

ARG EFA_INSTALLER_VERSION=1.29.1
ARG AWS_OFI_NCCL_VERSION=v1.7.3-aws
# NOTE: NCCL_TESTS_VERSION is currently unused in this Dockerfile.
ARG NCCL_TESTS_VERSION=master
ARG NCCL_VERSION=2.18.5

# Remove the distribution-provided verbs/NCCL stack; it is replaced below by the
# EFA installer and a source-built NCCL. update+remove share one layer so the
# package index is never stale relative to the operation (hadolint DL3009).
RUN apt-get update -y \
    && apt-get remove -y --allow-change-held-packages \
       libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 libnccl2 libnccl-dev

# Drop the HPC-X MPI shipped in the CUDA image; Open MPI comes from the EFA
# installer instead.
RUN rm -rf /opt/hpcx \
    && rm -rf /usr/local/mpi \
    && rm -f /etc/ld.so.conf.d/hpcx.conf \
    && ldconfig
ENV OPAL_PREFIX=

# Build/debug toolchain (sorted for diffability). DEBIAN_FRONTEND is set inline
# rather than via ENV so it does not leak into the runtime environment.
RUN apt-get update -y \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated --no-install-recommends \
    apt-utils \
    autoconf \
    automake \
    build-essential \
    check \
    cmake \
    curl \
    debhelper \
    devscripts \
    gcc \
    gdb \
    git \
    kmod \
    libsubunit-dev \
    libtool \
    openssh-client \
    openssh-server \
    pkg-config \
    python3-distutils \
    vim \
    && rm -rf /var/lib/apt/lists/*

# sshd is used for multi-node launches; host-key checking is disabled because
# worker pods are ephemeral and their keys change on every restart.
RUN mkdir -p /var/run/sshd
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
    echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config

# key=value form: the legacy space-separated ENV syntax is deprecated.
ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/lib:$LD_LIBRARY_PATH
ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH

# -fsSL makes curl fail on HTTP errors instead of feeding an error page to
# python3; the bootstrap script is removed in the same layer.
RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \
    && python3 /tmp/get-pip.py \
    && rm -f /tmp/get-pip.py \
    && pip3 install --no-cache-dir awscli pynvml

#################################################
## Install NVIDIA GDRCopy
#RUN git clone https://github.com/NVIDIA/gdrcopy.git /opt/gdrcopy \
#    && cd /opt/gdrcopy \
#    && make lib_install install \
#    && cd /opt/gdrcopy/tests \
#    && make \
#    && mv copylat copybw sanity apiperf /usr/bin/

#################################################
## Install EFA installer (userspace libfabric/Open MPI; kernel module skipped —
## it is provided by the host). Tarball is removed in the same layer.
RUN cd $HOME \
    && curl -fsSL -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && cd aws-efa-installer \
    && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
    && rm -rf $HOME/aws-efa-installer $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz

###################################################
## Install NCCL from source for A100 (sm_80/86) and H100 (sm_90)
RUN git clone https://github.com/NVIDIA/nccl -b v${NCCL_VERSION}-1 /opt/nccl \
    && cd /opt/nccl \
    && make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \
    NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90"

###################################################
## Install AWS-OFI-NCCL plugin (maps NCCL transport onto libfabric/EFA)
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends \
       libtool autoconf cmake nasm unzip pigz parallel nfs-common build-essential hwloc libhwloc-dev libjemalloc2 libnuma-dev numactl libjemalloc-dev preload htop iftop liblapack-dev libgfortran5 ipcalc wget curl devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms \
    && rm -rf /var/lib/apt/lists/*
RUN export OPAL_PREFIX="" \
    && git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
    && cd /opt/aws-ofi-nccl \
    && git checkout ${AWS_OFI_NCCL_VERSION} \
    && ./autogen.sh \
    && ./configure --prefix=/opt/aws-ofi-nccl/install \
       --with-libfabric=/opt/amazon/efa/ \
       --with-cuda=/usr/local/cuda \
       --with-nccl=/opt/nccl/build \
       --with-mpi=/opt/amazon/openmpi/ \
    && make -j $(nproc) && make install

###################################################
## Install llama-recipes (FSDP fine-tuning examples)

# WORKDIR creates the directory if missing — no mkdir needed.
WORKDIR /workspace

#RUN git clone -b flop_counter https://github.com/facebookresearch/llama-recipes.git
#RUN git clone -b flop_counter_gc https://github.com/facebookresearch/llama-recipes.git
# NOTE(review): unpinned clone — consider pinning a commit SHA for reproducible builds.
RUN git clone https://github.com/facebookresearch/llama-recipes.git

WORKDIR /workspace/llama-recipes

# Sequential pip invocations preserved in one layer: fsspec/huggingface_hub pins
# must be installed BEFORE requirements.txt, as in the original step ordering.
RUN pip3 install --no-cache-dir -U pip setuptools \
    && pip3 install --no-cache-dir fsspec==2023.1.0 \
    && pip3 install --no-cache-dir huggingface_hub==0.17.0 \
    && pip3 install --no-cache-dir -r requirements.txt \
    && pip3 install --no-cache-dir -e . \
    && pip3 install --no-cache-dir tabulate protobuf python-etcd

#RUN pip3 uninstall -y torch
#RUN pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121

ENV PYTHONPATH="${PYTHONPATH}:/workspace/llama-recipes/src"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# If we don't specify the Nsight image, the 2024.2 version is used by default.
2+
# Will use 2024.4 version which is planned to be released by 5/24/2024
3+
devtoolBinariesImage:
4+
image: ${REGISTRY}.dkr.ecr.${REGION}.amazonaws.com/nsight-systems-cli:2024.4.1-ubuntu22.04
5+
imagePullPolicy: Always
6+
7+
# Assuming EKS cluster has a FSx for Lustre filesystem mounted on it. Nsight reports will be saved in /fsx_shared
8+
profile:
9+
volumes:
10+
[
11+
{
12+
"name": "nsys-output-volume",
13+
"persistentVolumeClaim": { "claimName": "fsx-pvc" }
14+
}
15+
]
16+
volumeMounts:
17+
[
18+
{
19+
"name": "nsys-output-volume",
20+
"mountPath": "/fsx_shared"
21+
}
22+
]
23+
24+
# CLI options: https://docs.nvidia.com/nsight-systems/UserGuide/index.html#cli-command-switches
25+
# delay and duration values in secs
26+
27+
# Use %{} to include environment variables in the Nsight report filename
28+
29+
# The arguments for the Nsight Systems. The placeholders will be replaced with the actual values.
30+
devtoolArgs: "profile --force-overwrite true --trace nvtx,cuda --delay 150 --duration 60 \
31+
-o /fsx_shared/fsdp/auto_{PROCESS_NAME}_%{POD_FULLNAME}_%{CONTAINER_NAME}_{TIMESTAMP}_{UID}.nsys-rep"
32+
33+
injectionMatch: "^/usr/bin/python3 /usr/local/bin/torchrun.*$"
34+
#injectionMatch: "^.*torchrun.*$"
35+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
apiVersion: "kubeflow.org/v1"
2+
kind: PyTorchJob
3+
metadata:
4+
name: fsdp
5+
spec:
6+
elasticPolicy:
7+
rdzvBackend: etcd
8+
rdzvHost: etcd
9+
rdzvPort: 2379
10+
minReplicas: 1
11+
maxReplicas: 96
12+
maxRestarts: 100
13+
#metrics:
14+
# - type: Resource
15+
# resource:
16+
# name: cpu
17+
# target:
18+
# type: Utilization
19+
# averageUtilization: 80
20+
pytorchReplicaSpecs:
21+
Worker:
22+
replicas: 2
23+
restartPolicy: OnFailure
24+
template:
25+
metadata:
26+
labels:
27+
app: fsdp
28+
nvidia-devtools-sidecar-injector: enabled
29+
spec:
30+
volumes:
31+
- name: shmem
32+
#emptyDir:
33+
# medium: Memory
34+
hostPath:
35+
path: /dev/shm
36+
#nodeSelector:
37+
# node.kubernetes.io/instance-type: "p5.48xlarge"
38+
containers:
39+
- name: pytorch
40+
image: 159553542841.dkr.ecr.us-west-2.amazonaws.com/fsdp:llama2-efa-main-02-13
41+
imagePullPolicy: Always
42+
resources:
43+
requests:
44+
nvidia.com/gpu:
45+
vpc.amazonaws.com/efa: 4
46+
limits:
47+
nvidia.com/gpu:
48+
vpc.amazonaws.com/efa: 4
49+
env:
50+
# for P5 FI_* should be commented out
51+
#- name: LOGLEVEL
52+
# value: "DEBUG"
53+
- name: FI_PROVIDER
54+
value: efa
55+
- name: FI_EFA_USE_DEVICE_RDMA
56+
value: "1"
57+
- name: FI_EFA_FORK_SAFE
58+
value: "1"
59+
- name: FI_LOG_LEVEL
60+
value: "1"
61+
- name: FI_EFA_ENABLE_SHM_TRANSFER
62+
value: "1"
63+
#- name: NCCL_DEBUG
64+
# value: "INFO"
65+
- name: NCCL_ASYNC_ERROR_HANDLING
66+
value: "1"
67+
#- name: NCCL_IGNORE_DISABLED_P2P
68+
# value: "1"
69+
- name: HF_TOKEN
70+
value: <HF_token>
71+
command:
72+
- bash
73+
- -c
74+
- "torchrun --nproc_per_node=8 --nnodes=2 examples/finetuning.py --num_epochs=1 --batch_size_training=3 --enable_fsdp --pure_bf16 --model_name meta-llama/Llama-2-7b-hf --output_dir ."
75+
volumeMounts:
76+
- name: shmem
77+
mountPath: /dev/shm
Loading
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#!/bin/bash -x

# Install the NVIDIA devtools sidecar injector chart, applying our
# cluster-specific overrides from custom_values.yaml.
helm install -f custom_values.yaml \
    devtools-sidecar-injector https://helm.ngc.nvidia.com/nvidia/devtools/charts/devtools-sidecar-injector-1.0.0.tgz
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
#!/bin/bash -x

# Label a namespace so the devtools sidecar injector targets its pods.
# Namespace comes from $NAMESPACE, defaulting to "example-ns".
#
# BUG FIX: the original used ${example-ns}, which is bash default-value
# expansion of the (unset) variable "example" and therefore expands to the
# literal string "ns" — not the intended namespace name.
kubectl label namespaces "${NAMESPACE:-example-ns}" nvidia-devtools-sidecar-injector=enabled --overwrite=true
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#!/bin/bash -x

# Copy a Nsight Systems report out of the cluster, then archive it to S3.
# Usage reminder: kubectl cp -n <namespace> <pod-name>:<path> <destination-on-local-system>

FILE=auto_python3_default_fsdp-worker-1_pytorch_1715996702335_5a061871.nsys-rep
DEST=/eks/deployment/distributed-training/pytorch/pytorchjob/fsdp/${FILE}

kubectl cp "fsx-share-test:fsx_shared/fsdp/${FILE}" "${DEST}"

# BUG FIX: the original uploaded the bare "$FILE" relative to the current
# working directory, while kubectl cp had written to the absolute /eks/... path;
# the upload only worked when run from that directory. Use the same DEST path.
aws s3 cp "${DEST}" "s3://${S3_BUCKET}"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/bin/bash -x

# Tear down the devtools sidecar injector: the Helm release, its namespace,
# both mutating webhook configurations, and the injector configmaps left in
# example-ns and in the current namespace.

helm uninstall devtools-sidecar-injector

kubectl delete namespace nvidia-devtools-sidecar-injector

kubectl delete mutatingwebhookconfigurations sidecar-injector-webhook
kubectl delete mutatingwebhookconfiguration nvidia-devtools-sidecar-injector-webhook

kubectl delete cm -n example-ns nvidia-devtools-sidecar-injector
kubectl delete cm -n example-ns nvidia-devtools-sidecar-injector-custom

kubectl delete cm nvidia-devtools-sidecar-injector
kubectl delete cm nvidia-devtools-sidecar-injector-custom

# NOTE(review): the commented command below appears truncated at "read name namespace >".
#kubectl get all --all-namespaces -l nvidia-devtools-sidecar-injector=enabled -o custom-columns=:.metadata.name,NS:.metadata.namespace,KIND:.kind --no-headers | while read name namespace >

0 commit comments

Comments
 (0)