https://github.com/tensorflow/models/blob/master/research/inception/inception/data/download_and_preprocess_imagenet.sh
which is linked from
https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Classification/RN50v1.5
which is linked from
https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
But the other supporting files are also needed; they can be found in
https://github.com/tensorflow/models/tree/master/research/inception/inception/data
// I work in a directory: $ pwd /home/soh/Downloads
// I put those files in: /home/soh/Downloads/download_and_preprocess_imagenet.sh /home/soh/Downloads/download_and_preprocess_imagenet.sh.runfiles/inception/inception/build_imagenet_data.py /home/soh/Downloads/download_and_preprocess_imagenet.sh.runfiles/inception/inception/data/download_imagenet.sh /home/soh/Downloads/download_and_preprocess_imagenet.sh.runfiles/inception/inception/data/imagenet_lsvrc_2015_synsets.txt /home/soh/Downloads/download_and_preprocess_imagenet.sh.runfiles/inception/inception/data/preprocess_imagenet_validation_data.py /home/soh/Downloads/download_and_preprocess_imagenet.sh.runfiles/inception/inception/data/imagenet_2012_validation_synset_labels.txt /home/soh/Downloads/download_and_preprocess_imagenet.sh.runfiles/inception/inception/data/imagenet_metadata.txt /home/soh/Downloads/download_and_preprocess_imagenet.sh.runfiles/inception/inception/data/process_bounding_boxes.py
// in .py, change python to python3 since there is no python currently // for all .sh and .py, chmod +x // change /home/soh//works/tf/imagenet/download_and_preprocess_imagenet.sh as follows: ------------------------- file start #!/bin/bash # Copyright 2016 Google Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ==============================================================================
# Script to download and preprocess ImageNet Challenge 2012 # training and validation data set. # # The final output of this script are sharded TFRecord files containing # serialized Example protocol buffers. See build_imagenet_data.py for # details of how the Example protocol buffers contain the ImageNet data. # # The final output of this script appears as such: # # data_dir/train-00000-of-01024 # data_dir/train-00001-of-01024 # ... # data_dir/train-01023-of-01024 # # and # # data_dir/validation-00000-of-00128 # data_dir/validation-00001-of-00128 # ... # data_dir/validation-00127-of-00128 # # Note that this script may take several hours to run to completion. The # conversion of the ImageNet data to TFRecords alone takes 2-3 hours depending # on the speed of your machine. Please be patient. # # **IMPORTANT** # To download the raw images, the user must create an account with image-net.org # and generate a username and access_key. The latter two are required for # downloading the raw images. # # usage: # ./download_and_preprocess_imagenet.sh [data-dir] set -e
# Require the output data directory as the sole argument.
# FIX: the original ran a bare `exit` here, which exits with the status of the
# preceding `echo` (0) — a missing argument would look like success to callers
# (e.g. `set -e` scripts). Report usage on stderr and exit non-zero.
if [ -z "$1" ]; then
  echo "Usage: download_and_preprocess_imagenet.sh [data dir]" >&2
  exit 1
fi

# Create the output and temporary directories.
DATA_DIR="${1%/}"
SCRATCH_DIR="${DATA_DIR}/raw-data/"
mkdir -p "${DATA_DIR}"
mkdir -p "${SCRATCH_DIR}"
# Bazel-style runfiles tree next to this script holds the helper scripts/data.
WORK_DIR="$0.runfiles/inception/inception"

# Download the ImageNet data.
LABELS_FILE="${WORK_DIR}/data/imagenet_lsvrc_2015_synsets.txt"
DOWNLOAD_SCRIPT="${WORK_DIR}/data/download_imagenet.sh"
"${DOWNLOAD_SCRIPT}" "${SCRATCH_DIR}" "${LABELS_FILE}"

# Note the locations of the train and validation data.
# SCRATCH_DIR already ends with '/', so no extra separator is added here.
TRAIN_DIRECTORY="${SCRATCH_DIR}train/"
VALIDATION_DIRECTORY="${SCRATCH_DIR}validation/"

# Preprocess the validation data by moving the images into the appropriate
# sub-directory based on the label (synset) of the image.
echo "Organizing the validation data into sub-directories."
PREPROCESS_VAL_SCRIPT="${WORK_DIR}/data/preprocess_imagenet_validation_data.py"
VAL_LABELS_FILE="${WORK_DIR}/data/imagenet_2012_validation_synset_labels.txt"

"${PREPROCESS_VAL_SCRIPT}" "${VALIDATION_DIRECTORY}" "${VAL_LABELS_FILE}"

# Convert the XML files for bounding box annotations into a single CSV.
echo "Extracting bounding box information from XML."
BOUNDING_BOX_SCRIPT="${WORK_DIR}/data/process_bounding_boxes.py"
BOUNDING_BOX_FILE="${SCRATCH_DIR}/imagenet_2012_bounding_boxes.csv"
BOUNDING_BOX_DIR="${SCRATCH_DIR}bounding_boxes/"

"${BOUNDING_BOX_SCRIPT}" "${BOUNDING_BOX_DIR}" "${LABELS_FILE}" \
  | sort > "${BOUNDING_BOX_FILE}"
echo "Finished downloading and preprocessing the ImageNet data."

# Build the TFRecords version of the ImageNet data.
#BUILD_SCRIPT="${WORK_DIR}/build_imagenet_data"
BUILD_SCRIPT="${WORK_DIR}/build_imagenet_data.py"
OUTPUT_DIRECTORY="${DATA_DIR}"
IMAGENET_METADATA_FILE="${WORK_DIR}/data/imagenet_metadata.txt"

"${BUILD_SCRIPT}" \
  --train_directory="${TRAIN_DIRECTORY}" \
  --validation_directory="${VALIDATION_DIRECTORY}" \
  --output_directory="${OUTPUT_DIRECTORY}" \
  --imagenet_metadata_file="${IMAGENET_METADATA_FILE}" \
  --labels_file="${LABELS_FILE}" \
  --bounding_box_file="${BOUNDING_BOX_FILE}" ------------------------- file end
// change /home/soh//works/tf/imagenet/download_and_preprocess_imagenet.sh.runfiles/inception/inception/data/download_imagenet.sh as follows: ------------------------- file start #!/bin/bash # Copyright 2016 Google Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ==============================================================================
# Script to download ImageNet Challenge 2012 training and validation data set. # # Downloads and decompresses raw images and bounding boxes. # # **IMPORTANT** # To download the raw images, the user must create an account with image-net.org # and generate a username and access_key. The latter two are required for # downloading the raw images. # # usage: # ./download_imagenet.sh [dir name] [synsets file] set -e
#if [ "x$IMAGENET_ACCESS_KEY" == x -o "x$IMAGENET_USERNAME" == x ]; then # cat <#In order to download the imagenet data, you have to create an account with #image-net.org. This will get you a username and an access key. You can set the #IMAGENET_USERNAME and IMAGENET_ACCESS_KEY environment variables, or you can #enter the credentials here. #END # read -p "Username: " IMAGENET_USERNAME # read -p "Access key: " IMAGENET_ACCESS_KEY #fi
OUTDIR="${1:-./imagenet-data}"
SYNSETS_FILE="${2:-./synsets.txt}"
# Directory holding the pre-downloaded tarballs (the /ws mount in the docker
# container; switch back to the Downloads path when running on the host).
#FILES_DIR="/home/soh/Downloads"
FILES_DIR="/ws"

echo "Saving downloaded files to $OUTDIR"
mkdir -p "${OUTDIR}"
INITIAL_DIR=$(pwd)
# NOTE(review): sub-paths are built as "${OUTDIR}name" with no separator, so
# the caller is expected to pass OUTDIR with a trailing '/' (the driver's
# SCRATCH_DIR does). The default "./imagenet-data" would NOT — confirm if the
# script is ever run standalone.
BBOX_DIR="${OUTDIR}bounding_boxes"
mkdir -p "${BBOX_DIR}"
cd "${OUTDIR}"

# Download and process all of the ImageNet bounding boxes.
BASE_URL="http://www.image-net.org/challenges/LSVRC/2012/nonpub"

# See here for details: http://www.image-net.org/download-bboxes
BOUNDING_BOX_ANNOTATIONS="${BASE_URL}/ILSVRC2012_bbox_train_v2.tar.gz"
#BBOX_TAR_BALL="${BBOX_DIR}/annotations.tar.gz"
# Use the locally pre-downloaded tarball instead of fetching from image-net.org
# (wget against the nonpub/nnoupb URLs no longer works).
BBOX_TAR_BALL="${FILES_DIR}/ILSVRC2012_bbox_train_v2.tar.gz"
#echo "Saving bounding box annotations to $BBOX_TAR_BALL"
#echo "Downloading bounding box annotations."
#wget "${BOUNDING_BOX_ANNOTATIONS}" -O "${BBOX_TAR_BALL}" || BASE_URL_CHANGE=1
#if [ $BASE_URL_CHANGE ]; then
#  BASE_URL="http://www.image-net.org/challenges/LSVRC/2012/nnoupb"
#  BOUNDING_BOX_ANNOTATIONS="${BASE_URL}/ILSVRC2012_bbox_train_v2.tar.gz"
#  wget "${BOUNDING_BOX_ANNOTATIONS}" -O "${BBOX_TAR_BALL}"
#fi
echo "Uncompressing bounding box annotations ..."
tar xzf "${BBOX_TAR_BALL}" -C "${BBOX_DIR}"

LABELS_ANNOTATED="${BBOX_DIR}/*"
NUM_XML=$(ls -1 ${LABELS_ANNOTATED} | wc -l)
echo "Identified ${NUM_XML} bounding box annotations."

# Uncompress all images from the ImageNet 2012 validation dataset
# (pre-downloaded; the wget steps are kept commented out for reference).
VALIDATION_TARBALL="ILSVRC2012_img_val.tar"
OUTPUT_PATH="${OUTDIR}validation/"
mkdir -p "${OUTPUT_PATH}"
cd "${OUTDIR}/.."
#echo "Downloading ${VALIDATION_TARBALL} to ${OUTPUT_PATH}."
#wget -nd -c "${BASE_URL}/${VALIDATION_TARBALL}"
echo "Extracting ${VALIDATION_TARBALL} to ${OUTPUT_PATH}."
#tar xf "${VALIDATION_TARBALL}" -C "${OUTPUT_PATH}"
tar xf "${FILES_DIR}/ILSVRC2012_img_val.tar" -C "${OUTPUT_PATH}"

# Unpack the ImageNet 2012 train dataset (pre-downloaded).
TRAIN_TARBALL="ILSVRC2012_img_train.tar"
OUTPUT_PATH="${OUTDIR}train/"
mkdir -p "${OUTPUT_PATH}"
cd "${OUTDIR}/.."
#echo "Downloading ${TRAIN_TARBALL} to ${OUTPUT_PATH}."
#wget -nd -c "${BASE_URL}/${TRAIN_TARBALL}"

# Un-compress the individual tar-files within the train tar-file.
echo "Uncompressing individual train tar-balls in the training data."

while read SYNSET; do
  echo "Processing: ${SYNSET}"

  # Create a directory and delete anything there.
  # FIX: the original ran `rm -rf "${OUTPUT_PATH}/${SYNSET}/*"` — with the `*`
  # inside the quotes it is a literal filename, so existing contents were
  # never removed. The glob must be outside the quotes; `:?` aborts rather
  # than expanding to `rm -rf //*` if either variable is empty.
  mkdir -p "${OUTPUT_PATH}/${SYNSET}"
  rm -rf "${OUTPUT_PATH:?}/${SYNSET:?}/"*

  # Uncompress the per-synset tarball into its directory, then discard it.
#  tar xf "${TRAIN_TARBALL}" "${SYNSET}.tar"
  tar xf "${FILES_DIR}/ILSVRC2012_img_train.tar" "${SYNSET}.tar"
  tar xf "${SYNSET}.tar" -C "${OUTPUT_PATH}/${SYNSET}/"
  rm -f "${SYNSET}.tar"

  echo "Finished processing: ${SYNSET}"
done < "${SYNSETS_FILE}"
#done < "${INITIAL_DIR}/${SYNSETS_FILE}" ------------------------- file end
// wget is not working, so download the tar / tar.gz files via torrent instead // search for the torrents on http://academictorrents.com using the filenames, then download them // downloaded files are /home/soh/Downloads/ILSVRC2012_bbox_train_v2.tar.gz (about 20MB) /home/soh/Downloads/ILSVRC2012_img_train.tar (about 148GB) /home/soh/Downloads/ILSVRC2012_img_val.tar (about 6.7GB)
// now required files are ready: // use absolute paths of sh and target data path // numpy, tensorflow are used to preprocess => use docker container
$ pwd /home/soh/Downloads
$ docker run --gpus all -it -v $PWD:/ws -w /ws tensorflow/tensorflow:1.14.0-gpu-py3 bash
// in docker container /ws $ /ws/download_and_preprocess_imagenet.sh /ws/imagenet1k
|