From c5cf920f8a64890b8db3ef56c1fdc696110498d2 Mon Sep 17 00:00:00 2001
From: cheyang
Date: Fri, 30 Jun 2017 02:35:53 +0800
Subject: [PATCH] initial swarm support (#50)

---
 swarm/README.md           | 46 ++++++++++++++++++++++++++++++++++
 swarm/template.yaml.jinja | 52 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+)
 create mode 100644 swarm/README.md
 create mode 100644 swarm/template.yaml.jinja

diff --git a/swarm/README.md b/swarm/README.md
new file mode 100644
index 00000000..5edbfae4
--- /dev/null
+++ b/swarm/README.md
@@ -0,0 +1,46 @@
+# Running Distributed TensorFlow on Docker Compose v2 And Swarm
+
+## Prerequisite
+
+1. You must be running Docker 1.11 or above. See the
+   [Docker Documentation](https://docs.docker.com/v1.11/) if you
+   want to quickly set up a swarm cluster and compose from scratch.
+
+2. You should set up some shared storage such as HDFS in the cluster. If you'd like to deploy HDFS with Docker, see [Run Hadoop Cluster in Docker](http://kiwenlau.blogspot.com/2015/05/quickly-build-arbitrary-size-hadoop.html).
+
+3. [Jinja templates](http://jinja.pocoo.org/) must be installed.
+
+Before you start, you need to set up a Docker Swarm cluster and Compose. It is also preferable to set up some shared storage such as HDFS. You need to know the HDFS namenode address, which is required to bring up the TensorFlow cluster.
+
+## Steps to Run the job
+
+1. Follow the instructions for creating the training program in the parent
+   [README](../README.md).
+
+2. Follow the instructions for building and pushing the Docker image in the
+   [Docker README](../docker/README.md).
+
+3. Copy the template file:
+
+   ```sh
+   cd ecosystem
+   cp swarm/template.yaml.jinja docker-compose.template.jinja
+   ```
+
+4. Edit the `docker-compose.template.jinja` file to set the job parameters. You need to specify `name`, `image` and `train_dir`, and can optionally change the number of worker and ps replicas. The `train_dir` must point to a directory on shared storage if you would like to use TensorBoard or sharded checkpoints.
+
+5. Generate the compose file:
+
+   ```sh
+   mkdir -p /distribute-tensorflow
+   python render_template.py docker-compose.template.jinja | tee /distribute-tensorflow/docker-compose.yml
+   ```
+
+6. Run the TensorFlow cluster:
+
+
+   ```sh
+   cd /distribute-tensorflow
+   docker-compose up -d
+   ```
+
diff --git a/swarm/template.yaml.jinja b/swarm/template.yaml.jinja
new file mode 100644
index 00000000..ef70ed84
--- /dev/null
+++ b/swarm/template.yaml.jinja
@@ -0,0 +1,52 @@
+{%- set name = "mnist" -%}
+{%- set image = "" -%}
+{%- set worker_replicas = 2 -%}
+{%- set ps_replicas = 1 -%}
+{%- set script = "mnist.py" -%}
+{%- set data_dir = "hdfs://namenode/data_dir" -%}
+{%- set train_dir = "hdfs://namenode/train_dir" -%}
+{%- set tensorboard = true %}
+{#- Job parameters end here; `image` is empty by default and must be set before rendering. The settings that follow are derived/internal. -#}
+{%- set port = 5000 -%}
+{%- set replicas = {"worker": worker_replicas, "ps": ps_replicas} -%}
+{#- worker_hosts(): comma-separated host:port list of every worker task, e.g. mnist-worker-0:5000,mnist-worker-1:5000 with the defaults. -#}
+{%- macro worker_hosts() -%}
+  {%- for i in range(worker_replicas) -%}
+    {%- if not loop.first -%},{%- endif -%}
+    {{ name }}-worker-{{ i }}:{{ port }}
+  {%- endfor -%}
+{%- endmacro -%}
+{#- ps_hosts(): comma-separated host:port list of every ps task, built the same way as worker_hosts(). -#}
+{%- macro ps_hosts() -%}
+  {%- for i in range(ps_replicas) -%}
+    {%- if not loop.first -%},{%- endif -%}
+    {{ name }}-ps-{{ i }}:{{ port }}
+  {%- endfor -%}
+{%- endmacro -%}
+{#- Rendered Compose output starts at the next line; everything so far emits no text. -#}
+version: '2'
+services:
+{#- One service per task; container_name uses the same name-job-index pattern as the host lists passed on the command line. -#}
+{%- for job in ["worker", "ps"] -%}
+{%- for i in range(replicas[job]) %}
+  {{ job }}-{{ i }}:
+    image: {{ image }}
+    container_name: {{ name }}-{{ job }}-{{ i }}
+    command:
+      - python
+      - {{ script }}
+      - "--data_dir={{ data_dir }}"
+      - "--train_dir={{ train_dir }}"
+      - "--task_index={{ i }}"
+      - "--job_name={{ job }}"
+      - "--worker_hosts={{ worker_hosts() }}"
+      - "--ps_hosts={{ ps_hosts() }}"
+{% endfor %}
+{%- endfor -%}
+{%- if tensorboard %}
+  tensorboard:
+    image: {{ image }}
+    command:
+      - tensorboard
+      - --logdir={{ train_dir }}
+{% endif %}