This repository was archived by the owner on May 28, 2024. It is now read-only.

Comparing changes

base repository: ray-project/ray-llm
base: v0.3.1
head repository: ray-project/ray-llm
compare: master

Commits on Oct 4, 2023

  1. Set all route_prefixes to /

    Signed-off-by: Shreyas Krishnaswamy <shrekris@anyscale.com>
    shrekris-anyscale committed Oct 4, 2023
    ce099fc
  2. Use name 'ray-llm' for all Serve apps

    Signed-off-by: Shreyas Krishnaswamy <shrekris@anyscale.com>
    shrekris-anyscale committed Oct 4, 2023
    74745d8
  3. Merge pull request #69 from ray-project/remove_route_prefixes_configs

    Set `route_prefixes` in Serve configs to `/`
    richardliaw authored Oct 4, 2023
    e5c11cb

Commits on Oct 6, 2023

  1. Update call to get controller

    Signed-off-by: Shreyas Krishnaswamy <shrekris@anyscale.com>
    shrekris-anyscale committed Oct 6, 2023
    d70c9b8

Commits on Oct 10, 2023

  1. Merge pull request #71 from ray-project/fix_serve_client_call

    Update method of accessing Serve controller
    shrekris-anyscale authored Oct 10, 2023
    9f681c9

Commits on Oct 21, 2023

  1. Update README.md with link to kuberay instructions

    Signed-off-by: akshay-anyscale <122416226+akshay-anyscale@users.noreply.github.com>
    akshay-anyscale authored Oct 21, 2023
    121fcf7
  2. b3560aa

Commits on Oct 24, 2023

  1. 0.4.0 release

    Signed-off-by: Avnish Narayan <avnish@anyscale.com>
    avnishn committed Oct 24, 2023
    d35809b

Commits on Oct 26, 2023

  1. 0.4.0 release

    Signed-off-by: Avnish Narayan <avnish@anyscale.com>
    avnishn authored and avnish narayan committed Oct 26, 2023
    d3569e8
  2. Bump version number

    Signed-off-by: avnish narayan <avnish@avnish.local.meter>
    avnish narayan committed Oct 26, 2023
    4b56385
  3. 96d7fb1
  4. Additional commits for updating readme, kubernetes docs, add falcon 7b, and update vllm_compatibility
    
    Signed-off-by: Avnish Narayan <avnish@anyscale.com>
    avnishn committed Oct 26, 2023
    83a54a1

Commits on Oct 28, 2023

  1. Merge pull request #79 from avnishn/0.4.0

    0.4.0 release
    
    The following changes are introduced:

    - Renaming aviary to rayllm.
    - Support for reading models from GCS in addition to AWS S3.
    - Increased testing for prompting.
    - New model configs for Falcon 7B and 40B.
    - Make frontend compatible with Ray Serve 2.7.
    
    
    Co-authored-by: Avnish Narayan <avnish@anyscale.com>
    Co-authored-by: Chris Sivanich <csivanich@anyscale.com>
    Co-authored-by: Tanmay Chordia <tchordia@gmail.com>
    Co-authored-by: Sihan Wang <sihanwang41@gmail.com>
    Co-authored-by: Shreyas Krishnaswamy <shrekris@anyscale.com>
    Co-authored-by: Richard Liaw <rliaw@anyscale.com>
    7 people authored Oct 28, 2023
    c2a22af
  2. add awq quantized model

    Yiqing Wang committed Oct 28, 2023
    eecf941

Commits on Nov 1, 2023

  1. Doc/Config update for rayllm

    Signed-off-by: Sihan Wang <sihanwang41@gmail.com>
    sihanwang41 committed Nov 1, 2023
    ab69d9f
  2. Add more

    Signed-off-by: Sihan Wang <sihanwang41@gmail.com>
    sihanwang41 committed Nov 1, 2023
    81aad72
  3. Merge pull request #85 from sihanwang41/doc_cherrypick

    Doc/Config update for rayllm
    sihanwang41 authored Nov 1, 2023
    02ca73c

Commits on Nov 2, 2023

  1. update doc

    Yiqing Wang committed Nov 2, 2023
    7fc0d82
  2. merge from master

    Yiqing Wang committed Nov 2, 2023
    0d1e001
  3. address comments

    Yiqing Wang committed Nov 2, 2023
    efb89cf
  4. add serve config

    Yiqing Wang committed Nov 2, 2023
    be402f0
  5. Rename ray-llm docker image name in doc

    Signed-off-by: Sihan Wang <sihanwang41@gmail.com>
    sihanwang41 committed Nov 2, 2023
    fef5eea
  6. Merge pull request #87 from sihanwang41/rename_docker

    Rename ray-llm docker image name in doc
    sihanwang41 authored Nov 2, 2023
    8fd2dc9

Commits on Nov 8, 2023

  1. fix: no attribute 'set_url'

    Mike Arov committed Nov 8, 2023
    ea61d98
  2. revert readme

    Yiqing Wang committed Nov 8, 2023
    98bad43

Commits on Nov 9, 2023

  1. fix: frontend ignores MONGODB_URL

    Mike Arov committed Nov 9, 2023
    3eb6892

Commits on Nov 13, 2023

  1. Merge pull request #82 from YQ-Wang/awq-model

    Add AWQ Quantized Llama 2 70B Model Config & Update README
    shrekris-anyscale authored Nov 13, 2023
    ae910a2

Commits on Nov 15, 2023

  1. Merge pull request #92 from marov/master

    fix: no attribute 'set_url'
    sihanwang41 authored Nov 15, 2023
    335e688

Commits on Nov 16, 2023

  1. add awq and squeezellm configs

    uvikas committed Nov 16, 2023
    793ebce

Commits on Nov 20, 2023

  1. add llmperf benchmarks

    uvikas committed Nov 20, 2023
    0557452
  2. rename

    uvikas committed Nov 20, 2023
    436f478
  3. link quantization guide

    uvikas committed Nov 20, 2023
    2aa47f3
  4. Merge pull request #95 from ray-project/quantization

    Add AWQ and SqueezeLLM quantization configs
    uvikas authored Nov 20, 2023
    fa3a766

Commits on Dec 6, 2023

  1. Move telemetry line

    Signed-off-by: Shreyas Krishnaswamy <shrekris@anyscale.com>
    shrekris-anyscale committed Dec 6, 2023
    d2518d3
  2. Merge pull request #102 from ray-project/shrekris/track_usage_serve_configs
    
    Record telemetry when RayLLM is launched using a Serve config
    shrekris-anyscale authored Dec 6, 2023
    78e076e

Commits on Jan 8, 2024

  1. test

    Signed-off-by: Sihan Wang <sihanwang41@gmail.com>
    sihanwang41 committed Jan 8, 2024
    842537b
  2. Update

    Signed-off-by: Sihan Wang <sihanwang41@gmail.com>
    sihanwang41 committed Jan 8, 2024
    bb472cd
  3. Update

    Signed-off-by: Sihan Wang <sihanwang41@gmail.com>
    sihanwang41 committed Jan 8, 2024
    c3aaacc
  4. Update

    Signed-off-by: Sihan Wang <sihanwang41@gmail.com>
    sihanwang41 committed Jan 8, 2024
    90274cc
  5. Update

    Signed-off-by: Sihan Wang <sihanwang41@gmail.com>
    sihanwang41 committed Jan 8, 2024
    f2305fd
  6. Merge pull request #115 from sihanwang41/test

    Update doc build ci job
    sihanwang41 authored Jan 8, 2024
    ca95107

Commits on Jan 18, 2024

  1. Release 0.5.0 (#111)

    Signed-off-by: Sihan Wang <sihanwang41@gmail.com>
    Co-authored-by: shrekris-anyscale <92341594+shrekris-anyscale@users.noreply.github.com>
    sihanwang41 and shrekris-anyscale authored Jan 18, 2024
    cbf88c2
  2. Update vllm version (#119)

    Signed-off-by: Sihan Wang <sihanwang41@gmail.com>
    sihanwang41 authored Jan 18, 2024
    5255abe

Commits on Jan 25, 2024

  1. Add more details about prompt format in the docs (#126)

    Trying to make it easier for users to self-service add custom models to
    use with ray-llm.
    
    ---------
    
    Signed-off-by: Alan Guo <aguo@anyscale.com>
    alanwguo authored Jan 25, 2024
    f6926b7

Commits on Jan 26, 2024

  1. Add examples to the prompt format docs (#128)

    Signed-off-by: Alan Guo <aguo@anyscale.com>
    alanwguo authored Jan 26, 2024
    6b86748

Commits on Mar 26, 2024

  1. a36a602

Commits on Mar 27, 2024

  1. follow up on removing explorer from readme (#146)

    I noted a minor issue when verifying the changes from
    #145
    ArturNiederfahrenhorst authored Mar 27, 2024
    9251fb5

Commits on May 28, 2024

  1. Update README.md for rayllm

    Signed-off-by: akshay-anyscale <122416226+akshay-anyscale@users.noreply.github.com>
    akshay-anyscale authored May 28, 2024
    400df51
  2. Update README.md

    Signed-off-by: akshay-anyscale <122416226+akshay-anyscale@users.noreply.github.com>
    akshay-anyscale authored May 28, 2024
    6f83cca
  3. Update README.md for RayLLM archival (#152)

    Signed-off-by: akshay-anyscale <122416226+akshay-anyscale@users.noreply.github.com>
    akshay-anyscale authored May 28, 2024
    d8c7b81
Showing with 5,359 additions and 1,931 deletions.
  1. +1 −4 .github/workflows/docs.yaml
  2. +26 −0 .github/workflows/docs_build.yaml
  3. +5 −0 .gitignore
  4. +1 −1 .pre-commit-config.yaml
  5. +10 −3 Dockerfile
  6. +2 −2 MANIFEST.in
  7. +24 −43 README.md
  8. +0 −6 aviary/__init__.py
  9. +0 −4 aviary/backend/__init__.py
  10. +0 −14 aviary/backend/llm/dict_utils.py
  11. +0 −414 aviary/backend/llm/utils.py
  12. +0 −210 aviary/backend/llm/vllm/vllm_models.py
  13. +0 −3 aviary/backend/observability/tracing/__init__.py
  14. +0 −10 aviary/backend/server/openai_compat/openai_exception.py
  15. +0 −30 aviary/backend/server/openai_compat/openai_middleware.py
  16. +0 −75 aviary/backend/server/plugins/deployment_base_client.py
  17. +0 −12 aviary/backend/server/routers/middleware.py
  18. +0 −383 aviary/backend/server/routers/router_app.py
  19. +0 −136 aviary/backend/server/run.py
  20. +0 −3 aviary/env_conf.py
  21. 0 build_aviary_wheel.sh → build_rayllm_wheel.sh
  22. +0 −2 deploy.sh
  23. +2 −2 deploy/ray/backend.yaml
  24. +5 −5 deploy/ray/{aviary-cluster.yaml → rayllm-cluster.yaml}
  25. +51 −0 docs/DOCKERHUB.md
  26. +10 −10 docs/kuberay/deploy-on-eks.md
  27. +9 −9 docs/kuberay/deploy-on-gke.md
  28. +3 −3 docs/kuberay/{ray-cluster.aviary-eks.yaml → ray-cluster.rayllm-eks.yaml}
  29. +3 −3 docs/kuberay/{ray-cluster.aviary-gke.yaml → ray-cluster.rayllm-gke.yaml}
  30. +4 −4 docs/kuberay/{ray-service.aviary-eks.yaml → ray-service.rayllm-eks.yaml}
  31. +4 −4 docs/kuberay/{ray-service.aviary-gke.yaml → ray-service.rayllm-gke.yaml}
  32. +2 −2 mkdocs.yml
  33. +103 −12 models/README.md
  34. +43 −0 models/continuous_batching/OpenAssistant--falcon-40b-sft-top1-560.yaml
  35. +44 −0 models/continuous_batching/OpenAssistant--falcon-7b-sft-top1-696.yaml
  36. +1 −1 models/continuous_batching/amazon--LightGPT.yaml
  37. +4 −4 models/continuous_batching/codellama--CodeLlama-34b-Instruct-hf.yaml
  38. +4 −4 models/continuous_batching/meta-llama--Llama-2-13b-chat-hf.yaml
  39. +4 −4 models/continuous_batching/meta-llama--Llama-2-70b-chat-hf.yaml
  40. +4 −4 models/continuous_batching/meta-llama--Llama-2-7b-chat-hf.yaml
  41. +41 −0 models/continuous_batching/mistralai--Mixtral-8-7b-Instruct-v01.yaml
  42. +28 −0 models/continuous_batching/quantization/README.md
  43. +40 −0 models/continuous_batching/quantization/TheBloke--Llama-2-13B-chat-AWQ.yaml
  44. +40 −0 models/continuous_batching/quantization/TheBloke--Llama-2-70B-chat-AWQ.yaml
  45. +42 −0 models/continuous_batching/quantization/TheBloke--Llama-2-7B-chat-AWQ.yaml
  46. +40 −0 models/continuous_batching/quantization/squeeze-ai-lab--sq-llama-2-13b-w4-s0.yaml
  47. +42 −0 models/continuous_batching/quantization/squeeze-ai-lab--sq-llama-2-7b-w4-s0.yaml
  48. +39 −0 models/continuous_batching/trtllm-meta-llama--Llama-2-70b-chat-hf.yaml
  49. +39 −0 models/continuous_batching/trtllm-meta-llama--Llama-2-7b-chat-hf.yaml
  50. +1 −1 pyproject.toml
  51. +6 −0 rayllm/__init__.py
  52. +4 −0 rayllm/backend/__init__.py
  53. 0 {aviary → rayllm}/backend/llm/__init__.py
  54. +15 −0 rayllm/backend/llm/dict_utils.py
  55. 0 {aviary/backend/llm/vllm → rayllm/backend/llm/embedding}/__init__.py
  56. +203 −0 rayllm/backend/llm/embedding/embedding_engine.py
  57. +163 −0 rayllm/backend/llm/embedding/embedding_model_runner.py
  58. +45 −0 rayllm/backend/llm/embedding/embedding_models.py
  59. +3 −3 {aviary → rayllm}/backend/llm/engine/base.py
  60. 0 {aviary → rayllm}/backend/llm/engine/stats.py
  61. +2 −0 {aviary → rayllm}/backend/llm/error_handling.py
  62. +2 −1 {aviary → rayllm}/backend/llm/generation.py
  63. +95 −0 rayllm/backend/llm/llm_node_initializer.py
  64. +1 −1 {aviary → rayllm}/backend/llm/tokenizer.py
  65. 0 {aviary/backend/llm/vllm/metrics → rayllm/backend/llm/trtllm}/__init__.py
  66. +150 −0 rayllm/backend/llm/trtllm/trtllm_engine.py
  67. +157 −0 rayllm/backend/llm/trtllm/trtllm_models.py
  68. +37 −0 rayllm/backend/llm/trtllm/trtllm_mpi.py
  69. +831 −0 rayllm/backend/llm/utils.py
  70. 0 {aviary/backend/observability → rayllm/backend/llm/vllm}/__init__.py
  71. 0 {aviary/backend/server → rayllm/backend/llm/vllm/metrics}/__init__.py
  72. +9 −9 {aviary → rayllm}/backend/llm/vllm/metrics/vllm_compatibility.py
  73. +1 −1 {aviary → rayllm}/backend/llm/vllm/util.py
  74. +57 −33 {aviary → rayllm}/backend/llm/vllm/vllm_compatibility.py
  75. +154 −27 {aviary → rayllm}/backend/llm/vllm/vllm_engine.py
  76. +68 −9 {aviary → rayllm}/backend/llm/vllm/vllm_engine_stats.py
  77. +56 −0 rayllm/backend/llm/vllm/vllm_models.py
  78. +5 −3 {aviary → rayllm}/backend/llm/vllm/vllm_node_initializer.py
  79. 0 {aviary → rayllm}/backend/logger.py
  80. 0 {aviary/backend/server/openai_compat → rayllm/backend/observability}/__init__.py
  81. +1 −1 {aviary → rayllm}/backend/observability/base.py
  82. 0 {aviary → rayllm}/backend/observability/event_loop_monitoring.py
  83. +79 −30 {aviary → rayllm}/backend/observability/fn_call_metrics.py
  84. +1 −1 {aviary → rayllm}/backend/observability/inference_worker_metrics.py
  85. 0 {aviary → rayllm}/backend/observability/loggers.py
  86. 0 {aviary → rayllm}/backend/observability/metrics.py
  87. +1 −1 {aviary → rayllm}/backend/observability/request_context.py
  88. +2 −2 {aviary → rayllm}/backend/observability/telemetry.py
  89. +3 −0 rayllm/backend/observability/tracing/__init__.py
  90. +1 −1 {aviary → rayllm}/backend/observability/tracing/baggage.py
  91. 0 {aviary → rayllm}/backend/observability/tracing/baggage_span_processor.py
  92. 0 {aviary → rayllm}/backend/observability/tracing/context.py
  93. 0 {aviary → rayllm}/backend/observability/tracing/fastapi.py
  94. +7 −5 {aviary → rayllm}/backend/observability/tracing/setup.py
  95. 0 {aviary → rayllm}/backend/observability/tracing/threading.py
  96. +43 −0 rayllm/backend/observability/tracing/threading_propagator.py
  97. 0 {aviary/backend/server/plugins → rayllm/backend/server}/__init__.py
  98. +6 −2 {aviary → rayllm}/backend/server/app.py
  99. +2 −0 rayllm/backend/server/constants.py
  100. 0 {aviary/backend/server/routers → rayllm/backend/server/embedding}/__init__.py
  101. +112 −0 rayllm/backend/server/embedding/embedding_deployment.py
  102. +2 −2 {aviary → rayllm}/backend/server/metrics.py
  103. +313 −34 {aviary → rayllm}/backend/server/models.py
  104. 0 {aviary/backend/server/vllm → rayllm/backend/server/openai_compat}/__init__.py
  105. +28 −0 rayllm/backend/server/openai_compat/openai_exception.py
  106. +28 −0 rayllm/backend/server/openai_compat/openai_middleware.py
  107. 0 {aviary → rayllm}/backend/server/openai_compat/openai_model_util.py
  108. 0 {aviary/common → rayllm/backend/server/plugins}/__init__.py
  109. +89 −0 rayllm/backend/server/plugins/deployment_base_client.py
  110. +27 −7 {aviary → rayllm}/backend/server/plugins/execution_hooks.py
  111. +24 −7 {aviary → rayllm}/backend/server/plugins/multi_query_client.py
  112. +23 −52 {aviary → rayllm}/backend/server/plugins/router_query_engine.py
  113. +13 −19 {aviary → rayllm}/backend/server/plugins/serve_application_query_client.py
  114. 0 {aviary/frontend → rayllm/backend/server/routers}/__init__.py
  115. +14 −0 rayllm/backend/server/routers/middleware.py
  116. +571 −0 rayllm/backend/server/routers/router_app.py
  117. +222 −0 rayllm/backend/server/run.py
  118. 0 {aviary/testing → rayllm/backend/server/trtllm}/__init__.py
  119. +50 −0 rayllm/backend/server/trtllm/trtllm_deployment.py
  120. +89 −28 {aviary → rayllm}/backend/server/utils.py
  121. 0 rayllm/backend/server/vllm/__init__.py
  122. +5 −6 {aviary → rayllm}/backend/server/vllm/vllm_deployment.py
  123. +2 −2 {aviary → rayllm}/cli.py
  124. 0 rayllm/common/__init__.py
  125. +1 −1 {aviary → rayllm}/common/constants.py
  126. 0 {aviary → rayllm}/common/evaluation.py
  127. 0 {aviary → rayllm}/common/llm_event.py
  128. +189 −8 {aviary → rayllm}/common/models.py
  129. +18 −1 {aviary → rayllm}/common/utils.py
  130. +1 −1 {aviary → rayllm}/conf.py
  131. +6 −0 rayllm/env_conf.py
  132. 0 rayllm/frontend/__init__.py
  133. +21 −14 {aviary → rayllm}/frontend/app.py
  134. +3 −3 {aviary → rayllm}/frontend/async_sdk.py
  135. +12 −17 {aviary → rayllm}/frontend/endpoints_sdk.py
  136. 0 {aviary → rayllm}/frontend/javascript/aviary.js
  137. +2 −1 {aviary → rayllm}/frontend/javascript_loader.py
  138. +1 −1 {aviary → rayllm}/frontend/leaderboard.py
  139. +4 −4 {aviary → rayllm}/frontend/mongo_logger.py
  140. 0 {aviary → rayllm}/frontend/mongo_secrets.py
  141. 0 {aviary → rayllm}/frontend/types.py
  142. +3 −3 {aviary → rayllm}/frontend/utils.py
  143. +44 −49 {aviary → rayllm}/sdk.py
  144. 0 rayllm/testing/__init__.py
  145. +26 −11 {aviary → rayllm}/testing/mock_deployment.py
  146. +4 −4 {aviary → rayllm}/testing/mock_run.py
  147. +57 −10 {aviary → rayllm}/testing/mock_vllm_engine.py
  148. +3 −7 requirements-backend.txt
  149. +3 −3 requirements-dev.txt
  150. +7 −0 serve_configs/OpenAssistant--falcon-40b-sft-top1-560.yaml
  151. +7 −0 serve_configs/OpenAssistant--falcon-7b-sft-top1-696.yaml
  152. +7 −0 serve_configs/TheBloke--Llama-2-13B-chat-AWQ.yaml
  153. +7 −0 serve_configs/TheBloke--Llama-2-70B-chat-AWQ.yaml
  154. +7 −0 serve_configs/TheBloke--Llama-2-7B-chat-AWQ.yaml
  155. +3 −3 serve_configs/amazon--LightGPT.yaml
  156. +3 −3 serve_configs/codellama--CodeLlama-34b-Instruct-hf.yaml
  157. +3 −3 serve_configs/meta-llama--Llama-2-13b-chat-hf.yaml
  158. +3 −3 serve_configs/meta-llama--Llama-2-70b-chat-hf.yaml
  159. +3 −3 serve_configs/meta-llama--Llama-2-7b-chat-hf.yaml
  160. +7 −0 serve_configs/mistralai--Mixtral-8-7b-Instruct-v01.yaml
  161. +7 −0 serve_configs/squeeze-ai-lab--sq-llama-2-13b-w4-s0.yaml
  162. +7 −0 serve_configs/squeeze-ai-lab--sq-llama-2-7b-w4-s0.yaml
  163. +7 −0 serve_configs/thenlper--gte-large.yaml
  164. +7 −0 serve_configs/trtllm-meta-llama--Llama-2-70b-chat-hf.yaml
  165. +7 −0 serve_configs/trtllm-meta-llama--Llama-2-7b-chat-hf.yaml
  166. +5 −5 setup.py
  167. +23 −8 tests/conftest.py
  168. +1 −1 tests/integration/test_cli.py
  169. +1 −1 tests/integration/test_frontend.py
  170. +1 −1 tests/integration/test_openai_compatibility.py
  171. +1 −1 tests/integration/test_sdk.py
  172. +3 −10 tests/test_aviary/backend/conftest.py
  173. +44 −2 tests/test_aviary/backend/llm/test_utils.py
  174. +122 −0 tests/test_aviary/backend/observability/test_fn_call_metrics.py
  175. +1 −1 tests/test_aviary/backend/observability/test_request_context.py
  176. +4 −4 tests/test_aviary/backend/server/plugins/test_multi_query_client.py
  177. +10 −10 tests/test_aviary/backend/server/plugins/test_router_query_engine.py
  178. +0 −6 tests/test_aviary/backend/server/test_metrics.py
  179. +2 −4 tests/test_aviary/backend/server/test_models.py
  180. +3 −3 tests/test_aviary/backend/server/test_router.py
  181. +2 −2 tests/test_aviary/backend/server/test_run.py
  182. +2 −2 tests/test_aviary/backend/server/test_task_set.py
  183. +2 −2 tests/test_aviary/backend/server/test_utils.py
  184. +99 −1 tests/test_aviary/common/test_prompt_format.py
5 changes: 1 addition & 4 deletions .github/workflows/docs.yaml
@@ -1,12 +1,9 @@
-name: 🦜🔍 Documentation build and deploy
+name: 🦜🔍 Documentation deploy

on:
  push:
    branches:
      - master
-  pull_request:
-    branches:
-      - master

permissions:
  contents: write
26 changes: 26 additions & 0 deletions .github/workflows/docs_build.yaml
@@ -0,0 +1,26 @@
name: 🦜🔍 Documentation build
on:
  pull_request:
    branches:
      - master

permissions:
  contents: write

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: 3.x
      - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
      - uses: actions/cache@v3
        with:
          key: mkdocs-material-${{ env.cache_id }}
          path: .cache
          restore-keys: |
            mkdocs-material-
      - run: pip install mkdocs-material
      - run: mkdocs build
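
This workflow only builds the documentation on pull requests. As a rough local equivalent (a minimal sketch, assuming it is run from the repository root, where the `mkdocs.yml` shown in the file list above lives):

```shell
# Local approximation of the docs build CI job (sketch).
# Assumes the repository root, which contains mkdocs.yml, is the working directory.
pip install mkdocs-material
mkdocs build  # writes the static site to ./site/ (ignored via .gitignore)
```
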
5 changes: 5 additions & 0 deletions .gitignore
@@ -232,6 +232,7 @@ tag-mapping.json
*.tmp
deploy/anyscale/service.yaml
out
+temp.py

# build output
build/
@@ -248,3 +249,7 @@ prompts.txt
site/

*.orig
+
+__pycache__
+
+.secretenv.yml
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -19,7 +19,7 @@ repos:
hooks:
- id: mypy
# NOTE: Exclusions are handled in pyproject.toml
-files: aviary
+files: rayllm
exclude: tests
additional_dependencies:
- mypy-extensions
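
The hook filter simply points mypy at the new `rayllm` package instead of `aviary`. As a sketch, the updated hook could be exercised locally like this (assuming pre-commit is installed in the environment):

```shell
# Sketch: run only the mypy hook defined in .pre-commit-config.yaml over the repo.
pip install pre-commit
pre-commit run mypy --all-files
```
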
13 changes: 10 additions & 3 deletions Dockerfile
@@ -1,7 +1,8 @@
# syntax=docker/dockerfile:1.4
+# Note: TRTLLM backend is not included in the dockerfile, it is planned to be added in the future.

ARG RAY_IMAGE="anyscale/ray"
-ARG RAY_TAG="2.7.0oss-py39-cu118"
+ARG RAY_TAG="2.9.0-py39-cu121"

# Use Anyscale base image
FROM ${RAY_IMAGE}:${RAY_TAG} AS aviary
@@ -16,18 +17,24 @@ ARG RAY_GID=100
ENV RAY_SERVE_ENABLE_NEW_HANDLE_API=1
ENV RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1
ENV RAY_SERVE_ENABLE_JSON_LOGGING=1
+ENV RAY_SERVE_PROXY_PREFER_LOCAL_NODE_ROUTING=1
+ENV RAY_SERVE_HTTP_KEEP_ALIVE_TIMEOUT_S=310
+ENV RAY_metrics_report_batch_size=400

ENV FORCE_CUDA=1
ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV SAFETENSORS_FAST_GPU=1
+ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:$LD_LIBRARY_PATH
+ENV OMPI_ALLOW_RUN_AS_ROOT=1
+ENV OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1

# Remove this line if we need the CUDA packages
# and NVIDIA fixes their repository #ir-gleaming-sky
RUN sudo rm -v /etc/apt/sources.list.d/cuda.list

# Install torch first
RUN pip install --no-cache-dir -U pip \
-&& pip install --no-cache-dir -i https://download.pytorch.org/whl/cu118 torch torchvision torchaudio \
+&& pip install --no-cache-dir -i https://download.pytorch.org/whl/cu121 torch~=2.1.0 torchvision torchaudio \
&& pip install --no-cache-dir tensorboard ninja

# The build context should be the root of the repo
@@ -40,7 +47,7 @@ COPY --chown=${RAY_UID}:${RAY_GID} "./models/README.md" "${RAY_MODELS_DIR}/READM
RUN cd "${RAY_DIST_DIR}" \
# Update accelerate so transformers doesn't complain.
&& pip install --no-cache-dir -U accelerate \
-&& pip install --no-cache-dir -U "$(ls aviary-*.whl | head -n1)[frontend,backend]" \
+&& pip install --no-cache-dir -U "$(ls rayllm-*.whl | head -n1)[frontend,backend]" \
# Purge caches
&& pip cache purge || true \
&& conda clean -a \
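
Since the build context is expected to be the repository root and the image installs a locally built `rayllm-*.whl`, building it could look roughly like the following sketch (the image tag is illustrative, and it assumes the wheel has already been produced, e.g. with the `build_rayllm_wheel.sh` script from the file list):

```shell
# Sketch: build the RayLLM image from the repository root (the stated build context).
# Assumes rayllm-*.whl has already been built (e.g. via ./build_rayllm_wheel.sh)
# into the location the Dockerfile copies from; "rayllm:dev" is an illustrative tag.
docker build -t rayllm:dev .
```
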
4 changes: 2 additions & 2 deletions MANIFEST.in
@@ -1,5 +1,5 @@
-include README.md README.ipynb LICENSE *.sh
+include README.md LICENSE *.sh
recursive-include tests *.py
recursive-include models *.yaml
recursive-include examples *.*
-recursive-include aviary/frontend *.js
+recursive-include rayllm/frontend *.js
67 changes: 24 additions & 43 deletions README.md
@@ -1,8 +1,16 @@
+============================
+# Archiving Ray LLM
+
+We had started RayLLM to simplify setting up and deploying LLMs on top of Ray Serve. In the past few months, vLLM has made significant improvements in ease of use. We are archiving the RayLLM project and instead adding some examples to our [Ray Serve docs](https://docs.ray.io/en/master/serve/tutorials/vllm-example.html) for deploying LLMs with Ray Serve and vLLM. This will reduce another library for the community to learn about and greatly simplify the workflow to serve LLMs at scale. We also recently launched [Hosted Anyscale](https://www.anyscale.com/) where you can serve LLMs with Ray Serve with some more capabilities out of the box like multi-lora with serve multiplexing, JSON mode function calling and further performance enhancements.
+
+
+============================
# RayLLM - LLMs on Ray

-[![Build status](https://badge.buildkite.com/d6d7af987d1db222827099a953410c4e212b32e8199ca513be.svg?branch=master)](https://buildkite.com/anyscale/aviary-docker)
+The hosted Aviary Explorer is not available anymore.
+Visit [Anyscale](https://endpoints.anyscale.com) to experience models served with RayLLM.

-Try it now: [🦜🔍 Ray Aviary Explorer 🦜🔍](http://aviary.anyscale.com/)
+[![Build status](https://badge.buildkite.com/d6d7af987d1db222827099a953410c4e212b32e8199ca513be.svg?branch=master)](https://buildkite.com/anyscale/aviary-docker)

RayLLM (formerly known as Aviary) is an LLM serving solution that makes it easy to deploy and manage
a variety of open source LLMs, built on [Ray Serve](https://docs.ray.io/en/latest/serve/index.html). It does this by:
@@ -15,10 +23,11 @@ a variety of open source LLMs, built on [Ray Serve](https://docs.ray.io/en/lates
- Fully supporting multi-GPU & multi-node model deployments.
- Offering high performance features like continuous batching, quantization and streaming.
- Providing a REST API that is similar to OpenAI's to make it easy to migrate and cross test them.
+- Supporting multiple LLM backends out of the box, including [vLLM](https://github.com/vllm-project/vllm) and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM).

In addition to LLM serving, it also includes a CLI and a web frontend (Aviary Explorer) that you can use to compare the outputs of different models directly, rank them by quality, get a cost and latency estimate, and more.

-RayLLM supports continuous batching by integrating with [vLLM](https://github.com/vllm-project/vllm). Continuous batching allows you to get much better throughput and latency than static batching.
+RayLLM supports continuous batching and quantization by integrating with [vLLM](https://github.com/vllm-project/vllm). Continuous batching allows you to get much better throughput and latency than static batching. Quantization allows you to deploy compressed models with cheaper hardware requirements and lower inference costs. See [quantization guide](models/continuous_batching/quantization/README.md) for more details on running quantized models on RayLLM.

RayLLM leverages [Ray Serve](https://docs.ray.io/en/latest/serve/index.html), which has native support for autoscaling
and multi-node deployments. RayLLM can scale to zero and create
@@ -32,14 +41,14 @@ The guide below walks you through the steps required for deployment of RayLLM on

### Locally

-We highly recommend using the official `anyscale/aviary` Docker image to run RayLLM. Manually installing RayLLM is currently not a supported use-case due to specific dependencies required, some of which are not available on pip.
+We highly recommend using the official `anyscale/ray-llm` Docker image to run RayLLM. Manually installing RayLLM is currently not a supported use-case due to specific dependencies required, some of which are not available on pip.

```shell
cache_dir=${XDG_CACHE_HOME:-$HOME/.cache}

-docker run -it --gpus all --shm-size 1g -p 8000:8000 -e HF_HOME=~/data -v $cache_dir:~/data anyscale/aviary:latest bash
+docker run -it --gpus all --shm-size 1g -p 8000:8000 -e HF_HOME=~/data -v $cache_dir:~/data anyscale/ray-llm:latest bash
# Inside docker container
-aviary run --model ~/models/continuous_batching/amazon--LightGPT.yaml
+serve run ~/serve_configs/amazon--LightGPT.yaml
```

### On a Ray Cluster
@@ -57,7 +66,7 @@ export AWS_SESSION_TOKEN=...

Start by cloning this repo to your local machine.

-You may need to specify your AWS private key in the `deploy/ray/aviary-cluster.yaml` file.
+You may need to specify your AWS private key in the `deploy/ray/rayllm-cluster.yaml` file.
See [Ray on Cloud VMs](https://docs.ray.io/en/latest/cluster/vms/index.html) page in
Ray documentation for more details.

@@ -66,14 +75,14 @@ git clone https://github.com/ray-project/ray-llm.git
cd ray-llm

# Start a Ray Cluster (This will take a few minutes to start-up)
-ray up deploy/ray/aviary-cluster.yaml
+ray up deploy/ray/rayllm-cluster.yaml
```

#### Connect to your Cluster

```shell
# Connect to the Head node of your Ray Cluster (This will take several minutes to autoscale)
-ray attach deploy/ray/aviary-cluster.yaml
+ray attach deploy/ray/rayllm-cluster.yaml

# Deploy the LightGPT model.
serve run serve_configs/amazon--LightGPT.yaml
@@ -84,14 +93,14 @@ or define your own model YAML file and run that instead.

### On Kubernetes

-For Kubernetes deployments, please see our extensive documentation for [deploying Ray Serve on KubeRay](https://docs.ray.io/en/latest/serve/production-guide/kubernetes.html).
+For Kubernetes deployments, please see our documentation for [deploying on KubeRay](https://github.com/ray-project/ray-llm/tree/master/docs/kuberay).

## Query your models

Once the models are deployed, you can install a client outside of the Docker container to query the backend.

```shell
-pip install "aviary @ git+https://github.com/ray-project/ray-llm.git"
+pip install "rayllm @ git+https://github.com/ray-project/ray-llm.git"
```

You can query your RayLLM deployment in many ways.
@@ -219,47 +228,19 @@ print(chat_completion)
To install RayLLM and its dependencies, run the following command:

```shell
-pip install "aviary @ git+https://github.com/ray-project/ray-llm.git"
+pip install "rayllm @ git+https://github.com/ray-project/ray-llm.git"
```

RayLLM consists of a set of configurations and utilities for deploying LLMs on Ray Serve,
in addition to a frontend (Aviary Explorer), both of which come with additional
dependencies. To install the dependencies for the frontend run the following commands:

```shell
-pip install "aviary[frontend] @ git+https://github.com/ray-project/ray-llm.git"
+pip install "rayllm[frontend] @ git+https://github.com/ray-project/ray-llm.git"
```

The backend dependencies are heavy weight, and quite large. We recommend using the official
-`anyscale/aviary` image. Installing the backend manually is not a supported usecase.

-## Running Aviary Explorer locally
-
-The frontend is a [Gradio](https://gradio.app/) interface that allows you to interact
-with the models in the backend through a web interface.
-The Gradio app is served using [Ray Serve](https://docs.ray.io/en/latest/serve/index.html).
-
-To run the Aviary Explorer locally, you need to set the following environment variable:
-
-```shell
-export ENDPOINT_URL=<hostname of the backend, eg. 'http://localhost:8000'>
-```
-
-Once you have set these environment variables, you can run the frontend with the
-following command:
-
-```shell
-serve run aviary.frontend.app:app --non-blocking
-```
-
-You will be able to access it at `http://localhost:8000/frontend` in your browser.
-
-To just use the Gradio frontend without Ray Serve, you can start it
-with `python aviary/frontend/app.py`. In that case, the Gradio interface should be accessible at `http://localhost:7860` in your browser.
-If running the frontend yourself is not an option, you can still use
-[our hosted version](http://aviary.anyscale.com/) for your experiments.
-
-Note that the frontend will not dynamically update the list of models should they change in the backend. In order for the frontend to update, you will need to restart it.
+`anyscale/ray-llm` image. Installing the backend manually is not a supported usecase.

### Usage stats collection

@@ -307,7 +288,7 @@ Run multiple models at once by aggregating the Serve configs for different model

 applications:
 - name: router
-  import_path: aviary.backend:router_application
+  import_path: rayllm.backend:router_application
   route_prefix: /
   args:
     models:
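
The README above describes an OpenAI-like REST API served behind `route_prefix: /`. As a rough sketch, a local deployment such as the LightGPT example could be queried along the following lines; the port comes from the `docker run` command above, while the `/v1/chat/completions` path and the `amazon/LightGPT` model id are assumptions based on the OpenAI-style API and the serve config file name:

```shell
# Sketch: query a local RayLLM deployment through its OpenAI-compatible API.
# Assumes the server is listening on localhost:8000 and exposes an OpenAI-style
# /v1/chat/completions route; "amazon/LightGPT" is inferred from the config name.
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "amazon/LightGPT",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}]
      }'
```
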
6 changes: 0 additions & 6 deletions aviary/__init__.py

This file was deleted.

4 changes: 0 additions & 4 deletions aviary/backend/__init__.py

This file was deleted.

14 changes: 0 additions & 14 deletions aviary/backend/llm/dict_utils.py

This file was deleted.
