import asyncio
import subprocess
import tempfile
import time

import pytest

from tensorrt_llm.llmapi.disagg_utils import (DisaggClusterConfig,
                                              MinimalInstances, ServerRole)
from tensorrt_llm.serve.auto_scaling import ClusterManager, ClusterWorker
from tensorrt_llm.serve.cluster_storage import (WatchEventType,
                                                create_cluster_storage,
                                                create_cluster_storage_client)

from .test_cluster_storage import http_server_storage, pytest_async_fixture

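# Short heartbeat/timeout values so the worker-expiry tests below finish quickly.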
INACTIVE_TIMEOUT = 4
HEARTBEAT_INTERVAL = 2

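# Storage backends to exercise; "etcd" is also supported by the fixtures below
# but needs a local etcd binary on PATH.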
storage_types = ["http"]


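# Map a storage backend name to the cluster storage URI used by the fixtures.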
def get_uri(storage_type):
    if storage_type == "http":
        return "http://localhost:18000"
    elif storage_type == "etcd":
        return "etcd://localhost:2379"
    else:
        raise ValueError(f"Invalid storage type: {storage_type}")


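# Indirectly parametrized with the storage type; the cluster is considered
# ready once one context and one generation server are registered.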
@pytest.fixture(scope="module")
def config(request):
    cluster_uri = get_uri(request.param)
    return DisaggClusterConfig(cluster_uri=cluster_uri,
                               cluster_name="test",
                               minimal_instances=MinimalInstances(
                                   context_servers=1, generation_servers=1),
                               inactive_timeout=INACTIVE_TIMEOUT,
                               heartbeat_interval=HEARTBEAT_INTERVAL)


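# Start the backing store for the configured URI: an in-process HTTP storage
# server, or a local etcd daemon launched as a subprocess.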
@pytest.fixture(scope="module")
def storage_server(config):
    if config.cluster_uri.startswith("http"):
        port = 18000
        server, cluster_storage = http_server_storage(port)
        with server.run_in_thread():
            yield cluster_storage, config.cluster_uri
    elif config.cluster_uri.startswith("etcd"):
        with tempfile.TemporaryDirectory() as temp_dir:
            etcd = subprocess.Popen(
                ["etcd", "--data-dir", temp_dir, "--log-level", "debug"])
            time.sleep(2)  # wait for etcd to start
            yield create_cluster_storage(
                config.cluster_uri, config.cluster_name), config.cluster_uri
            etcd.kill()
            etcd.wait()
    else:
        raise ValueError(f"Invalid cluster storage URI: {config.cluster_uri}")


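# Client-side handle to the shared storage; used by the workers in the tests.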
@pytest_async_fixture(scope="module")
async def storage_client(storage_server):
    _, cluster_uri = storage_server
    return create_cluster_storage_client(cluster_uri, "test")


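# Module-scoped ClusterManager backed by the shared storage; stopped at teardown.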
@pytest_async_fixture(scope="module")
async def cluster_manager(config, storage_server):
    storage, cluster_uri = storage_server
    manager = ClusterManager(config, storage)
    await manager.start()
    yield manager
    await manager.stop()


@pytest.mark.parametrize("config", storage_types, indirect=True)
@pytest.mark.threadleak(enabled=False)
@pytest.mark.asyncio(scope="module")
async def test_init_workers_first(config, storage_server):
    try:
        # Register the workers before creating the manager, so the manager can
        # pick up the pre-registered workers when it starts watching.
        server, storage_uri = storage_server
        storage_client = create_cluster_storage_client(storage_uri, "test")
        ctx_worker = ClusterWorker(ServerRole.CONTEXT, "127.0.0.1", 8001,
                                   config, storage_client)
        gen_worker = ClusterWorker(ServerRole.GENERATION, "127.0.0.1", 8002,
                                   config, storage_client)
        await ctx_worker.register_worker()
        await gen_worker.register_worker()

        cluster_manager = ClusterManager(config, server)
        await cluster_manager.start()
        existing_workers = await cluster_manager.watch_workers(
            get_existing_first=True)
        assert set([worker.worker_id for worker in existing_workers]) == {
            ctx_worker.worker_id,
            gen_worker.worker_id,
        }

        assert await cluster_manager.is_ready() == True
    finally:
        await ctx_worker.deregister_worker()
        await gen_worker.deregister_worker()


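# Register and deregister a context and a generation worker one at a time and
# check the manager's worker counts, watch events, and readiness at each step.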
@pytest.mark.parametrize("config", storage_types, indirect=True)
@pytest.mark.threadleak(enabled=False)
@pytest.mark.timeout(20)
@pytest.mark.asyncio(scope="module")
async def test_cluster_manager(cluster_manager, storage_client, config):
    try:
        assert cluster_manager.current_ctx_worker_num == 0
        assert cluster_manager.current_gen_worker_num == 0
        await cluster_manager.watch_workers()
        try:
            await asyncio.wait_for(cluster_manager.get_worker_events(),
                                   timeout=1)
        except asyncio.TimeoutError:
            pass
        assert await cluster_manager.is_ready() == False

        ctx_worker = ClusterWorker(ServerRole.CONTEXT, "127.0.0.1", 8001,
                                   config, storage_client)
        await cluster_manager.watch_workers()
        await ctx_worker.register_worker()
        worker_events = await cluster_manager.get_worker_events()
        assert worker_events == [(ctx_worker.worker_info, WatchEventType.SET)]
        assert cluster_manager.current_ctx_worker_num == 1
        assert cluster_manager.current_gen_worker_num == 0
        assert await cluster_manager.is_ready() == False

        gen_worker = ClusterWorker(ServerRole.GENERATION, "127.0.0.1", 8002,
                                   config, storage_client)
        await gen_worker.register_worker()
        worker_events = await cluster_manager.get_worker_events()
        assert worker_events == [(gen_worker.worker_info, WatchEventType.SET)]
        assert cluster_manager.current_ctx_worker_num == 1
        assert cluster_manager.current_gen_worker_num == 1
        assert await cluster_manager.is_ready() == True

        await ctx_worker.deregister_worker()
        worker_events = await cluster_manager.get_worker_events()
        assert worker_events == [(ctx_worker.worker_info,
                                  WatchEventType.DELETE)]
        assert cluster_manager.current_ctx_worker_num == 0
        assert cluster_manager.current_gen_worker_num == 1
        assert await cluster_manager.is_ready() == False

        await gen_worker.deregister_worker()
        worker_events = await cluster_manager.get_worker_events()
        assert worker_events == [(gen_worker.worker_info,
                                  WatchEventType.DELETE)]
        assert cluster_manager.current_ctx_worker_num == 0
        assert cluster_manager.current_gen_worker_num == 0
        assert await cluster_manager.is_ready() == False
    finally:
        await ctx_worker.deregister_worker()
        await gen_worker.deregister_worker()


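# Workers keep heartbeating while the validator returns True; once heartbeats
# stop, the manager should emit DELETE events after the inactive timeout and
# report the cluster as not ready.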
@pytest.mark.timeout(20)
@pytest.mark.parametrize("config", storage_types, indirect=True)
@pytest.mark.threadleak(enabled=False)
@pytest.mark.asyncio(scope="module")
async def test_cluster_worker(cluster_manager, storage_client, config):

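    # Drain worker events from the manager until the expected numbers of SET
    # (new worker) and DELETE (dead worker) events have been collected.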
    async def wait_for_worker_events(expected_new_event_num,
                                     expected_dead_event_num):
        new_worker_ids = []
        dead_workers_ids = []
        while len(new_worker_ids) < expected_new_event_num or len(
                dead_workers_ids) < expected_dead_event_num:
            try:
                worker_events = await asyncio.wait_for(
                    cluster_manager.get_worker_events(), timeout=2)
                new_workers = [
                    worker_info.worker_id
                    for worker_info, event_type in worker_events
                    if event_type == WatchEventType.SET
                ]
                dead_workers = [
                    worker_info.worker_id
                    for worker_info, event_type in worker_events
                    if event_type == WatchEventType.DELETE
                ]
                print(f"Worker events: {worker_events} {time.time()}")
                new_worker_ids += new_workers
                dead_workers_ids += dead_workers
            except asyncio.TimeoutError:
                pass
        return new_worker_ids, dead_workers_ids

    try:
        await cluster_manager.start()
        await cluster_manager.watch_workers()
        ctx_worker = ClusterWorker(ServerRole.CONTEXT, "127.0.0.1", 8001,
                                   config, storage_client)
        gen_worker = ClusterWorker(ServerRole.GENERATION, "127.0.0.1", 8002,
                                   config, storage_client)

        keep_heartbeat = True
        assert await ctx_worker.register_worker(
            validator=lambda: keep_heartbeat)
        assert await gen_worker.register_worker(
            validator=lambda: keep_heartbeat)
        worker_ids = set([ctx_worker.worker_id, gen_worker.worker_id])
        new_worker_ids, dead_workers_ids = await wait_for_worker_events(2, 0)
        assert set(new_worker_ids) == worker_ids
        assert len(dead_workers_ids) == 0
        assert await cluster_manager.is_ready() == True

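        # With heartbeats still running, both workers must remain registered
        # even after the inactive timeout has elapsed.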
        await asyncio.sleep(config.inactive_timeout + 1)
        assert await cluster_manager.is_ready() == True

        # Stop heartbeats; both workers should then expire and be deleted.
        keep_heartbeat = False
        new_worker_ids, dead_workers_ids = await wait_for_worker_events(0, 2)
        assert len(new_worker_ids) == 0
        assert len(dead_workers_ids) == 2
        assert set(dead_workers_ids) == worker_ids
        assert await cluster_manager.is_ready() == False
    finally:
        await ctx_worker.deregister_worker()
        await gen_worker.deregister_worker()