defmodule WandererApp.Map.MapPoolCrashIntegrationTest do
  @moduledoc """
  Integration tests for MapPool crash recovery.

  These tests verify end-to-end crash recovery behavior, including:

  - MapPool GenServer crashes and restarts
  - State recovery from ETS
  - Registry and cache consistency after recovery
  - Telemetry events during recovery
  - Multi-pool scenarios

  Note: Many tests are skipped because they require the full map
  infrastructure (database, Server.Impl, map data, etc.) to be set up.
  """

  use WandererApp.IntegrationCase, async: false

  import Mox

  setup :verify_on_exit!

  alias WandererApp.Map.{MapPool, MapPoolDynamicSupervisor, MapPoolState}

  @cache :map_pool_cache
  @registry :map_pool_registry
  @unique_registry :unique_map_pool_registry
  @ets_table :map_pool_state_table

  setup do
    # Clean up any existing test data
    cleanup_test_data()

    # Check if required infrastructure is running
    supervisor_running? = Process.whereis(MapPoolDynamicSupervisor) != nil

    # :ets.whereis/1 returns :undefined for a missing named table, so this
    # check cannot raise and needs no rescue (matches cleanup_test_data/0).
    ets_exists? = :ets.whereis(@ets_table) != :undefined

    on_exit(fn ->
      cleanup_test_data()
    end)

    {:ok, supervisor_running: supervisor_running?, ets_exists: ets_exists?}
  end

  defp cleanup_test_data do
    # Clean up test caches
    WandererApp.Cache.delete("started_maps")
    Cachex.clear(@cache)

    # Clean up ETS entries (stored as 3-tuples)
    if :ets.whereis(@ets_table) != :undefined do
      :ets.match_delete(@ets_table, {:"$1", :"$2", :"$3"})
    end
  end

  defp find_pool_pid(uuid) do
    pool_name = Module.concat(MapPool, uuid)

    case Registry.lookup(@unique_registry, pool_name) do
      [{pid, _value}] -> {:ok, pid}
      [] -> {:error, :not_found}
    end
  end
describe "End-to-end crash recovery" do
|
|
@tag :skip
|
|
@tag :integration
|
|
test "MapPool recovers all maps after abnormal crash" do
|
|
# This test would:
|
|
# 1. Start a MapPool with test maps via MapPoolDynamicSupervisor
|
|
# 2. Verify maps are running and state is in ETS
|
|
# 3. Simulate crash using GenServer.call(pool_pid, :error)
|
|
# 4. Wait for supervisor to restart the pool
|
|
# 5. Verify all maps are recovered
|
|
# 6. Verify Registry, Cache, and ETS are consistent
|
|
|
|
# Requires:
|
|
# - Test map data in database
|
|
# - Server.Impl.start_map to work with test data
|
|
# - Full supervision tree running
|
|
|
|
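
      # Hedged sketch, assuming a pool is already running under `uuid`:
      uuid = "test-pool-#{:rand.uniform(1_000_000)}"
      {:ok, old_pid} = find_pool_pid(uuid)
      {:ok, map_ids} = MapPoolState.get_pool_state(uuid)

      # Crash the pool; the :error call exits the caller, so trap it.
      catch_exit(GenServer.call(old_pid, :error))

      # Poll until the supervisor re-registers the pool under the same name.
      new_pid =
        Enum.reduce_while(1..50, nil, fn _, _ ->
          case find_pool_pid(uuid) do
            {:ok, pid} when pid != old_pid ->
              {:halt, pid}

            _ ->
              Process.sleep(100)
              {:cont, nil}
          end
        end)

      assert is_pid(new_pid)
      # The recovered pool should report the same map_ids it held before.
      assert {:ok, ^map_ids} = MapPoolState.get_pool_state(uuid)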

      :ok
    end

    @tag :skip
    @tag :integration
    test "MapPool preserves ETS state on abnormal termination" do
      # This test would:
      # 1. Start a MapPool with maps
      # 2. Force a crash
      # 3. Verify the ETS state is preserved (not deleted)
      # 4. Verify a new pool instance recovers from ETS
      #
      # The preservation check is sketched below.
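
      # Hedged sketch: seeds ETS through MapPoolState (the API used by the
      # supervisor test below) and kills the pool if one is registered.
      uuid = "test-pool-#{:rand.uniform(1_000_000)}"
      MapPoolState.save_pool_state(uuid, [1, 2, 3])

      case find_pool_pid(uuid) do
        # A brutal kill skips terminate/2, so ETS must not be cleaned up.
        {:ok, pid} -> Process.exit(pid, :kill)
        {:error, :not_found} -> :ok
      end

      # The saved state should still be readable after the crash.
      assert {:ok, [1, 2, 3]} = MapPoolState.get_pool_state(uuid)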

      :ok
    end

    @tag :skip
    @tag :integration
    test "MapPool cleans ETS state on graceful shutdown" do
      # This test would:
      # 1. Start a MapPool with maps
      # 2. Gracefully stop the pool (GenServer.cast(pool_pid, :stop))
      # 3. Verify the ETS state is deleted
      # 4. Verify a new pool starts with empty state
      #
      # The shutdown check is sketched below.
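
      # Hedged sketch: assumes a running pool and the :stop cast from step 2;
      # the raw :ets.lookup/2 check assumes the table is keyed by pool uuid.
      uuid = "test-pool-#{:rand.uniform(1_000_000)}"
      {:ok, pool_pid} = find_pool_pid(uuid)

      ref = Process.monitor(pool_pid)
      GenServer.cast(pool_pid, :stop)
      assert_receive {:DOWN, ^ref, :process, ^pool_pid, _reason}, 5_000

      # Graceful shutdown should have removed the pool's ETS entry.
      assert :ets.lookup(@ets_table, uuid) == []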

      :ok
    end
  end
describe "Multi-pool crash scenarios" do
|
|
@tag :skip
|
|
@tag :integration
|
|
test "multiple pools crash and recover independently" do
|
|
# This test would:
|
|
# 1. Start multiple MapPool instances with different maps
|
|
# 2. Crash one pool
|
|
# 3. Verify only that pool recovers, others unaffected
|
|
# 4. Verify no cross-pool state corruption
|
|
|
|
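
      # Hedged sketch: seeds two pools' state via MapPoolState, kills one,
      # and asserts the other's ETS state is untouched.
      uuid_a = "test-pool-a-#{:rand.uniform(1_000_000)}"
      uuid_b = "test-pool-b-#{:rand.uniform(1_000_000)}"
      MapPoolState.save_pool_state(uuid_a, [1, 2])
      MapPoolState.save_pool_state(uuid_b, [3, 4])

      with {:ok, pid_a} <- find_pool_pid(uuid_a) do
        Process.exit(pid_a, :kill)
      end

      # Pool B must be unaffected by pool A's crash, and pool A's state
      # must survive the abnormal exit for recovery.
      assert {:ok, [3, 4]} = MapPoolState.get_pool_state(uuid_b)
      assert {:ok, [1, 2]} = MapPoolState.get_pool_state(uuid_a)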

      :ok
    end

    @tag :skip
    @tag :integration
    test "concurrent pool crashes don't corrupt recovery state" do
      # This test would:
      # 1. Start multiple pools
      # 2. Crash multiple pools simultaneously
      # 3. Verify all pools recover correctly
      # 4. Verify no ETS corruption or race conditions

      :ok
    end
  end
describe "State consistency after recovery" do
|
|
@tag :skip
|
|
@tag :integration
|
|
test "Registry state matches recovered state" do
|
|
# This test would verify that after recovery:
|
|
# - unique_registry has correct map_ids for pool UUID
|
|
# - map_pool_registry has correct pool UUID entry
|
|
# - All map_ids in Registry match ETS state
|
|
|
|
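
      # Hedged sketch: the registry value shapes are assumptions inferred
      # from the comments above (unique_registry holds the pool's map_ids;
      # @registry holds one entry per pool uuid).
      uuid = "test-pool-#{:rand.uniform(1_000_000)}"
      pool_name = Module.concat(MapPool, uuid)
      {:ok, ets_map_ids} = MapPoolState.get_pool_state(uuid)

      assert [{_pid, registry_map_ids}] = Registry.lookup(@unique_registry, pool_name)
      assert Enum.sort(registry_map_ids) == Enum.sort(ets_map_ids)

      # The pool itself should be registered in the pool registry.
      refute Registry.lookup(@registry, uuid) == []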

      :ok
    end

    @tag :skip
    @tag :integration
    test "Cache state matches recovered state" do
      # This test would verify that after recovery:
      # - map_pool_cache has the correct map_id -> uuid mappings
      # - the started_maps cache includes all recovered maps
      # - there are no orphaned cache entries
      #
      # The map_pool_cache check is sketched below.
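
      # Hedged sketch: assumes @cache stores map_id -> pool uuid entries,
      # per the comment above.
      uuid = "test-pool-#{:rand.uniform(1_000_000)}"
      {:ok, map_ids} = MapPoolState.get_pool_state(uuid)

      for map_id <- map_ids do
        assert {:ok, ^uuid} = Cachex.get(@cache, map_id)
      end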

      :ok
    end

    @tag :skip
    @tag :integration
    test "Map servers are actually running after recovery" do
      # This test would:
      # 1. Recover maps from crash
      # 2. Verify each map's GenServer is actually running
      # 3. Verify maps respond to requests
      # 4. Verify map state is correct

      :ok
    end
  end
describe "Recovery failure handling" do
|
|
@tag :skip
|
|
@tag :integration
|
|
test "recovery continues when individual map fails to start" do
|
|
# This test would:
|
|
# 1. Save state with maps [1, 2, 3] to ETS
|
|
# 2. Delete map 2 from database
|
|
# 3. Trigger recovery
|
|
# 4. Verify maps 1 and 3 recover successfully
|
|
# 5. Verify map 2 failure is logged and telemetry emitted
|
|
# 6. Verify pool continues with maps [1, 3]
|
|
|
|
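
      # Hedged sketch: listens for the :map_failed event (the same event
      # name used in the telemetry tests further down).
      test_pid = self()

      :telemetry.attach(
        "test-partial-recovery",
        [:wanderer_app, :map_pool, :recovery, :map_failed],
        fn event, measurements, metadata, _config ->
          send(test_pid, {:telemetry_event, event, measurements, metadata})
        end,
        nil
      )

      # ... trigger recovery here (requires the full infrastructure) ...

      assert_receive {:telemetry_event,
                      [:wanderer_app, :map_pool, :recovery, :map_failed], _, _},
                     1_000

      :telemetry.detach("test-partial-recovery")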

      :ok
    end

    @tag :skip
    @tag :integration
    test "recovery handles maps already running in different pool" do
      # This test would simulate a race condition where:
      # 1. Pool A crashes with map X
      # 2. Before recovery, map X is started in Pool B
      # 3. Pool A tries to recover map X
      # 4. Verify conflict is detected and handled gracefully

      :ok
    end

    @tag :skip
    @tag :integration
    test "recovery handles corrupted ETS state" do
      # This test would:
      # 1. Manually corrupt ETS state (invalid map IDs, wrong types, etc.)
      # 2. Trigger recovery
      # 3. Verify the pool handles corruption gracefully
      # 4. Verify telemetry is emitted for failures
      # 5. Verify the pool continues with valid maps only
      #
      # Step 1 is sketched below.
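
      # Hedged sketch of step 1: inserts a malformed 3-tuple directly,
      # matching the {:"$1", :"$2", :"$3"} shape that cleanup_test_data/0
      # deletes; the field meanings are assumptions.
      uuid = "test-pool-#{:rand.uniform(1_000_000)}"

      if :ets.whereis(@ets_table) != :undefined do
        :ets.insert(@ets_table, {uuid, :not_a_map_id_list, "bad-timestamp"})
      end

      # ... trigger recovery, then assert the pool is still alive, e.g.
      # via find_pool_pid/1.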

      :ok
    end
  end
describe "Telemetry during recovery" do
|
|
test "telemetry events emitted in correct order", %{ets_exists: ets_exists?} do
|
|
if ets_exists? do
|
|
test_pid = self()
|
|
events = []
|
|
|
|
# Attach handlers for all recovery events
|
|
:telemetry.attach_many(
|
|
"test-recovery-events",
|
|
[
|
|
[:wanderer_app, :map_pool, :recovery, :start],
|
|
[:wanderer_app, :map_pool, :recovery, :complete],
|
|
[:wanderer_app, :map_pool, :recovery, :map_failed]
|
|
],
|
|
fn event, measurements, metadata, _config ->
|
|
send(test_pid, {:telemetry_event, event, measurements, metadata})
|
|
end,
|
|
nil
|
|
)
|
|
|
|
uuid = "test-pool-#{:rand.uniform(1_000_000)}"
|
|
|
|
# Simulate recovery sequence
|
|
# 1. Start event
|
|
:telemetry.execute(
|
|
[:wanderer_app, :map_pool, :recovery, :start],
|
|
%{recovered_map_count: 3, total_map_count: 3},
|
|
%{pool_uuid: uuid}
|
|
)
|
|
|
|
# 2. Complete event (in real recovery, this comes after all maps start)
|
|
:telemetry.execute(
|
|
[:wanderer_app, :map_pool, :recovery, :complete],
|
|
%{recovered_count: 3, failed_count: 0, duration_ms: 100},
|
|
%{pool_uuid: uuid}
|
|
)
|
|
|
|
# Verify we received both events
|
|
assert_receive {:telemetry_event, [:wanderer_app, :map_pool, :recovery, :start], _, _},
|
|
500
|
|
|
|
assert_receive {:telemetry_event, [:wanderer_app, :map_pool, :recovery, :complete], _, _},
|
|
500
|
|
|
|
:telemetry.detach("test-recovery-events")
|
|
else
|
|
:ok
|
|
end
|
|
end

    @tag :skip
    @tag :integration
    test "telemetry includes accurate recovery statistics" do
      # This test would verify that:
      # - recovered_map_count matches actual recovered maps
      # - failed_count matches actual failed maps
      # - duration_ms is accurate
      # - All metadata is correct

      :ok
    end
  end
describe "Interaction with Reconciler" do
|
|
@tag :skip
|
|
@tag :integration
|
|
test "Reconciler doesn't interfere with crash recovery" do
|
|
# This test would:
|
|
# 1. Crash a pool with maps
|
|
# 2. Trigger both recovery and reconciliation
|
|
# 3. Verify they don't conflict
|
|
# 4. Verify final state is consistent
|
|
|
|
:ok
|
|
end
|
|
|
|
@tag :skip
|
|
@tag :integration
|
|
test "Reconciler detects failed recovery" do
|
|
# This test would:
|
|
# 1. Crash a pool with map X
|
|
# 2. Make recovery fail for map X
|
|
# 3. Run reconciler
|
|
# 4. Verify reconciler detects and potentially fixes the issue
|
|
|
|
:ok
|
|
end
|
|
end
|
|
|
|
describe "Edge cases" do
|
|
@tag :skip
|
|
@tag :integration
|
|
test "recovery during pool at capacity" do
|
|
# This test would:
|
|
# 1. Create pool with 19 maps
|
|
# 2. Crash pool while adding 20th map
|
|
# 3. Verify recovery handles capacity limit
|
|
# 4. Verify all maps start or overflow is handled
|
|
|
|
:ok
|
|
end
|
|
|
|
@tag :skip
|
|
@tag :integration
|
|
test "recovery with empty map list" do
|
|
# This test would:
|
|
# 1. Crash pool with empty map_ids
|
|
# 2. Verify recovery completes successfully
|
|
# 3. Verify pool starts with no maps
|
|
|
|
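
      # Hedged sketch: an empty map list should round-trip through
      # MapPoolState; full recovery behavior needs the real infrastructure.
      uuid = "test-pool-#{:rand.uniform(1_000_000)}"
      MapPoolState.save_pool_state(uuid, [])

      assert {:ok, []} = MapPoolState.get_pool_state(uuid)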

      :ok
    end

    @tag :skip
    @tag :integration
    test "multiple crashes in quick succession" do
      # This test would:
      # 1. Crash pool
      # 2. Immediately crash again during recovery
      # 3. Verify supervisor's max_restarts is respected
      # 4. Verify state remains consistent

      :ok
    end
  end
describe "Performance under load" do
|
|
@tag :slow
|
|
@tag :skip
|
|
@tag :integration
|
|
test "recovery completes within 2 seconds for 20 maps" do
|
|
# This test would:
|
|
# 1. Create pool with 20 maps (pool limit)
|
|
# 2. Crash pool
|
|
# 3. Measure time to full recovery
|
|
# 4. Assert recovery < 2 seconds
|
|
|
|
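
      # Hedged sketch: how recovery completion is detected (polling
      # find_pool_pid/1 here) is an assumption.
      uuid = "test-pool-#{:rand.uniform(1_000_000)}"
      {:ok, old_pid} = find_pool_pid(uuid)

      started_at = System.monotonic_time(:millisecond)
      catch_exit(GenServer.call(old_pid, :error))

      # Poll until a new pool process is registered, then check the clock.
      Enum.reduce_while(1..200, nil, fn _, _ ->
        case find_pool_pid(uuid) do
          {:ok, pid} when pid != old_pid ->
            {:halt, pid}

          _ ->
            Process.sleep(10)
            {:cont, nil}
        end
      end)

      elapsed_ms = System.monotonic_time(:millisecond) - started_at
      assert elapsed_ms < 2_000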

      :ok
    end

    @tag :slow
    @tag :skip
    @tag :integration
    test "recovery doesn't block other pools" do
      # This test would:
      # 1. Start multiple pools
      # 2. Crash one pool with many maps
      # 3. Verify other pools continue to operate normally during recovery
      # 4. Measure performance impact on healthy pools

      :ok
    end
  end
describe "Supervisor interaction" do
|
|
test "ETS table survives individual pool crash", %{ets_exists: ets_exists?} do
|
|
if ets_exists? do
|
|
# Verify ETS table is owned by supervisor, not individual pools
|
|
table_info = :ets.info(@ets_table)
|
|
owner_pid = Keyword.get(table_info, :owner)
|
|
|
|
# Owner should be alive and be the supervisor or a system process
|
|
assert Process.alive?(owner_pid)
|
|
|
|
# Verify we can still access the table
|
|
uuid = "test-pool-#{:rand.uniform(1_000_000)}"
|
|
MapPoolState.save_pool_state(uuid, [1, 2, 3])
|
|
assert {:ok, [1, 2, 3]} = MapPoolState.get_pool_state(uuid)
|
|
else
|
|
:ok
|
|
end
|
|
end

    @tag :skip
    @tag :integration
    test "supervisor restarts pool after crash" do
      # This test would:
      # 1. Start a pool via DynamicSupervisor
      # 2. Crash the pool
      # 3. Verify the supervisor restarts it
      # 4. Verify the new PID is different from the old PID
      # 5. Verify the pool is functional after restart
      #
      # The crash-detection part is sketched below; see the fuller
      # restart sketch in "MapPool recovers all maps after abnormal crash".
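
      # Hedged sketch: reuses find_pool_pid/1 and the :error crash call
      # described in the end-to-end test above.
      uuid = "test-pool-#{:rand.uniform(1_000_000)}"
      {:ok, old_pid} = find_pool_pid(uuid)

      ref = Process.monitor(old_pid)
      catch_exit(GenServer.call(old_pid, :error))
      assert_receive {:DOWN, ^ref, :process, ^old_pid, _reason}, 5_000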

      :ok
    end
  end
describe "Database consistency" do
|
|
@tag :skip
|
|
@tag :integration
|
|
test "recovered maps load latest state from database" do
|
|
# This test would:
|
|
# 1. Start maps with initial state
|
|
# 2. Modify map state in database
|
|
# 3. Crash pool
|
|
# 4. Verify recovered maps have latest database state
|
|
|
|
:ok
|
|
end
|
|
|
|
@tag :skip
|
|
@tag :integration
|
|
test "recovery uses MapState for map configuration" do
|
|
# This test would:
|
|
# 1. Verify recovery calls WandererApp.Map.get_map_state!/1
|
|
# 2. Verify state comes from database MapState table
|
|
# 3. Verify maps start with correct configuration
|
|
|
|
:ok
|
|
end
|
|
end
|
|
|
|
describe "Real-world scenarios" do
|
|
@tag :skip
|
|
@tag :integration
|
|
test "recovery after OOM crash" do
|
|
# This test would simulate recovery after out-of-memory crash:
|
|
# 1. Start pool with maps
|
|
# 2. Simulate OOM condition
|
|
# 3. Verify recovery completes successfully
|
|
# 4. Verify no memory leaks after recovery
|
|
|
|
:ok
|
|
end
|
|
|
|
@tag :skip
|
|
@tag :integration
|
|
test "recovery after network partition" do
|
|
# This test would simulate recovery after network issues:
|
|
# 1. Start maps with external dependencies
|
|
# 2. Simulate network partition
|
|
# 3. Crash pool
|
|
# 4. Verify recovery handles network errors gracefully
|
|
|
|
:ok
|
|
end
|
|
|
|
@tag :skip
|
|
@tag :integration
|
|
test "recovery preserves user sessions" do
|
|
# This test would:
|
|
# 1. Start maps with active user sessions
|
|
# 2. Crash pool
|
|
# 3. Verify users can continue after recovery
|
|
# 4. Verify presence tracking works after recovery
|
|
|
|
:ok
|
|
end
|
|
end
|
|
end
|