Files
wanderer/test/integration/map/map_pool_crash_integration_test.exs
2025-11-24 11:33:08 +01:00

468 lines
13 KiB
Elixir

defmodule WandererApp.Map.MapPoolCrashIntegrationTest do
@moduledoc """
Integration tests for MapPool crash recovery.
These tests verify end-to-end crash recovery behavior including:
- MapPool GenServer crashes and restarts
- State recovery from ETS
- Registry and cache consistency after recovery
- Telemetry events during recovery
- Multi-pool scenarios
Note: Many tests are skipped as they require full map infrastructure
(database, Server.Impl, map data, etc.) to be set up.
"""
use WandererApp.IntegrationCase, async: false
import Mox
setup :verify_on_exit!
alias WandererApp.Map.{MapPool, MapPoolDynamicSupervisor, MapPoolState}
@cache :map_pool_cache
@registry :map_pool_registry
@unique_registry :unique_map_pool_registry
@ets_table :map_pool_state_table
# Per-test setup: wipes shared test state before (and, via `on_exit`, after)
# each test and reports which pieces of the MapPool infrastructure are
# available.
#
# Adds to the test context:
#   * `:supervisor_running` - whether `MapPoolDynamicSupervisor` is registered
#   * `:ets_exists`         - whether the pool-state ETS table exists
setup do
  # Start every test from a clean slate.
  cleanup_test_data()

  supervisor_running? = Process.whereis(MapPoolDynamicSupervisor) != nil

  # `:ets.whereis/1` returns `:undefined` for a missing named table instead of
  # raising, so no `try/rescue` is needed. This is the same existence check
  # that `cleanup_test_data/0` performs.
  ets_exists? = :ets.whereis(@ets_table) != :undefined

  on_exit(fn -> cleanup_test_data() end)

  {:ok, supervisor_running: supervisor_running?, ets_exists: ets_exists?}
end
# Removes state shared between tests: the "started_maps" cache key, every
# entry of the map pool Cachex cache, and every 3-tuple row of the pool-state
# ETS table. The ETS step is skipped when the table has not been created.
defp cleanup_test_data do
  WandererApp.Cache.delete("started_maps")
  Cachex.clear(@cache)

  case :ets.whereis(@ets_table) do
    :undefined ->
      nil

    _table_ref ->
      # The pattern matches any 3-element tuple stored in the table.
      :ets.match_delete(@ets_table, {:"$1", :"$2", :"$3"})
  end
end
# Looks up the process registered for a pool in the unique pool registry.
# The lookup key is the atom produced by `Module.concat(MapPool, uuid)`.
#
# Returns `{:ok, pid}` when exactly one entry is registered and
# `{:error, :not_found}` when there is none.
#
# NOTE(review): `Module.concat/2` creates a new atom per uuid; acceptable for
# a handful of test uuids, but confirm this matches how pools register.
defp find_pool_pid(uuid) do
  @unique_registry
  |> Registry.lookup(Module.concat(MapPool, uuid))
  |> case do
    [] -> {:error, :not_found}
    [{pid, _value}] -> {:ok, pid}
  end
end
describe "End-to-end crash recovery" do
  # Every test in this group is a skipped placeholder for an integration test.
  @describetag :skip
  @describetag :integration

  test "MapPool recovers all maps after abnormal crash" do
    # Planned steps:
    #   1. Start a MapPool with test maps via MapPoolDynamicSupervisor
    #   2. Verify maps are running and state is in ETS
    #   3. Simulate crash using GenServer.call(pool_pid, :error)
    #   4. Wait for supervisor to restart the pool
    #   5. Verify all maps are recovered
    #   6. Verify Registry, Cache, and ETS are consistent
    #
    # Prerequisites:
    #   - Test map data in database
    #   - Server.Impl.start_map to work with test data
    #   - Full supervision tree running
    :ok
  end

  test "MapPool preserves ETS state on abnormal termination" do
    # Planned steps:
    #   1. Start a MapPool with maps
    #   2. Force crash
    #   3. Verify ETS state is preserved (not deleted)
    #   4. Verify new pool instance recovers from ETS
    :ok
  end

  test "MapPool cleans ETS state on graceful shutdown" do
    # Planned steps:
    #   1. Start a MapPool with maps
    #   2. Gracefully stop the pool (GenServer.cast(pool_pid, :stop))
    #   3. Verify ETS state is deleted
    #   4. Verify new pool starts with empty state
    :ok
  end
end
describe "Multi-pool crash scenarios" do
  # Every test in this group is a skipped placeholder for an integration test.
  @describetag :skip
  @describetag :integration

  test "multiple pools crash and recover independently" do
    # Planned steps:
    #   1. Start multiple MapPool instances with different maps
    #   2. Crash one pool
    #   3. Verify only that pool recovers, others unaffected
    #   4. Verify no cross-pool state corruption
    :ok
  end

  test "concurrent pool crashes don't corrupt recovery state" do
    # Planned steps:
    #   1. Start multiple pools
    #   2. Crash multiple pools simultaneously
    #   3. Verify all pools recover correctly
    #   4. Verify no ETS corruption or race conditions
    :ok
  end
end
describe "State consistency after recovery" do
  # Every test in this group is a skipped placeholder for an integration test.
  @describetag :skip
  @describetag :integration

  test "Registry state matches recovered state" do
    # Planned checks after recovery:
    #   - unique_registry has correct map_ids for pool UUID
    #   - map_pool_registry has correct pool UUID entry
    #   - All map_ids in Registry match ETS state
    :ok
  end

  test "Cache state matches recovered state" do
    # Planned checks after recovery:
    #   - map_pool_cache has correct map_id -> uuid mappings
    #   - started_maps cache includes all recovered maps
    #   - No orphaned cache entries
    :ok
  end

  test "Map servers are actually running after recovery" do
    # Planned steps:
    #   1. Recover maps from crash
    #   2. Verify each map's GenServer is actually running
    #   3. Verify maps respond to requests
    #   4. Verify map state is correct
    :ok
  end
end
describe "Recovery failure handling" do
  # Every test in this group is a skipped placeholder for an integration test.
  @describetag :skip
  @describetag :integration

  test "recovery continues when individual map fails to start" do
    # Planned steps:
    #   1. Save state with maps [1, 2, 3] to ETS
    #   2. Delete map 2 from database
    #   3. Trigger recovery
    #   4. Verify maps 1 and 3 recover successfully
    #   5. Verify map 2 failure is logged and telemetry emitted
    #   6. Verify pool continues with maps [1, 3]
    :ok
  end

  test "recovery handles maps already running in different pool" do
    # Planned race-condition simulation:
    #   1. Pool A crashes with map X
    #   2. Before recovery, map X is started in Pool B
    #   3. Pool A tries to recover map X
    #   4. Verify conflict is detected and handled gracefully
    :ok
  end

  test "recovery handles corrupted ETS state" do
    # Planned steps:
    #   1. Manually corrupt ETS state (invalid map IDs, wrong types, etc.)
    #   2. Trigger recovery
    #   3. Verify pool handles corruption gracefully
    #   4. Verify telemetry emitted for failures
    #   5. Verify pool continues with valid maps only
    :ok
  end
end
describe "Telemetry during recovery" do
  # Checks that recovery telemetry events reach an attached handler in the
  # order they were emitted. The events are emitted by hand; no MapPool is
  # crashed and no ETS table is touched, so the test runs regardless of which
  # infrastructure is available.
  test "telemetry events emitted in correct order" do
    test_pid = self()
    uuid = "test-pool-#{:rand.uniform(1_000_000)}"

    # Unique per run so a handler left over from an earlier failure can never
    # make the attach below return `{:error, :already_exists}`.
    handler_id = "test-recovery-events-#{uuid}"

    start_event = [:wanderer_app, :map_pool, :recovery, :start]
    complete_event = [:wanderer_app, :map_pool, :recovery, :complete]
    map_failed_event = [:wanderer_app, :map_pool, :recovery, :map_failed]

    # Forward every recovery event to the test process.
    :ok =
      :telemetry.attach_many(
        handler_id,
        [start_event, complete_event, map_failed_event],
        fn event, measurements, metadata, _config ->
          send(test_pid, {:telemetry_event, event, measurements, metadata})
        end,
        nil
      )

    # Registered up front so the handler is detached even when an assertion
    # below fails.
    on_exit(fn -> :telemetry.detach(handler_id) end)

    # Simulate the recovery sequence.
    # 1. Start event
    :telemetry.execute(
      start_event,
      %{recovered_map_count: 3, total_map_count: 3},
      %{pool_uuid: uuid}
    )

    # 2. Complete event (in real recovery, this comes after all maps start)
    :telemetry.execute(
      complete_event,
      %{recovered_count: 3, failed_count: 0, duration_ms: 100},
      %{pool_uuid: uuid}
    )

    # The receive patterns accept ANY event name for this pool, so each
    # `assert_receive` takes the oldest matching message in the mailbox. That
    # is what makes the equality checks verify the ordering. Pinning the uuid
    # ignores events that belong to other pools.
    assert_receive {:telemetry_event, first_event, _, %{pool_uuid: ^uuid}}, 500
    assert first_event == start_event

    assert_receive {:telemetry_event, second_event, _, %{pool_uuid: ^uuid}}, 500
    assert second_event == complete_event
  end

  @tag :skip
  @tag :integration
  test "telemetry includes accurate recovery statistics" do
    # This test would verify that:
    # - recovered_map_count matches actual recovered maps
    # - failed_count matches actual failed maps
    # - duration_ms is accurate
    # - All metadata is correct
    :ok
  end
end
describe "Interaction with Reconciler" do
  # Every test in this group is a skipped placeholder for an integration test.
  @describetag :skip
  @describetag :integration

  test "Reconciler doesn't interfere with crash recovery" do
    # Planned steps:
    #   1. Crash a pool with maps
    #   2. Trigger both recovery and reconciliation
    #   3. Verify they don't conflict
    #   4. Verify final state is consistent
    :ok
  end

  test "Reconciler detects failed recovery" do
    # Planned steps:
    #   1. Crash a pool with map X
    #   2. Make recovery fail for map X
    #   3. Run reconciler
    #   4. Verify reconciler detects and potentially fixes the issue
    :ok
  end
end
describe "Edge cases" do
  # Every test in this group is a skipped placeholder for an integration test.
  @describetag :skip
  @describetag :integration

  test "recovery during pool at capacity" do
    # Planned steps:
    #   1. Create pool with 19 maps
    #   2. Crash pool while adding 20th map
    #   3. Verify recovery handles capacity limit
    #   4. Verify all maps start or overflow is handled
    :ok
  end

  test "recovery with empty map list" do
    # Planned steps:
    #   1. Crash pool with empty map_ids
    #   2. Verify recovery completes successfully
    #   3. Verify pool starts with no maps
    :ok
  end

  test "multiple crashes in quick succession" do
    # Planned steps:
    #   1. Crash pool
    #   2. Immediately crash again during recovery
    #   3. Verify supervisor's max_restarts is respected
    #   4. Verify state remains consistent
    :ok
  end
end
describe "Performance under load" do
  # Every test in this group is a slow, skipped placeholder for an
  # integration test.
  @describetag :slow
  @describetag :skip
  @describetag :integration

  test "recovery completes within 2 seconds for 20 maps" do
    # Planned steps:
    #   1. Create pool with 20 maps (pool limit)
    #   2. Crash pool
    #   3. Measure time to full recovery
    #   4. Assert recovery < 2 seconds
    :ok
  end

  test "recovery doesn't block other pools" do
    # Planned steps:
    #   1. Start multiple pools
    #   2. Crash one pool with many maps
    #   3. Verify other pools continue to operate normally during recovery
    #   4. Measure performance impact on healthy pools
    :ok
  end
end
describe "Supervisor interaction" do
  # Asserts that the process owning the pool-state ETS table is alive and
  # that pool state can be written to and read back from the table. Does
  # nothing when the table does not exist.
  test "ETS table survives individual pool crash", %{ets_exists: table_present?} do
    if table_present? do
      # Only liveness of the owner is asserted here, not its identity.
      table_owner = :ets.info(@ets_table, :owner)
      assert Process.alive?(table_owner)

      # Round-trip a pool state through the table to prove it is usable.
      pool_uuid = "test-pool-#{:rand.uniform(1_000_000)}"
      map_ids = [1, 2, 3]

      MapPoolState.save_pool_state(pool_uuid, map_ids)
      assert {:ok, ^map_ids} = MapPoolState.get_pool_state(pool_uuid)
    else
      :ok
    end
  end

  @tag :skip
  @tag :integration
  test "supervisor restarts pool after crash" do
    # Planned steps:
    #   1. Start a pool via DynamicSupervisor
    #   2. Crash the pool
    #   3. Verify supervisor restarts it
    #   4. Verify new PID is different from old PID
    #   5. Verify pool is functional after restart
    :ok
  end
end
describe "Database consistency" do
  # Every test in this group is a skipped placeholder for an integration test.
  @describetag :skip
  @describetag :integration

  test "recovered maps load latest state from database" do
    # Planned steps:
    #   1. Start maps with initial state
    #   2. Modify map state in database
    #   3. Crash pool
    #   4. Verify recovered maps have latest database state
    :ok
  end

  test "recovery uses MapState for map configuration" do
    # Planned steps:
    #   1. Verify recovery calls WandererApp.Map.get_map_state!/1
    #   2. Verify state comes from database MapState table
    #   3. Verify maps start with correct configuration
    :ok
  end
end
describe "Real-world scenarios" do
  # Every test in this group is a skipped placeholder for an integration test.
  @describetag :skip
  @describetag :integration

  test "recovery after OOM crash" do
    # Planned simulation of recovery after an out-of-memory crash:
    #   1. Start pool with maps
    #   2. Simulate OOM condition
    #   3. Verify recovery completes successfully
    #   4. Verify no memory leaks after recovery
    :ok
  end

  test "recovery after network partition" do
    # Planned simulation of recovery after network issues:
    #   1. Start maps with external dependencies
    #   2. Simulate network partition
    #   3. Crash pool
    #   4. Verify recovery handles network errors gracefully
    :ok
  end

  test "recovery preserves user sessions" do
    # Planned steps:
    #   1. Start maps with active user sessions
    #   2. Crash pool
    #   3. Verify users can continue after recovery
    #   4. Verify presence tracking works after recovery
    :ok
  end
end
end