fix(core): fix map start/stop logic; add server downtime period support

This commit is contained in:
Dmitry Popov
2025-11-13 15:48:56 +01:00
parent 843a6d7320
commit 0d4fd6f214
8 changed files with 1032 additions and 59 deletions
+1 -1
View File
@@ -8,7 +8,7 @@ defmodule WandererApp.Character.TrackerPool do
:tracked_ids,
:uuid,
:characters,
server_online: true
server_online: false
]
@name __MODULE__
@@ -12,7 +12,7 @@ defmodule WandererApp.Character.TransactionsTracker.Impl do
total_balance: 0,
transactions: [],
retries: 5,
server_online: true,
server_online: false,
status: :started
]
@@ -75,7 +75,7 @@ defmodule WandererApp.Character.TransactionsTracker.Impl do
def handle_event(
:update_corp_wallets,
%{character: character} = state
%{character: character, server_online: true} = state
) do
Process.send_after(self(), :update_corp_wallets, @update_interval)
@@ -88,26 +88,26 @@ defmodule WandererApp.Character.TransactionsTracker.Impl do
:update_corp_wallets,
state
) do
Process.send_after(self(), :update_corp_wallets, :timer.seconds(15))
Process.send_after(self(), :update_corp_wallets, @update_interval)
state
end
def handle_event(
:check_wallets,
%{wallets: []} = state
%{character: character, wallets: wallets, server_online: true} = state
) do
Process.send_after(self(), :check_wallets, :timer.seconds(5))
Process.send_after(self(), :check_wallets, @update_interval)
state
end
def handle_event(
:check_wallets,
%{character: character, wallets: wallets} = state
) do
check_wallets(wallets, character)
state
end
def handle_event(
:check_wallets,
state
) do
Process.send_after(self(), :check_wallets, @update_interval)
state
+258 -33
View File
@@ -64,11 +64,21 @@ defmodule WandererApp.Map.MapPool do
def handle_continue({:start, map_ids}, state) do
Logger.info("#{@name} started")
map_ids
|> Enum.each(fn map_id ->
GenServer.cast(self(), {:start_map, map_id})
end)
# Start maps synchronously and accumulate state changes
new_state =
map_ids
|> Enum.reduce(state, fn map_id, current_state ->
case do_start_map(map_id, current_state) do
{:ok, updated_state} ->
updated_state
{:error, reason} ->
Logger.error("[Map Pool] Failed to start map #{map_id}: #{reason}")
current_state
end
end)
# Schedule periodic tasks
Process.send_after(self(), :backup_state, @backup_state_timeout)
Process.send_after(self(), :cleanup_systems, 15_000)
Process.send_after(self(), :cleanup_characters, @characters_cleanup_timeout)
@@ -77,28 +87,21 @@ defmodule WandererApp.Map.MapPool do
# Start message queue monitoring
Process.send_after(self(), :monitor_message_queue, :timer.seconds(30))
{:noreply, state}
{:noreply, new_state}
end
@impl true
def handle_cast(:stop, state), do: {:stop, :normal, state}
@impl true
def handle_cast({:start_map, map_id}, %{map_ids: map_ids, uuid: uuid} = state) do
if map_id not in map_ids do
Registry.update_value(@unique_registry, Module.concat(__MODULE__, uuid), fn r_map_ids ->
[map_id | r_map_ids]
end)
def handle_cast({:start_map, map_id}, state) do
case do_start_map(map_id, state) do
{:ok, new_state} ->
{:noreply, new_state}
Cachex.put(@cache, map_id, uuid)
map_id
|> WandererApp.Map.get_map_state!()
|> Server.Impl.start_map()
{:noreply, %{state | map_ids: [map_id | map_ids]}}
else
{:noreply, state}
{:error, _reason} ->
# Error already logged in do_start_map
{:noreply, state}
end
end
@@ -119,6 +122,215 @@ defmodule WandererApp.Map.MapPool do
{:noreply, %{state | map_ids: map_ids |> Enum.reject(fn id -> id == map_id end)}}
end
# Synchronously stops a map in this pool.
# Replies `:ok` with the updated state on success; on failure replies
# `{:error, reason}` and keeps the previous state untouched.
@impl true
def handle_call({:stop_map, map_id}, _from, state) do
  with {:ok, new_state} <- do_stop_map(map_id, state) do
    {:reply, :ok, new_state}
  else
    {:error, reason} -> {:reply, {:error, reason}, state}
  end
end
# Starts a map in this pool: registry entry, cache entry, map server, then
# GenServer state. Returns {:ok, new_state} or {:error, reason}.
#
# Each successful step is recorded so that a failure part-way through rolls
# back exactly the operations that completed. NOTE: the previous version
# accumulated the completed-steps list inside a `try` block, but variables
# rebound inside `try` are not visible in `rescue`, so the rollback always
# received an empty list (and the `:cache` marker was additionally lost to
# `case`-clause scoping, and `:registry` was marked complete before its
# result was checked). Threading the accumulator through `reduce_while`
# avoids all three scoping pitfalls.
defp do_start_map(map_id, %{map_ids: map_ids, uuid: uuid} = state) do
  if map_id in map_ids do
    # Map already started in this pool - idempotent no-op.
    {:ok, state}
  else
    result =
      Enum.reduce_while([:registry, :cache, :map_server], [], fn step, completed ->
        case run_start_step(step, map_id, uuid) do
          :ok -> {:cont, [step | completed]}
          {:error, reason} -> {:halt, {:error, reason, completed}}
        end
      end)

    case result do
      {:error, reason, completed_operations} ->
        Logger.error(
          "[Map Pool] Failed to start map #{map_id} (completed: #{inspect(completed_operations)}): #{reason}"
        )

        # Undo only the steps that actually succeeded, in reverse order.
        rollback_start_map_operations(map_id, uuid, completed_operations)
        {:error, reason}

      completed when is_list(completed) ->
        # Step 4: update GenServer state (last, as this is in-memory and fast)
        Logger.debug("[Map Pool] Successfully started map #{map_id} in pool #{uuid}")
        {:ok, %{state | map_ids: [map_id | map_ids]}}
    end
  end
end

# Step 1: register the map id under this pool's unique registry entry
# (most critical, done first). Raises are normalized to {:error, message}.
defp run_start_step(:registry, map_id, uuid) do
  case Registry.update_value(@unique_registry, Module.concat(__MODULE__, uuid), fn r_map_ids ->
         [map_id | r_map_ids]
       end) do
    {new_value, _old_value} when is_list(new_value) -> :ok
    :error -> {:error, "Failed to update registry for pool #{uuid}"}
  end
rescue
  e -> {:error, Exception.message(e)}
end

# Step 2: record map_id -> pool uuid so later lookups route to this pool.
defp run_start_step(:cache, map_id, uuid) do
  case Cachex.put(@cache, map_id, uuid) do
    {:ok, _} -> :ok
    {:error, reason} -> {:error, "Failed to add to cache: #{inspect(reason)}"}
  end
rescue
  e -> {:error, Exception.message(e)}
end

# Step 3: boot the map server from its persisted state; a raise here is
# converted into an error tuple so the caller can roll back steps 1-2.
defp run_start_step(:map_server, map_id, _uuid) do
  map_id
  |> WandererApp.Map.get_map_state!()
  |> Server.Impl.start_map()

  :ok
rescue
  e -> {:error, Exception.message(e)}
end
# Best-effort rollback for a partially-completed do_start_map/2.
# `completed_operations` lists the step tags that succeeded; each is undone
# in reverse order (map server first, then cache, then registry). Every
# rollback step is isolated - a failure in one is logged and does not
# prevent the remaining steps from running.
defp rollback_start_map_operations(map_id, uuid, completed_operations) do
  Logger.warning("[Map Pool] Attempting to rollback start_map operations for #{map_id}")
  # Rollback in reverse order
  if :map_server in completed_operations do
    Logger.debug("[Map Pool] Rollback: Stopping map server for #{map_id}")
    # stop_map may raise; swallow and log so the other rollbacks still run.
    try do
      Server.Impl.stop_map(map_id)
    rescue
      e ->
        Logger.error("[Map Pool] Rollback failed to stop map server: #{Exception.message(e)}")
    end
  end
  if :cache in completed_operations do
    Logger.debug("[Map Pool] Rollback: Removing #{map_id} from cache")
    case Cachex.del(@cache, map_id) do
      {:ok, _} ->
        :ok
      {:error, reason} ->
        Logger.error("[Map Pool] Rollback failed for cache: #{inspect(reason)}")
    end
  end
  if :registry in completed_operations do
    Logger.debug("[Map Pool] Rollback: Removing #{map_id} from registry")
    try do
      # Drop map_id from this pool's registered list again.
      Registry.update_value(@unique_registry, Module.concat(__MODULE__, uuid), fn r_map_ids ->
        r_map_ids |> Enum.reject(fn id -> id == map_id end)
      end)
    rescue
      e ->
        Logger.error("[Map Pool] Rollback failed for registry: #{Exception.message(e)}")
    end
  end
end
# Stops a map in this pool: remove registry entry, remove cache entry, stop
# the map server, then update GenServer state. Returns {:ok, new_state} or
# {:error, reason}.
#
# Mirrors do_start_map/2. NOTE: the previous version accumulated the
# completed-steps list inside a `try` block, but variables rebound inside
# `try` are not visible in `rescue`, so the rollback always received an
# empty list (and the `:cache` marker was lost to `case`-clause scoping).
# Threading the accumulator through `reduce_while` fixes both.
defp do_stop_map(map_id, %{map_ids: map_ids, uuid: uuid} = state) do
  result =
    Enum.reduce_while([:registry, :cache, :map_server], [], fn step, completed ->
      case run_stop_step(step, map_id, uuid) do
        :ok -> {:cont, [step | completed]}
        {:error, reason} -> {:halt, {:error, reason, completed}}
      end
    end)

  case result do
    {:error, reason, completed_operations} ->
      Logger.error(
        "[Map Pool] Failed to stop map #{map_id} (completed: #{inspect(completed_operations)}): #{reason}"
      )

      # Undo only the steps that actually succeeded, in reverse order.
      rollback_stop_map_operations(map_id, uuid, completed_operations)
      {:error, reason}

    completed when is_list(completed) ->
      # Step 4: update GenServer state (last, as this is in-memory and fast)
      Logger.debug("[Map Pool] Successfully stopped map #{map_id} from pool #{uuid}")
      {:ok, %{state | map_ids: map_ids |> Enum.reject(fn id -> id == map_id end)}}
  end
end

# Step 1: remove the map id from this pool's unique registry entry
# (most critical, done first). Raises are normalized to {:error, message}.
defp run_stop_step(:registry, map_id, uuid) do
  case Registry.update_value(@unique_registry, Module.concat(__MODULE__, uuid), fn r_map_ids ->
         r_map_ids |> Enum.reject(fn id -> id == map_id end)
       end) do
    {new_value, _old_value} when is_list(new_value) -> :ok
    :error -> {:error, "Failed to update registry for pool #{uuid}"}
  end
rescue
  e -> {:error, Exception.message(e)}
end

# Step 2: drop the map_id -> pool uuid cache entry.
defp run_stop_step(:cache, map_id, _uuid) do
  case Cachex.del(@cache, map_id) do
    {:ok, _} -> :ok
    {:error, reason} -> {:error, "Failed to delete from cache: #{inspect(reason)}"}
  end
rescue
  e -> {:error, Exception.message(e)}
end

# Step 3: stop the map server (cleans up all map resources); a raise is
# converted into an error tuple so the caller can roll back steps 1-2.
defp run_stop_step(:map_server, map_id, _uuid) do
  map_id
  |> Server.Impl.stop_map()

  :ok
rescue
  e -> {:error, Exception.message(e)}
end
# Best-effort rollback for a partially-completed do_stop_map/2.
# Re-adds the cache and registry entries that were removed; each rollback
# step is isolated so a failure in one is logged and the others still run.
defp rollback_stop_map_operations(map_id, uuid, completed_operations) do
  Logger.warning("[Map Pool] Attempting to rollback stop_map operations for #{map_id}")
  # Rollback in reverse order
  if :cache in completed_operations do
    Logger.debug("[Map Pool] Rollback: Re-adding #{map_id} to cache")
    case Cachex.put(@cache, map_id, uuid) do
      {:ok, _} ->
        :ok
      {:error, reason} ->
        Logger.error("[Map Pool] Rollback failed for cache: #{inspect(reason)}")
    end
  end
  if :registry in completed_operations do
    Logger.debug("[Map Pool] Rollback: Re-adding #{map_id} to registry")
    try do
      # Guard against double-insertion if the id is somehow already present.
      Registry.update_value(@unique_registry, Module.concat(__MODULE__, uuid), fn r_map_ids ->
        if map_id in r_map_ids do
          r_map_ids
        else
          [map_id | r_map_ids]
        end
      end)
    rescue
      e ->
        Logger.error("[Map Pool] Rollback failed for registry: #{Exception.message(e)}")
    end
  end
  # Note: We don't rollback map_server stop as Server.Impl.stop_map() is idempotent
  # and the cleanup operations are safe to leave in a "stopped" state
end
@impl true
def handle_call(:error, _, state), do: {:stop, :error, :ok, state}
@@ -231,25 +443,38 @@ defmodule WandererApp.Map.MapPool do
Process.send_after(self(), :garbage_collect, @garbage_collection_interval)
try do
map_ids
|> Enum.each(fn map_id ->
# presence_character_ids =
# WandererApp.Cache.lookup!("map_#{map_id}:presence_character_ids", [])
# Process each map and accumulate state changes
new_state =
map_ids
|> Enum.reduce(state, fn map_id, current_state ->
presence_character_ids =
WandererApp.Cache.lookup!("map_#{map_id}:presence_character_ids", [])
# if presence_character_ids |> Enum.empty?() do
Logger.info(
"#{uuid}: No more characters present on: #{map_id}, shutting down map server..."
)
if presence_character_ids |> Enum.empty?() do
Logger.info(
"#{uuid}: No more characters present on: #{map_id}, shutting down map server..."
)
GenServer.cast(self(), {:stop_map, map_id})
# end
end)
case do_stop_map(map_id, current_state) do
{:ok, updated_state} ->
Logger.debug("#{uuid}: Successfully stopped map #{map_id}")
updated_state
{:error, reason} ->
Logger.error("#{uuid}: Failed to stop map #{map_id}: #{reason}")
current_state
end
else
current_state
end
end)
{:noreply, new_state}
rescue
e ->
Logger.error(Exception.message(e))
Logger.error("#{uuid}: Garbage collection error: #{Exception.message(e)}")
{:noreply, state}
end
{:noreply, state}
end
@impl true
@@ -36,17 +36,78 @@ defmodule WandererApp.Map.MapPoolDynamicSupervisor do
end
def stop_map(map_id) do
{:ok, pool_uuid} = Cachex.get(@cache, map_id)
case Cachex.get(@cache, map_id) do
{:ok, nil} ->
# Cache miss - try to find the pool by scanning the registry
Logger.warning(
"Cache miss for map #{map_id}, scanning registry for pool containing this map"
)
case Registry.lookup(
@unique_registry,
Module.concat(WandererApp.Map.MapPool, pool_uuid)
) do
find_pool_by_scanning_registry(map_id)
{:ok, pool_uuid} ->
# Cache hit - use the pool_uuid to lookup the pool
case Registry.lookup(
@unique_registry,
Module.concat(WandererApp.Map.MapPool, pool_uuid)
) do
[] ->
Logger.warning(
"Pool with UUID #{pool_uuid} not found in registry for map #{map_id}, scanning registry"
)
find_pool_by_scanning_registry(map_id)
[{pool_pid, _}] ->
GenServer.cast(pool_pid, {:stop_map, map_id})
end
{:error, reason} ->
Logger.error("Failed to lookup map #{map_id} in cache: #{inspect(reason)}")
:ok
end
end
# Fallback lookup used when the map_id -> pool uuid cache is missing or
# stale: scan the duplicate pool registry, check each pool's unique-registry
# entry for membership, repair the cache, and cast :stop_map to the owner.
# Returns :ok when nothing is found; otherwise the result of the cast.
defp find_pool_by_scanning_registry(map_id) do
  case Registry.lookup(@registry, WandererApp.Map.MapPool) do
    [] ->
      Logger.debug("No map pools found in registry for map #{map_id}")
      :ok
    [{pool_pid, _}] ->
      # Exactly one pool running: cast directly without a membership check.
      # NOTE(review): this assumes a single pool must own the map - confirm
      # the pool's handle_cast tolerates a map_id it does not track.
      GenServer.cast(pool_pid, {:stop_map, map_id})
    pools ->
      # Scan all pools to find the one containing this map_id
      found_pool =
        Enum.find_value(pools, fn {_pid, uuid} ->
          case Registry.lookup(
                 @unique_registry,
                 Module.concat(WandererApp.Map.MapPool, uuid)
               ) do
            [{pool_pid, map_ids}] ->
              if map_id in map_ids do
                {pool_pid, uuid}
              else
                nil
              end
            _ ->
              nil
          end
        end)
      case found_pool do
        {pool_pid, pool_uuid} ->
          Logger.info(
            "Found map #{map_id} in pool #{pool_uuid} via registry scan, updating cache"
          )
          # Update the cache to fix the inconsistency
          Cachex.put(@cache, map_id, pool_uuid)
          GenServer.cast(pool_pid, {:stop_map, map_id})
        nil ->
          Logger.debug("Map #{map_id} not found in any pool registry")
          :ok
      end
  end
end
+2 -1
View File
@@ -14,7 +14,8 @@ defmodule WandererApp.Map.MapPoolSupervisor do
children = [
{Registry, [keys: :unique, name: @unique_registry]},
{Registry, [keys: :duplicate, name: @registry]},
{WandererApp.Map.MapPoolDynamicSupervisor, []}
{WandererApp.Map.MapPoolDynamicSupervisor, []},
{WandererApp.Map.Reconciler, []}
]
Supervisor.init(children, strategy: :rest_for_one, max_restarts: 10)
+280
View File
@@ -0,0 +1,280 @@
defmodule WandererApp.Map.Reconciler do
  @moduledoc """
  Periodically reconciles map state across different stores (Cache, Registry, GenServer state)
  to detect and fix inconsistencies that may prevent map servers from restarting.
  """
  use GenServer
  require Logger

  # Cache/registry names shared with WandererApp.Map.MapPool and its
  # dynamic supervisor - must stay in sync with those modules.
  @cache :map_pool_cache
  @registry :map_pool_registry
  @unique_registry :unique_map_pool_registry
  @reconciliation_interval :timer.minutes(5)

  def start_link(_opts) do
    GenServer.start_link(__MODULE__, [], name: __MODULE__)
  end

  @impl true
  def init(_opts) do
    Logger.info("Starting Map Reconciler")
    schedule_reconciliation()
    # No state is carried between reconciliation runs.
    {:ok, %{}}
  end

  # Periodic tick: reschedule first so a crash inside reconcile_state/0
  # never stops the cycle; errors are logged and swallowed.
  @impl true
  def handle_info(:reconcile, state) do
    schedule_reconciliation()
    try do
      reconcile_state()
    rescue
      e ->
        Logger.error("""
        [Map Reconciler] reconciliation error: #{Exception.message(e)}
        #{Exception.format_stacktrace(__STACKTRACE__)}
        """)
    end
    {:noreply, state}
  end

  @doc """
  Manually trigger a reconciliation (useful for testing or manual cleanup)
  """
  def trigger_reconciliation do
    GenServer.cast(__MODULE__, :reconcile_now)
  end

  # Same work as the periodic tick, but triggered on demand and without
  # rescheduling the timer.
  @impl true
  def handle_cast(:reconcile_now, state) do
    try do
      reconcile_state()
    rescue
      e ->
        Logger.error("""
        [Map Reconciler] manual reconciliation error: #{Exception.message(e)}
        #{Exception.format_stacktrace(__STACKTRACE__)}
        """)
    end
    {:noreply, state}
  end

  defp schedule_reconciliation do
    Process.send_after(self(), :reconcile, @reconciliation_interval)
  end

  # Core pass: diff the "started_maps" cache against the pool registries,
  # log/emit stats, then repair the three classes of drift it can detect.
  defp reconcile_state do
    Logger.debug("[Map Reconciler] Starting state reconciliation")
    # Get started_maps from cache
    {:ok, started_maps} = WandererApp.Cache.lookup("started_maps", [])
    # Get all maps from registries
    registry_maps = get_all_registry_maps()
    # Detect zombie maps (in started_maps but not in any registry)
    zombie_maps = started_maps -- registry_maps
    # Detect orphan maps (in registry but not in started_maps)
    orphan_maps = registry_maps -- started_maps
    # Detect cache inconsistencies (map_pool_cache pointing to wrong or non-existent pools)
    cache_inconsistencies = find_cache_inconsistencies(registry_maps)
    stats = %{
      total_started_maps: length(started_maps),
      total_registry_maps: length(registry_maps),
      zombie_maps: length(zombie_maps),
      orphan_maps: length(orphan_maps),
      cache_inconsistencies: length(cache_inconsistencies)
    }
    Logger.info("[Map Reconciler] Reconciliation stats: #{inspect(stats)}")
    # Emit telemetry
    :telemetry.execute(
      [:wanderer_app, :map, :reconciliation],
      stats,
      %{}
    )
    # Clean up zombie maps
    cleanup_zombie_maps(zombie_maps)
    # Fix orphan maps
    fix_orphan_maps(orphan_maps)
    # Fix cache inconsistencies
    fix_cache_inconsistencies(cache_inconsistencies)
    Logger.debug("[Map Reconciler] State reconciliation completed")
  end

  # Collects the union of map_ids tracked by every pool's unique-registry
  # entry. Pools whose unique entry is missing contribute nothing.
  defp get_all_registry_maps do
    case Registry.lookup(@registry, WandererApp.Map.MapPool) do
      [] ->
        []
      pools ->
        pools
        |> Enum.flat_map(fn {_pid, uuid} ->
          case Registry.lookup(
                 @unique_registry,
                 Module.concat(WandererApp.Map.MapPool, uuid)
               ) do
            [{_pool_pid, map_ids}] -> map_ids
            _ -> []
          end
        end)
        |> Enum.uniq()
    end
  end

  # Returns the subset of registry_maps whose map_pool_cache entry is
  # missing, points at a pool that no longer exists, or points at a pool
  # that does not actually track the map. Cache read errors also count.
  defp find_cache_inconsistencies(registry_maps) do
    registry_maps
    |> Enum.filter(fn map_id ->
      case Cachex.get(@cache, map_id) do
        {:ok, nil} ->
          # Map in registry but not in cache
          true
        {:ok, pool_uuid} ->
          # Check if the pool_uuid actually exists in registry
          case Registry.lookup(
                 @unique_registry,
                 Module.concat(WandererApp.Map.MapPool, pool_uuid)
               ) do
            [] ->
              # Cache points to non-existent pool
              true
            [{_pool_pid, map_ids}] ->
              # Check if this map is actually in the pool's map_ids
              map_id not in map_ids
            _ ->
              false
          end
        {:error, _} ->
          true
      end
    end)
  end

  defp cleanup_zombie_maps([]), do: :ok

  # Removes every trace of maps listed in "started_maps" that no pool
  # tracks: the started_maps entry, pool cache entry, per-map caches,
  # rtree, and persisted map state. Emits one telemetry event per map.
  defp cleanup_zombie_maps(zombie_maps) do
    Logger.warning("[Map Reconciler] Found #{length(zombie_maps)} zombie maps: #{inspect(zombie_maps)}")
    Enum.each(zombie_maps, fn map_id ->
      Logger.info("[Map Reconciler] Cleaning up zombie map: #{map_id}")
      # Remove from started_maps cache
      WandererApp.Cache.insert_or_update(
        "started_maps",
        [],
        fn started_maps ->
          started_maps |> Enum.reject(fn started_map_id -> started_map_id == map_id end)
        end
      )
      # Clean up any stale map_pool_cache entries
      Cachex.del(@cache, map_id)
      # Clean up map-specific caches
      WandererApp.Cache.delete("map_#{map_id}:started")
      WandererApp.Cache.delete("map_characters-#{map_id}")
      WandererApp.Map.CacheRTree.clear_tree("rtree_#{map_id}")
      WandererApp.Map.delete_map_state(map_id)
      :telemetry.execute(
        [:wanderer_app, :map, :reconciliation, :zombie_cleanup],
        %{count: 1},
        %{map_id: map_id}
      )
    end)
  end

  defp fix_orphan_maps([]), do: :ok

  # Maps tracked by a pool registry but absent from "started_maps" are
  # re-added to the cache so the rest of the app sees them as running.
  defp fix_orphan_maps(orphan_maps) do
    Logger.warning("[Map Reconciler] Found #{length(orphan_maps)} orphan maps: #{inspect(orphan_maps)}")
    Enum.each(orphan_maps, fn map_id ->
      Logger.info("[Map Reconciler] Fixing orphan map: #{map_id}")
      # Add to started_maps cache
      WandererApp.Cache.insert_or_update(
        "started_maps",
        [map_id],
        fn existing ->
          [map_id | existing] |> Enum.uniq()
        end
      )
      :telemetry.execute(
        [:wanderer_app, :map, :reconciliation, :orphan_fixed],
        %{count: 1},
        %{map_id: map_id}
      )
    end)
  end

  defp fix_cache_inconsistencies([]), do: :ok

  # Repairs stale map_pool_cache entries: re-points each map at the pool
  # that actually tracks it, or deletes the entry if no pool does.
  defp fix_cache_inconsistencies(inconsistent_maps) do
    Logger.warning(
      "[Map Reconciler] Found #{length(inconsistent_maps)} cache inconsistencies: #{inspect(inconsistent_maps)}"
    )
    Enum.each(inconsistent_maps, fn map_id ->
      Logger.info("[Map Reconciler] Fixing cache inconsistency for map: #{map_id}")
      # Find the correct pool for this map
      case find_pool_for_map(map_id) do
        {:ok, pool_uuid} ->
          Logger.info("[Map Reconciler] Updating cache: #{map_id} -> #{pool_uuid}")
          Cachex.put(@cache, map_id, pool_uuid)
          :telemetry.execute(
            [:wanderer_app, :map, :reconciliation, :cache_fixed],
            %{count: 1},
            %{map_id: map_id, pool_uuid: pool_uuid}
          )
        :error ->
          Logger.warning("[Map Reconciler] Could not find pool for map #{map_id}, removing from cache")
          Cachex.del(@cache, map_id)
      end
    end)
  end

  # Scans all pools for the one whose unique-registry entry contains
  # map_id. Returns {:ok, pool_uuid} or :error when no pool tracks it.
  defp find_pool_for_map(map_id) do
    case Registry.lookup(@registry, WandererApp.Map.MapPool) do
      [] ->
        :error
      pools ->
        pools
        |> Enum.find_value(:error, fn {_pid, uuid} ->
          case Registry.lookup(
                 @unique_registry,
                 Module.concat(WandererApp.Map.MapPool, uuid)
               ) do
            [{_pool_pid, map_ids}] ->
              if map_id in map_ids do
                {:ok, uuid}
              else
                nil
              end
            _ ->
              nil
          end
        end)
    end
  end
end
@@ -11,7 +11,9 @@ defmodule WandererApp.Server.ServerStatusTracker do
:server_version,
:start_time,
:vip,
:retries
:retries,
:in_forced_downtime,
:downtime_notified
]
@retries_count 3
@@ -21,9 +23,17 @@ defmodule WandererApp.Server.ServerStatusTracker do
retries: @retries_count,
server_version: "0",
start_time: "0",
vip: true
vip: true,
in_forced_downtime: false,
downtime_notified: false
}
# EVE Online daily downtime period (UTC/GMT)
@downtime_start_hour 10
@downtime_start_minute 58
@downtime_end_hour 11
@downtime_end_minute 2
@refresh_interval :timer.minutes(1)
@logger Application.compile_env(:wanderer_app, :logger)
@@ -57,13 +67,51 @@ defmodule WandererApp.Server.ServerStatusTracker do
def handle_info(
:refresh_status,
%{
retries: retries
retries: retries,
in_forced_downtime: was_in_downtime
} = state
) do
Process.send_after(self(), :refresh_status, @refresh_interval)
Task.async(fn -> get_server_status(retries) end)
{:noreply, state}
in_downtime = in_forced_downtime?()
cond do
# Entering downtime period - broadcast offline status immediately
in_downtime and not was_in_downtime ->
@logger.info("#{__MODULE__} entering forced downtime period (10:58-11:02 GMT)")
downtime_status = %{
players: 0,
server_version: "downtime",
start_time: DateTime.utc_now() |> DateTime.to_iso8601(),
vip: true
}
Phoenix.PubSub.broadcast(
WandererApp.PubSub,
"server_status",
{:server_status, downtime_status}
)
{:noreply,
%{state | in_forced_downtime: true, downtime_notified: true}
|> Map.merge(downtime_status)}
# Currently in downtime - skip API call
in_downtime ->
{:noreply, state}
# Exiting downtime period - resume normal operations
not in_downtime and was_in_downtime ->
@logger.info("#{__MODULE__} exiting forced downtime period, resuming normal operations")
Task.async(fn -> get_server_status(retries) end)
{:noreply, %{state | in_forced_downtime: false, downtime_notified: false}}
# Normal operation
true ->
Task.async(fn -> get_server_status(retries) end)
{:noreply, state}
end
end
@impl true
@@ -155,4 +203,19 @@ defmodule WandererApp.Server.ServerStatusTracker do
vip: false
}
end
# Checks if the current UTC time falls within the forced downtime period (10:58-11:02 GMT).
# The window is half-open: the start minute is included, the end minute is not.
defp in_forced_downtime? do
  %DateTime{hour: hour, minute: minute} = DateTime.utc_now()

  # Compare everything as minutes since midnight.
  minutes_now = hour * 60 + minute
  window_start = @downtime_start_hour * 60 + @downtime_start_minute
  window_end = @downtime_end_hour * 60 + @downtime_end_minute

  minutes_now in window_start..(window_end - 1)
end
end
+343
View File
@@ -0,0 +1,343 @@
defmodule WandererApp.Map.MapPoolTest do
  # async: false - these tests mutate shared global state (named caches,
  # registries, and the singleton Reconciler process).
  use ExUnit.Case, async: false

  alias WandererApp.Map.{MapPool, MapPoolDynamicSupervisor, Reconciler}

  # Must match the names used by MapPool / MapPoolDynamicSupervisor / Reconciler.
  @cache :map_pool_cache
  @registry :map_pool_registry
  @unique_registry :unique_map_pool_registry

  setup do
    # Clean up any existing test data
    cleanup_test_data()
    # Check if required infrastructure is running
    # (tests conditionally no-op when the registries / Reconciler are absent)
    registries_running? =
      try do
        Registry.keys(@registry, self()) != :error
      rescue
        _ -> false
      end
    reconciler_running? = Process.whereis(Reconciler) != nil
    on_exit(fn ->
      cleanup_test_data()
    end)
    {:ok, registries_running: registries_running?, reconciler_running: reconciler_running?}
  end

  # Resets the shared caches touched by these tests.
  defp cleanup_test_data do
    # Clean up test caches
    WandererApp.Cache.delete("started_maps")
    Cachex.clear(@cache)
  end

  describe "garbage collection with synchronous stop" do
    @tag :skip
    test "garbage collector successfully stops map with synchronous call" do
      # This test would require setting up a full map pool with a test map
      # Skipping for now as it requires more complex setup with actual map data
      :ok
    end
    @tag :skip
    test "garbage collector handles stop failures gracefully" do
      # This test would verify error handling when stop fails
      :ok
    end
  end

  describe "cache lookup with registry fallback" do
    test "stop_map handles cache miss by scanning registry", %{registries_running: registries_running?} do
      if registries_running? do
        # Setup: Create a map_id that's not in cache but will be found in registry scan
        map_id = "test_map_#{:rand.uniform(1_000_000)}"
        # Verify cache is empty for this map
        assert {:ok, nil} = Cachex.get(@cache, map_id)
        # Call stop_map - should handle gracefully with fallback
        assert :ok = MapPoolDynamicSupervisor.stop_map(map_id)
      else
        # Skip test if registries not running
        :ok
      end
    end
    test "stop_map handles non-existent pool_uuid in registry", %{registries_running: registries_running?} do
      if registries_running? do
        map_id = "test_map_#{:rand.uniform(1_000_000)}"
        fake_uuid = "fake_uuid_#{:rand.uniform(1_000_000)}"
        # Put fake uuid in cache that doesn't exist in registry
        Cachex.put(@cache, map_id, fake_uuid)
        # Call stop_map - should handle gracefully with fallback
        assert :ok = MapPoolDynamicSupervisor.stop_map(map_id)
      else
        :ok
      end
    end
    test "stop_map updates cache when found via registry scan", %{registries_running: registries_running?} do
      if registries_running? do
        # This test would require a running pool with registered maps
        # For now, we verify the fallback logic doesn't crash
        map_id = "test_map_#{:rand.uniform(1_000_000)}"
        assert :ok = MapPoolDynamicSupervisor.stop_map(map_id)
      else
        :ok
      end
    end
  end

  describe "state cleanup atomicity" do
    @tag :skip
    test "rollback occurs when registry update fails" do
      # This would require mocking Registry.update_value to fail
      # Skipping for now as it requires more complex mocking setup
      :ok
    end
    @tag :skip
    test "rollback occurs when cache delete fails" do
      # This would require mocking Cachex.del to fail
      :ok
    end
    @tag :skip
    test "successful cleanup updates all three state stores" do
      # This would verify Registry, Cache, and GenServer state are all updated
      :ok
    end
  end

  describe "Reconciler - zombie map detection and cleanup" do
    test "reconciler detects zombie maps in started_maps cache", %{reconciler_running: reconciler_running?} do
      if reconciler_running? do
        # Setup: Add maps to started_maps that aren't in any registry
        zombie_map_id = "zombie_map_#{:rand.uniform(1_000_000)}"
        WandererApp.Cache.insert_or_update(
          "started_maps",
          [zombie_map_id],
          fn existing -> [zombie_map_id | existing] |> Enum.uniq() end
        )
        # Get started_maps
        {:ok, started_maps} = WandererApp.Cache.lookup("started_maps", [])
        assert zombie_map_id in started_maps
        # Trigger reconciliation
        send(Reconciler, :reconcile)
        # Give it time to process
        # NOTE(review): fixed sleep is a potential flake under load
        Process.sleep(200)
        # Verify zombie was cleaned up
        {:ok, started_maps_after} = WandererApp.Cache.lookup("started_maps", [])
        refute zombie_map_id in started_maps_after
      else
        :ok
      end
    end
    test "reconciler cleans up zombie map caches", %{reconciler_running: reconciler_running?} do
      if reconciler_running? do
        zombie_map_id = "zombie_map_#{:rand.uniform(1_000_000)}"
        # Setup zombie state
        WandererApp.Cache.insert_or_update(
          "started_maps",
          [zombie_map_id],
          fn existing -> [zombie_map_id | existing] |> Enum.uniq() end
        )
        WandererApp.Cache.insert("map_#{zombie_map_id}:started", true)
        Cachex.put(@cache, zombie_map_id, "fake_uuid")
        # Trigger reconciliation
        send(Reconciler, :reconcile)
        Process.sleep(200)
        # Verify all caches cleaned
        {:ok, started_maps} = WandererApp.Cache.lookup("started_maps", [])
        refute zombie_map_id in started_maps
        {:ok, cache_entry} = Cachex.get(@cache, zombie_map_id)
        assert cache_entry == nil
      else
        :ok
      end
    end
  end

  describe "Reconciler - orphan map detection and fix" do
    @tag :skip
    test "reconciler detects orphan maps in registry" do
      # This would require setting up a pool with maps in registry
      # but not in started_maps cache
      :ok
    end
    @tag :skip
    test "reconciler adds orphan maps to started_maps cache" do
      # This would verify orphan maps get added to the cache
      :ok
    end
  end

  describe "Reconciler - cache inconsistency detection and fix" do
    test "reconciler detects map with missing cache entry", %{reconciler_running: reconciler_running?} do
      if reconciler_running? do
        # This test verifies the reconciler can detect when a map
        # is in the registry but has no cache entry
        # Since we can't easily set up a full pool, we test the detection logic
        map_id = "test_map_#{:rand.uniform(1_000_000)}"
        # Ensure no cache entry
        Cachex.del(@cache, map_id)
        # The reconciler would detect this if the map was in a registry
        # For now, we just verify the logic doesn't crash
        send(Reconciler, :reconcile)
        Process.sleep(200)
        # No assertions needed - just verifying no crashes
      end
    end
    test "reconciler detects cache pointing to non-existent pool", %{reconciler_running: reconciler_running?} do
      if reconciler_running? do
        map_id = "test_map_#{:rand.uniform(1_000_000)}"
        fake_uuid = "fake_uuid_#{:rand.uniform(1_000_000)}"
        # Put fake uuid in cache
        Cachex.put(@cache, map_id, fake_uuid)
        # Trigger reconciliation
        send(Reconciler, :reconcile)
        Process.sleep(200)
        # Cache entry should be removed since pool doesn't exist
        {:ok, cache_entry} = Cachex.get(@cache, map_id)
        assert cache_entry == nil
      else
        :ok
      end
    end
  end

  describe "Reconciler - stats and telemetry" do
    test "reconciler emits telemetry events", %{reconciler_running: reconciler_running?} do
      if reconciler_running? do
        # Setup telemetry handler
        test_pid = self()
        :telemetry.attach(
          "test-reconciliation",
          [:wanderer_app, :map, :reconciliation],
          fn _event, measurements, _metadata, _config ->
            send(test_pid, {:telemetry, measurements})
          end,
          nil
        )
        # Trigger reconciliation
        send(Reconciler, :reconcile)
        Process.sleep(200)
        # Should receive telemetry event
        assert_receive {:telemetry, measurements}, 500
        assert is_integer(measurements.total_started_maps)
        assert is_integer(measurements.total_registry_maps)
        assert is_integer(measurements.zombie_maps)
        assert is_integer(measurements.orphan_maps)
        assert is_integer(measurements.cache_inconsistencies)
        # Cleanup
        :telemetry.detach("test-reconciliation")
      else
        :ok
      end
    end
  end

  describe "Reconciler - manual trigger" do
    test "trigger_reconciliation runs reconciliation immediately", %{reconciler_running: reconciler_running?} do
      if reconciler_running? do
        zombie_map_id = "zombie_map_#{:rand.uniform(1_000_000)}"
        # Setup zombie state
        WandererApp.Cache.insert_or_update(
          "started_maps",
          [zombie_map_id],
          fn existing -> [zombie_map_id | existing] |> Enum.uniq() end
        )
        # Verify it exists
        {:ok, started_maps_before} = WandererApp.Cache.lookup("started_maps", [])
        assert zombie_map_id in started_maps_before
        # Trigger manual reconciliation
        Reconciler.trigger_reconciliation()
        Process.sleep(200)
        # Verify zombie was cleaned up
        {:ok, started_maps_after} = WandererApp.Cache.lookup("started_maps", [])
        refute zombie_map_id in started_maps_after
      else
        :ok
      end
    end
  end

  describe "edge cases and error handling" do
    test "stop_map with cache error returns ok", %{registries_running: registries_running?} do
      if registries_running? do
        map_id = "test_map_#{:rand.uniform(1_000_000)}"
        # Even if cache operations fail, should return :ok
        assert :ok = MapPoolDynamicSupervisor.stop_map(map_id)
      else
        :ok
      end
    end
    test "reconciler handles empty registries gracefully", %{reconciler_running: reconciler_running?} do
      if reconciler_running? do
        # Clear everything
        cleanup_test_data()
        # Should not crash even with empty data
        send(Reconciler, :reconcile)
        Process.sleep(200)
        # No assertions - just verifying no crash
        assert true
      else
        :ok
      end
    end
    test "reconciler handles nil values in caches", %{reconciler_running: reconciler_running?} do
      if reconciler_running? do
        map_id = "test_map_#{:rand.uniform(1_000_000)}"
        # Explicitly set nil
        Cachex.put(@cache, map_id, nil)
        # Should handle gracefully
        send(Reconciler, :reconcile)
        Process.sleep(200)
        assert true
      else
        :ok
      end
    end
  end
end