wanderer/lib/wanderer_app/security_audit/async_processor.ex

defmodule WandererApp.SecurityAudit.AsyncProcessor do
  @moduledoc """
  GenServer for asynchronous batch processing of security audit events.

  This server buffers audit events in memory and periodically flushes them
  to the database in batches for improved performance.
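
  ## Usage

  A minimal sketch (the options shown are the ones `init/1` reads;
  `child_spec/1` is provided by `use GenServer`):

      children = [
        {WandererApp.SecurityAudit.AsyncProcessor, batch_size: 200}
      ]

      Supervisor.start_link(children, strategy: :one_for_one)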
"""
use GenServer
require Logger
alias WandererApp.SecurityAudit
@default_batch_size 100
# 5 seconds
@default_flush_interval 5_000
@max_buffer_size 1_000

  defstruct [
    :batch_size,
    :flush_interval,
    :buffer,
    :timer_ref,
    :stats
  ]

  # Client API

  @doc """
  Start the async processor.
  """
  def start_link(opts \\ []) do
    GenServer.start_link(__MODULE__, opts, name: __MODULE__)
  end

  @doc """
  Log an event asynchronously.
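
  Example (a sketch; the entry fields shown are assumed from what
  `store_event_chunk/1` logs, not a fixed schema):

      WandererApp.SecurityAudit.AsyncProcessor.log_event(%{
        event_type: :login_failed,
        user_id: "user-123"
      })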
"""
def log_event(audit_entry) do
GenServer.cast(__MODULE__, {:log_event, audit_entry})
end
@doc """
Force a flush of the buffer.
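
  This is a synchronous call; it returns `:ok` once the flush has run:

      :ok = WandererApp.SecurityAudit.AsyncProcessor.flush()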
"""
def flush do
GenServer.call(__MODULE__, :flush)
end
@doc """
Get current processor statistics.
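
  Returns the stats map kept in state plus `:current_buffer_size`
  (values below are illustrative):

      %{
        events_processed: 120,
        batches_flushed: 2,
        errors: 0,
        last_flush: ~U[2024-01-01 12:00:00Z],
        current_buffer_size: 20
      }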
"""
def get_stats do
GenServer.call(__MODULE__, :get_stats)
end
# Server callbacks
@impl true
def init(opts) do
config = Application.get_env(:wanderer_app, WandererApp.SecurityAudit, [])
batch_size = Keyword.get(opts, :batch_size, config[:batch_size] || @default_batch_size)
flush_interval =
Keyword.get(opts, :flush_interval, config[:flush_interval] || @default_flush_interval)
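
    # Config sketch (keys inferred from the reads above; values illustrative),
    # e.g. in config/config.exs:
    #
    #     config :wanderer_app, WandererApp.SecurityAudit,
    #       batch_size: 200,
    #       flush_interval: 10_000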

    state = %__MODULE__{
      batch_size: batch_size,
      flush_interval: flush_interval,
      buffer: [],
      timer_ref: nil,
      stats: %{
        events_processed: 0,
        batches_flushed: 0,
        errors: 0,
        last_flush: nil
      }
    }

    # Schedule first flush
    state = schedule_flush(state)

    {:ok, state}
  end

  @impl true
  def handle_cast({:log_event, audit_entry}, state) do
    # Add to buffer
    buffer = [audit_entry | state.buffer]
    buf_len = length(buffer)

    # Update stats
    stats = Map.update!(state.stats, :events_processed, &(&1 + 1))

    # Check if we need to flush. The overflow clause must come first:
    # batch_size is normally smaller than @max_buffer_size, so a
    # batch-size-first `cond` would never reach it.
    cond do
      buf_len >= @max_buffer_size ->
        # Force flush if max buffer size reached
        Logger.warning("Security audit buffer overflow, forcing flush",
          buffer_size: buf_len,
          max_size: @max_buffer_size
        )

        {:noreply, do_flush(%{state | buffer: buffer, stats: stats})}

      buf_len >= state.batch_size ->
        # Flush immediately if batch size reached
        {:noreply, do_flush(%{state | buffer: buffer, stats: stats})}

      true ->
        # Just add to buffer
        {:noreply, %{state | buffer: buffer, stats: stats}}
    end
  end

  @impl true
  def handle_call(:flush, _from, state) do
    new_state = do_flush(state)
    {:reply, :ok, new_state}
  end

  @impl true
  def handle_call(:get_stats, _from, state) do
    stats = Map.put(state.stats, :current_buffer_size, length(state.buffer))
    {:reply, stats, state}
  end

  @impl true
  def handle_info(:flush_timer, state) do
    state =
      if state.buffer != [] do
        do_flush(state)
      else
        state
      end

    # Schedule next flush
    state = schedule_flush(state)

    {:noreply, state}
  end

  @impl true
  def terminate(_reason, state) do
    # Flush any remaining events on shutdown
    if state.buffer != [] do
      do_flush(state)
    end

    :ok
  end

  # Private functions

  defp schedule_flush(state) do
    # Cancel existing timer if any
    if state.timer_ref do
      Process.cancel_timer(state.timer_ref)
    end

    # Schedule new timer
    timer_ref = Process.send_after(self(), :flush_timer, state.flush_interval)
    %{state | timer_ref: timer_ref}
  end

  defp do_flush(%__MODULE__{buffer: []} = state), do: state

  defp do_flush(state) do
    # Take events to flush (reverse to maintain oldest-first order)
    events = Enum.reverse(state.buffer)

    # Attempt to store events
    case bulk_store_events(events) do
      {:ok, count} ->
        Logger.debug("Flushed #{count} security audit events")

        # Update stats
        stats =
          state.stats
          |> Map.update!(:batches_flushed, &(&1 + 1))
          |> Map.put(:last_flush, DateTime.utc_now())

        # Clear buffer
        %{state | buffer: [], stats: stats}

      {:partial, success_count, failed_events} ->
        failed_count = length(failed_events)

        Logger.warning(
          "Partial flush: stored #{success_count}, failed #{failed_count} audit events",
          success_count: success_count,
          failed_count: failed_count,
          buffer_size: length(state.buffer)
        )

        # Emit telemetry for monitoring
        :telemetry.execute(
          [:wanderer_app, :security_audit, :async_flush_partial],
          %{success_count: success_count, failed_count: failed_count},
          %{}
        )

        # Update stats - count partial flush as both success and error
        stats =
          state.stats
          |> Map.update!(:batches_flushed, &(&1 + 1))
          |> Map.update!(:errors, &(&1 + 1))
          |> Map.put(:last_flush, DateTime.utc_now())

        # Extract just the events from failed_events tuples
        failed_only = Enum.map(failed_events, fn {event, _reason} -> event end)

        # Every buffered event was attempted, so retain only the failed ones;
        # re-buffering the successfully stored events would duplicate them on
        # the next flush. Reverse back to newest-first ordering (flush
        # reversed the buffer to oldest-first).
        buffer = handle_buffer_overflow(Enum.reverse(failed_only), @max_buffer_size)
        %{state | buffer: buffer, stats: stats}

      {:error, failed_events} ->
        failed_count = length(failed_events)

        Logger.error("Failed to flush all #{failed_count} security audit events",
          failed_count: failed_count,
          buffer_size: length(state.buffer)
        )

        # Emit telemetry for monitoring
        :telemetry.execute(
          [:wanderer_app, :security_audit, :async_flush_failure],
          %{count: 1, event_count: failed_count},
          %{}
        )

        # Update error stats
        stats = Map.update!(state.stats, :errors, &(&1 + 1))

        # Extract just the events from failed_events tuples
        failed_only = Enum.map(failed_events, fn {event, _reason} -> event end)

        # Since ALL events failed, the new buffer should only contain the failed events
        # Reverse to maintain newest-first ordering (flush reversed to oldest-first)
        buffer = handle_buffer_overflow(Enum.reverse(failed_only), @max_buffer_size)
        %{state | buffer: buffer, stats: stats}
    end
  end

  defp bulk_store_events(events) do
    # Process events in smaller chunks if necessary
    events
    # Ash bulk operations work better with smaller chunks
    |> Enum.chunk_every(50)
    |> Enum.reduce({0, []}, fn chunk, {total_success, all_failed} ->
      case store_event_chunk(chunk) do
        {:ok, chunk_count} ->
          {total_success + chunk_count, all_failed}

        {:partial, chunk_count, failed_events} ->
          {total_success + chunk_count, all_failed ++ failed_events}

        {:error, failed_events} ->
          {total_success, all_failed ++ failed_events}
      end
    end)
    |> then(fn {success_count, failed_events_list} ->
      # Derive the final return shape based on results
      cond do
        failed_events_list == [] ->
          {:ok, success_count}

        success_count == 0 ->
          {:error, failed_events_list}

        true ->
          {:partial, success_count, failed_events_list}
      end
    end)
  end

  defp handle_buffer_overflow(buffer, max_size) when length(buffer) > max_size do
    dropped = length(buffer) - max_size

    Logger.warning(
      "Dropping #{dropped} oldest audit events due to buffer overflow",
      buffer_size: length(buffer),
      max_size: max_size
    )

    # Emit telemetry for dropped events
    :telemetry.execute(
      [:wanderer_app, :security_audit, :events_dropped],
      %{count: dropped},
      %{}
    )

    # Keep the newest events (take from the front since buffer is newest-first)
    Enum.take(buffer, max_size)
  end

  defp handle_buffer_overflow(buffer, _max_size), do: buffer
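
  # Monitoring sketch: the telemetry events emitted above can be observed
  # with `:telemetry.attach_many/4` (handler id and log message are
  # illustrative):
  #
  #     :telemetry.attach_many(
  #       "security-audit-async-processor",
  #       [
  #         [:wanderer_app, :security_audit, :async_flush_partial],
  #         [:wanderer_app, :security_audit, :async_flush_failure],
  #         [:wanderer_app, :security_audit, :events_dropped]
  #       ],
  #       fn event, measurements, _metadata, _config ->
  #         Logger.warning("audit telemetry: #{inspect(event)} #{inspect(measurements)}")
  #       end,
  #       nil
  #     )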

  defp store_event_chunk(events) do
    # Process each event and partition results
    {successes, failures} =
      events
      |> Enum.map(fn event ->
        case SecurityAudit.do_store_audit_entry(event) do
          :ok ->
            {:ok, event}

          {:error, reason} ->
            Logger.error("Failed to store individual audit event",
              error: inspect(reason),
              event_type: Map.get(event, :event_type),
              user_id: Map.get(event, :user_id)
            )

            {:error, {event, reason}}
        end
      end)
      |> Enum.split_with(fn
        {:ok, _} -> true
        {:error, _} -> false
      end)

    successful_count = length(successes)
    failed_count = length(failures)

    # Extract failed events with reasons
    failed_events = Enum.map(failures, fn {:error, event_reason} -> event_reason end)

    # Log if some events failed (telemetry will be emitted at flush level)
    if failed_count > 0 do
      Logger.debug("Chunk processing: #{failed_count} of #{length(events)} events failed")
    end

    # Return richer result shape
    cond do
      successful_count == 0 ->
        {:error, failed_events}

      failed_count > 0 ->
        {:partial, successful_count, failed_events}

      true ->
        {:ok, successful_count}
    end
  end
end