SDK Reference
Reference for every public symbol exported from paygent. Anything not on this page (modules with a leading _, internal helpers) is an implementation detail and may change between minor versions.
from paygent import (
Paygent,
paygent_context,
paygent_track,
PaygentLimitExceeded,
PaygentBackendUnreachable,
PaygentAuthInvalid,
PlanConfig,
ModelCostRate,
ModelLimitConfig,
GuardResult,
UsageEvent,
CurrentUsage,
ModelUsage,
UserState,
BudgetRemaining,
MaxTokensAdvice,
BillingPeriod,
)
from paygent.integrations import LangChainCallback, CrewAICallback
Everything below is grouped by purpose. Methods beginning with pg. are bound to the Paygent instance returned by Paygent.init(). Methods beginning with Paygent. are class-level.
Lifecycle
Setup, teardown, and event-queue control. Every app calls these.
Paygent.init(...)
@classmethod
def init(
cls,
api_key: str | None = None,
base_url: str | None = None,
db_path: str | None = None,
flush_interval: float = 5.0,
max_batch_size: int = 100,
max_queue_size: int = 10_000,
raise_on_hard_gate: bool = True,
auto_instrument: bool = True,
registry: ProviderRegistry | None = None,
refresh_interval: float = 60.0,
strict_backend: bool = False,
) -> Paygent
Creates and initializes the Paygent singleton. If a previous instance exists, it's shut down first.
| Parameter | Default | Description |
|---|---|---|
api_key |
None |
Backend API key. None → local-only mode. Read from PAYGENT_API_KEY env var if you don't pass it. |
base_url |
https://api.paygent.to |
Backend URL. Override via PAYGENT_BASE_URL env var. |
db_path |
~/.paygent/local.db |
Local SQLite path. |
flush_interval |
5.0 |
Seconds between background event-queue flushes. |
max_batch_size |
100 |
Max events per flush batch. |
max_queue_size |
10_000 |
Max queue depth before events get dropped. |
raise_on_hard_gate |
True |
If False, hard-gated calls execute (callbacks still fire). |
auto_instrument |
True |
Whether to monkey-patch OpenAI / Anthropic at init. |
registry |
None |
Custom ProviderRegistry. Defaults to OpenAI + Anthropic. |
refresh_interval |
60.0 |
Seconds between background UserState refreshes from backend. |
strict_backend |
False |
If True, raise on init when backend is unreachable / API key invalid (instead of warning). |
Returns: the Paygent singleton.
pg = Paygent.init(api_key=os.environ["PAYGENT_API_KEY"])
Paygent.get_instance()
@classmethod
def get_instance(cls) -> Paygent | None
Returns the current singleton, or None if init() hasn't been called.
pg.instrument()
def instrument(self) -> list[str]
Manually patch LLM providers. Only needed if auto_instrument=False was passed to init(). Returns the list of patch keys applied.
pg.uninstrument()
def uninstrument(self) -> int
Remove all monkey-patches. Returns the number of patches restored.
pg.flush()
def flush(self) -> int
Force a synchronous flush of the event queue. Returns the number of events flushed.
pg.flush() # ensure backend is up to date before next read
pg.shutdown(timeout=10.0)
def shutdown(self, timeout: float = 10.0) -> None
Graceful shutdown. Restores patches, flushes events, syncs unsynced events to the backend, closes connections.
Called automatically via atexit, but you can call it explicitly in tests or short-lived scripts.
Properties
Read-only state on the Paygent instance.
| Property | Type | Description |
|---|---|---|
pg.api_key |
str | None |
Configured API key. |
pg.base_url |
str |
Backend URL. |
pg.is_local_mode |
bool |
True if no API key. |
pg.is_connected |
bool |
True if backend is currently reachable. |
pg.backend_reachable |
bool |
Result of the init-time health check. |
pg.is_initialized |
bool |
True after init() completes. |
pg.is_instrumented |
bool |
True if monkey-patches are currently in place. |
pg.pending_events |
int |
Events waiting to flush. |
pg.queue_stats |
dict |
Event queue diagnostics. |
Metering
Explicit per-call wrappers and the decorator. Use these when you'd rather not monkey-patch, or when the patcher can't see your call site.
pg.wrap(call, ...)
def wrap(
self,
call: Callable[[], Any],
*,
user_id: str,
model: str | None = None,
session_id: str | None = None,
metadata: dict[str, Any] | None = None,
provider: str | None = None,
estimated_input_tokens: int | None = None,
estimated_max_tokens: int | None = None,
) -> Any
Explicit per-call metering. Pass a zero-arg callable that performs the LLM call.
response = pg.wrap(
lambda: client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": "Hello"}],
),
user_id="user_123",
model="gpt-4o",
)
| Parameter | Description |
|---|---|
call |
Zero-arg callable performing the LLM call. |
user_id |
Required. End-user identifier. |
model |
Optional. Used for guard checks and cost calc. If None, extracted from the response. |
session_id |
Optional. Override the session ID for this call. |
metadata |
Optional. Attached to the resulting UsageEvent. |
provider |
"openai" or "anthropic". None → try all registered providers. |
estimated_input_tokens |
Used for the concurrent-safe reservation. |
estimated_max_tokens |
Output cap estimate. Falls back to plan_config.pre_call_buffer_tokens. |
Raises: PaygentLimitExceeded if a hard gate fires and raise_on_hard_gate=True.
Returns: the value returned by call().
pg.awrap(call, ...)
async def awrap(
self,
call: Awaitable[Any],
*,
user_id: str,
model: str | None = None,
session_id: str | None = None,
metadata: dict[str, Any] | None = None,
provider: str | None = None,
estimated_input_tokens: int | None = None,
estimated_max_tokens: int | None = None,
) -> Any
Async version of wrap(). Accepts a coroutine or awaitable.
response = await pg.awrap(
async_client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": "Hello"}],
),
user_id="user_123",
model="gpt-4o",
)
pg.track(...)
def track(
self,
user_id: str | None = None,
session_id: str | None = None,
plan: str | None = None,
metadata: dict[str, Any] | None = None,
*,
user_id_param: str | None = None,
) -> Callable
Decorator alternative to paygent_context. See paygent_track() below — pg.track() is the same decorator, bound to the Paygent instance.
@pg.track(user_id="user_123")
def handle(query):
return client.chat.completions.create(...)
Guard checks
Pre-flight budget checks. None of these execute an LLM call — they answer "would the next call be allowed?"
pg.check_guard(user_id, model=None)
def check_guard(self, user_id: str, model: str | None = None) -> GuardResult
Run the guard check manually. Returns GuardResult(status="ok") if no session exists. Useful for pre-flight checks.
pg.is_within_limit(user_id, model=None)
def is_within_limit(self, user_id: str, model: str | None = None) -> bool
Convenience wrapper. Returns True unless the next call would hard-gate. Soft-gate state counts as "within limit."
if pg.is_within_limit("user_123", model="gpt-4o"):
response = client.chat.completions.create(model="gpt-4o", ...)
pg.get_remaining_budget(user_id)
def get_remaining_budget(self, user_id: str) -> BudgetRemaining
Multi-dimensional snapshot of remaining budget. See BudgetRemaining below.
b = pg.get_remaining_budget("user_123")
print(b.period_spend_remaining, b.most_constrained)
pg.get_max_tokens(user_id, model, *, estimated_input_tokens=0, messages=None, hard_cap=4096)
def get_max_tokens(
self,
user_id: str,
model: str,
*,
estimated_input_tokens: int = 0,
messages: list | None = None,
hard_cap: int = 4096,
) -> MaxTokensAdvice
Recommends a max_tokens value that won't push the user over any hard gate.
advice = pg.get_max_tokens("user_123", model="gpt-4o", messages=[...])
if advice.max_tokens == 0:
return "Out of budget"
response = client.chat.completions.create(
model="gpt-4o",
messages=[...],
max_tokens=advice.max_tokens,
)
If messages is given and estimated_input_tokens=0, Paygent estimates input tokens internally (chars/4 heuristic).
User state & usage
Read-only accessors for cached state and usage counters. Use these for dashboards, debugging, or pre-call budget displays.
pg.get_usage(user_id)
def get_usage(self, user_id: str) -> CurrentUsage
Returns a deep-copy snapshot of the user's CurrentUsage. Auto-loads via the 5-step fallback chain (memory → backend → SQLite snapshot → expired snapshot → permissive defaults). Always returns a CurrentUsage object — never None.
Mutating the returned object does not affect the SDK's internal state.
usage = pg.get_usage("user_123")
print(usage.period_cost, usage.period_tokens_total)
pg.get_model_usage(user_id)
def get_model_usage(self, user_id: str) -> list[ModelUsage]
Per-model breakdown. Returns one ModelUsage per model that either has metered usage or has a configured limit.
for mu in pg.get_model_usage("user_123"):
print(mu.model, mu.tokens_used, "/", mu.tokens_limit, f"${mu.cost:.4f}")
pg.get_user_state(user_id)
def get_user_state(self, user_id: str) -> UserState
Returns the live cached UserState. Mutations affect subsequent guard checks — power-user accessor. For read-only use cases, prefer get_usage() / get_model_usage().
Auto-loads via the 5-step fallback chain. Always returns a UserState.
pg.refresh_user(user_id)
def refresh_user(self, user_id: str) -> UserState
Force-refresh user state from the backend. Useful after a plan change so the SDK picks up the new limits immediately rather than waiting for refresh_interval.
Falls back to existing cached state if the backend is unreachable.
# After updating subscription via the API:
pg.refresh_user("user_123")
pg.reset_user(user_id)
def reset_user(self, user_id: str) -> None
Drop the user from the in-memory cache. Next call for this user re-runs the 5-step fallback chain.
Callbacks
Register hooks that fire on gate decisions and metering events. Multiple callbacks per slot are allowed.
pg.on_soft_gate(callback)
def on_soft_gate(self, callback: Callable[[GuardResult], None]) -> None
Register a soft-gate callback. Multiple callbacks can be registered. They fire in registration order; one throwing doesn't stop the others.
pg.on_soft_gate(lambda r: print(f"⚠ {r.message}"))
pg.on_hard_gate(callback)
def on_hard_gate(self, callback: Callable[[GuardResult], None]) -> None
Register a hard-gate callback. Fires before PaygentLimitExceeded is raised.
pg.on_usage(callback)
def on_usage(self, callback: Callable[[UsageEvent], None]) -> None
Register a callback for every successfully-metered call. Fires across all metering paths (auto-instrument, wrap, framework callbacks).
pg.on_session_start(callback)
def on_session_start(self, callback: Callable[[UserState], None]) -> None
Register a callback that fires when a UserState is first loaded for a user.
Local-only mode
Configure plans and assign users in code, without a backend. Used in tests, development, and air-gapped environments.
pg.configure_plan(plan_name, config)
def configure_plan(self, plan_name: str, config: PlanConfig) -> None
Register a PlanConfig under a name for local-only mode. In connected mode, plans live in the backend; this method is a no-op for guard purposes.
pg.configure_plan("pro", PlanConfig(max_spend_per_period=49.00, ...))
pg.assign_plan(user_id, plan, billing_period=None)
def assign_plan(
self,
user_id: str,
plan: str,
billing_period: BillingPeriod | None = None,
) -> UserState
Assigns a plan to a user. The plan should have been registered via configure_plan (unknown plans fall back to permissive defaults — see below). Used in local-only mode; connected mode uses the backend's subscription endpoint.
pg.assign_plan("user_123", plan="pro", billing_period=BillingPeriod(start=..., end=...))
If the plan is unknown, falls back to permissive defaults (no limits).
Returns: the UserState it just created or updated.
pg.start_session(user_id, plan="default", plan_config=None)
def start_session(
self,
user_id: str,
plan: str = "default",
plan_config: PlanConfig | None = None,
) -> UserState
Pre-warm the cache. Optional — the SDK auto-fetches on first use. Call this if you want to avoid the small latency hit on the first call.
In connected mode, tries the backend first (using plan_config as fallback if backend is unreachable). In local mode, uses plan_config directly.
Context managers & decorators
Set per-request user identity so the auto-patcher can attribute calls to the right user.
paygent_context(...)
@contextmanager
def paygent_context(
user_id: str,
session_id: str | None = None,
plan: str | None = None,
metadata: dict[str, Any] | None = None,
)
Set the user context for all LLM calls within the block. The patcher reads this on every call.
with paygent_context(user_id="user_123", metadata={"feature": "summarize"}):
response = client.chat.completions.create(...)
Works with sync code, async tasks, and threaded code (contextvars propagates correctly).
paygent_track(...)
def paygent_track(
user_id: str | None = None,
session_id: str | None = None,
plan: str | None = None,
metadata: dict[str, Any] | None = None,
*,
user_id_param: str | None = None,
) -> Callable
Decorator that wraps a function in paygent_context.
Static user:
from paygent import paygent_track
@paygent_track(user_id="user_123")
def handle(query):
return client.chat.completions.create(...)
Dynamic user from a function arg:
@paygent_track(user_id_param="user_id")
def handle(user_id: str, query: str):
return client.chat.completions.create(...)
pg.track(...) is the instance-bound version of this.
Exceptions
Errors and warnings the SDK can raise. Catch PaygentLimitExceeded in your request handlers; the two UserWarning subclasses are emitted at init.
class PaygentLimitExceeded(Exception)
Raised when a hard gate fires (and raise_on_hard_gate=True).
class PaygentLimitExceeded(Exception):
guard_result: GuardResult
try:
response = client.chat.completions.create(...)
except PaygentLimitExceeded as e:
e.guard_result.gate_reason # "total_spend", "model_limit:gpt-4o", ...
e.guard_result.message
e.guard_result.usage_pct
class PaygentBackendUnreachable(UserWarning)
Emitted at init() when the backend is unreachable. With strict_backend=True, raised instead.
class PaygentAuthInvalid(UserWarning)
Emitted at init() when the backend rejects the API key (401/403). With strict_backend=True, raised instead.
Models
Pydantic v2 models. All fields shown are public.
class PlanConfig
class PlanConfig(BaseModel):
max_spend_per_period: float = float("inf")
max_spend_per_session: float = float("inf")
soft_gate_at: float = 0.80
hard_gate_at: float = 1.00
model_limits: dict[str, ModelLimitConfig] = {}
cost_rates: dict[str, ModelCostRate] = {}
default_cost_rate: ModelCostRate | None = None
tool_costs: dict[str, float] = {}
default_tool_cost: float = 0.02
session_timeout_minutes: float = 30.0
pre_call_estimate: bool = False
pre_call_buffer_tokens: int = 4096
reservation_safety_factor: float = 1.2
See Configure your first plan for the field-by-field walkthrough.
class ModelCostRate
class ModelCostRate(BaseModel):
input: float # cost per 1K input tokens
output: float # cost per 1K output tokens
class ModelLimitConfig
class ModelLimitConfig(BaseModel):
max_tokens_per_period: int | None = None # None = unlimited
class GuardResult
class GuardResult(BaseModel):
status: Literal["ok", "soft_gate", "hard_gate"] = "ok"
gate_reason: str | None = None
usage_pct: float = 0.0
current_value: float = 0.0
limit_value: float = 0.0
message: str | None = None
gate_reason values: "total_spend", "session_spend", "model_limit:<model>".
class UsageEvent
class UsageEvent(BaseModel):
id: str # UUID, idempotency key
user_id: str
session_id: str | None = None
timestamp: datetime
model: str | None = None
input_tokens: int = 0
output_tokens: int = 0
total_tokens: int = 0
tool_calls: list[str] = []
cost_tokens: float = 0.0
cost_tools: float = 0.0
cost_total: float = 0.0
metadata: dict = {}
synced: bool = False
class CurrentUsage
class CurrentUsage(BaseModel):
period_cost: float = 0.0
session_cost: float = 0.0
session_id: str | None = None
session_started_at: datetime | None = None
period_tokens_total: int = 0
period_tokens_by_model: dict[str, int] = {}
period_cost_by_model: dict[str, float] = {}
reserved_cost: float = 0.0
reserved_tokens_by_model: dict[str, int] = {}
class ModelUsage
class ModelUsage(BaseModel):
model: str
tokens_used: int = 0
tokens_limit: int | None = None # None = unlimited
cost: float = 0.0
class UserState
class UserState(BaseModel):
user_id: str
paygent_user_id: str | None = None
plan: str
sdk_enabled: bool = True
plan_config: PlanConfig
current_usage: CurrentUsage
billing_period: BillingPeriod | None = None
last_refreshed: datetime | None = None
UserSession is a deprecated alias for UserState.
class BillingPeriod
class BillingPeriod(BaseModel):
start: datetime
end: datetime
class BudgetRemaining
class BudgetRemaining(BaseModel):
period_spend_remaining: float = float("inf")
session_spend_remaining: float = float("inf")
model_tokens_remaining: dict[str, int | None] = {}
most_constrained: Literal[
"period_spend", "session_spend", "model_tokens", "unbounded"
] = "unbounded"
model_tokens_remaining only includes models with a configured max_tokens_per_period. Unlimited per-model entries appear as None.
class MaxTokensAdvice
class MaxTokensAdvice(BaseModel):
max_tokens: int = 0
binding_limit: Literal[
"period_spend", "session_spend", "model_tokens", "unbounded", "blocked"
] = "unbounded"
period_spend_remaining: float = float("inf")
session_spend_remaining: float = float("inf")
model_tokens_remaining: int | None = None
max_tokens == 0 and binding_limit == "blocked" means don't make the call — the user is already over a hard gate.
Framework integrations
Drop-in callback classes for LangChain and CrewAI. See LangChain & CrewAI for usage walkthroughs.
class LangChainCallback
class LangChainCallback:
def __init__(
self,
pg: Paygent,
user_id: str,
session_id: str | None = None,
metadata: dict[str, Any] | None = None,
) -> None
LangChain callback handler. Passes the resulting UsageEvent through Paygent's normal flow.
from paygent.integrations import LangChainCallback
cb = LangChainCallback(pg, user_id="user_123")
chain.invoke({...}, config={"callbacks": [cb]})
De-dup: if auto_instrument=True AND a paygent_context is set, this callback skips itself (the patcher will meter).
class CrewAICallback
class CrewAICallback:
def __init__(
self,
pg: Paygent,
user_id: str,
session_id: str | None = None,
metadata: dict[str, Any] | None = None,
) -> None
def __call__(self, step_output: Any) -> None:
...
CrewAI step callback. Pass the instance to Crew(..., step_callback=cb).
from paygent.integrations import CrewAICallback
cb = CrewAICallback(pg, user_id="user_123")
crew = Crew(..., step_callback=cb)
crew.kickoff()
Same de-dup semantics as LangChainCallback.
Next steps
- Backend API Reference — what the SDK calls underneath
- Cost Guardrails — how it all fits together
- Troubleshooting — when it doesn't