Usage Examples

All examples assume the following imports:

import pandas as pd
from framecheck import FrameCheck

column(…) – Core Behaviors

Ensures column exists

df = pd.DataFrame({'x': [1, 2, 3]})
schema = FrameCheck().column('x')
result = schema.validate(df)

FrameCheck validation passed.

Type enforcement

df = pd.DataFrame({'x': [1, 2, 'bad']})
schema = FrameCheck().column('x', type='int')
result = schema.validate(df)

FrameCheck validation errors:
- Column 'x' contains values that are not integer-like (e.g., decimals or strings): ['bad'].

in_set: Allowed values

df = pd.DataFrame({'status': ['new', 'active', 'archived']})
schema = FrameCheck().column('status', in_set=['new', 'active'])
result = schema.validate(df)

FrameCheck validation errors:
- Column 'status' contains values not in allowed set: ['archived'].

equals: All values must match

df = pd.DataFrame({'is_active': [True, False, True]})
schema = FrameCheck().column('is_active', type='bool', equals=True)
result = schema.validate(df)

FrameCheck validation errors:
- Column 'is_active' must equal True, but found values: [False].

not_null=True: Non-null required

df = pd.DataFrame({'is_active': [True, False, None]})
schema = FrameCheck().column('is_active', type='bool', not_null=True)
result = schema.validate(df)

FrameCheck validation errors:
- Column 'is_active' contains missing values.

regex: Pattern match

df = pd.DataFrame({'email': ['x@example.com', 'bademail']})
schema = FrameCheck().column('email', type='string', regex=r'.+@.+\..+')
result = schema.validate(df)

FrameCheck validation errors:
- Column 'email' has values not matching regex '.+@.+\..+': ['bademail'].

Range & Bound Checks

df = pd.DataFrame({
    'age': [25, 17, 101],
    'score': [0.9, 0.5, 1.2],
    'signup_date': ['2021-01-01', '2019-12-31', '2023-05-01'],
    'last_login': ['2020-01-01', '2026-01-01', '2023-06-15']
})

schema = (
    FrameCheck()
    .column('age', type='int', min=18, max=99)
    .column('score', type='float', min=0.0, max=1.0)
    .column('signup_date', type='datetime', after='2020-01-01', before='2025-01-01')
    .column('last_login', type='datetime', min='2020-01-01', max='2025-01-01')
)
result = schema.validate(df)

FrameCheck validation errors:
- Column 'age' has values less than 18.
- Column 'age' has values greater than 99.
- Column 'score' has values greater than 1.0.
- Column 'signup_date' violates 'after' constraint: 2020-01-01.
- Column 'last_login' violates 'max' constraint: 2025-01-01.

columns(…) and columns_are(…)

Multiple column validation

df = pd.DataFrame({
    'a': [0, 1, 2],
    'b': [1, 0, 3],
    'c': [1, 1, 1]
})

schema = FrameCheck().columns(['a', 'b'], type='int', in_set=[0, 1])
result = schema.validate(df)

FrameCheck validation errors:
- Column 'a' contains values not in allowed set: [2].
- Column 'b' contains values not in allowed set: [3].

Column order match

df = pd.DataFrame({'b': [1], 'a': [2]})
schema = FrameCheck().columns_are(['a', 'b'])
result = schema.validate(df)

FrameCheck validation errors:
Expected columns in order: ['a', 'b']
Found columns in order: ['b', 'a']

custom_check(…)

df = pd.DataFrame({
    'score': [0.2, 0.95, 0.6],
    'flagged': [False, False, True]
})

schema = (
    FrameCheck()
    .column('score', type='float')
    .column('flagged', type='bool')
    .custom_check(
        lambda row: row['score'] <= 0.9 or row['flagged'] is True,
        description="flagged must be True when score > 0.9"
    )
)
result = schema.validate(df)

FrameCheck validation errors:
flagged must be True when score > 0.9 (failed on 1 row(s))

Other Checks

empty()

df = pd.DataFrame({'x': [1, 2]})
schema = FrameCheck().empty()
result = schema.validate(df)

FrameCheck validation errors:
DataFrame is expected to be empty but contains rows.

not_empty()

df = pd.DataFrame(columns=['a', 'b'])
schema = FrameCheck().not_empty()
result = schema.validate(df)

FrameCheck validation errors:
DataFrame is unexpectedly empty.

only_defined_columns()

df = pd.DataFrame({'a': [1], 'b': [2], 'extra': [999]})
schema = FrameCheck().column('a').column('b').only_defined_columns()
result = schema.validate(df)

FrameCheck validation errors:
Unexpected columns in DataFrame: ['extra']

row_count()

df = pd.DataFrame({'x': [1, 2]})
schema = FrameCheck().row_count(min=5)
result = schema.validate(df)

FrameCheck validation errors:
DataFrame must have at least 5 rows (found 2).

unique(…)

df = pd.DataFrame({
    'user_id': [1, 2, 2],
    'email': ['a@example.com', 'b@example.com', 'b@example.com']
})
schema = FrameCheck().unique()
result = schema.validate(df)

FrameCheck validation errors:
Rows are not unique.

Unique based on columns

df = pd.DataFrame({
    'user_id': [1, 2, 2],
    'email': ['a@example.com', 'b@example.com', 'c@example.com']
})
schema = FrameCheck().unique(columns=['user_id'])
result = schema.validate(df)

FrameCheck validation errors:
Rows are not unique based on columns: ['user_id']

validate()

df = pd.DataFrame({
    'score': [0.1, 0.5, 1.2]  # 1.2 exceeds the max
})

schema = FrameCheck().column('score', type='float', max=1.0)
result = schema.validate(df)

if not result.is_valid:
    print(result.summary())

FrameCheck validation errors:
- Column 'score' has values greater than 1.0.

get_invalid_rows()

df = pd.DataFrame({
    'a': [1, 2, -1],
    'b': [10, 20, 30]
})

schema = FrameCheck().column('a', type='int', min=0)
result = schema.validate(df)

if not result.is_valid:
    invalid_df = result.get_invalid_rows(df)
    print(invalid_df)

   a   b
2 -1  30

This is useful when you want to log, inspect, or export failing rows for debugging or downstream review.

Validation Comparison

This section compares how the same validation logic is expressed using three tools:

FrameCheck (concise, purpose-built for DataFrames)
Pandera (powerful, flexible, but not optimized for logging or row capture)
Pydantic (designed for model schemas, not native to pandas)

—

Shared Setup

import pandas as pd
from datetime import datetime

df = pd.DataFrame({
    'transaction_id': ['TXN1001', 'TXN1002', 'TXN1003', 'NUM9999'],
    'user_id': [501, 502, -1, 504],
    'transaction_time': ['2024-04-15 08:23:11', '2024-04-15 08:45:22', '2024-04-15 09:01:37', '2024-04-17 12:01:42'],
    'model_score': [0.03, 0.92, 0.95, 0.0],
    'model_version': ['v2.1.0', 'v2.1.0', 'v2.1.0', 'v2.1.0'],
    'flagged_for_review': [False, True, False, False]
})

—

FrameCheck (19 lines)

from framecheck import FrameCheck

result = (
    FrameCheck()
    .column('transaction_id', type='string', regex=r'^TXN\d{4,}$')
    .column('user_id', type='int', min=1)
    .column('transaction_time', type='datetime', before='now')
    .column('model_score', type='float', min=0.0, max=1.0)
    .column('model_score', type='float', not_in_set=[0.0], warn_only=True)
    .column('model_version', type='string')
    .column('flagged_for_review', type='bool')
    .custom_check(
        lambda row: row['model_score'] <= 0.9 or row['flagged_for_review'] is True,
        "flagged_for_review must be True when model_score > 0.9"
    )
    .not_null()
    .not_empty()
    .only_defined_columns()
    .validate(df)
)

print(result.summary())

Validation FAILED
3 error(s), 1 warning(s)
Errors:
  - Column 'user_id' has values less than 1.
  - Column 'transaction_id' has values not matching regex '^TXN\d{4,}$'.
  - flagged_for_review must be True when model_score > 0.9 (failed on 1 row(s))
Warnings:
  - Column 'model_score' contains disallowed values: [0.0].

—

Pandera (with row capture added manually)

import pandera as pa
from pandera import Column, Check, DataFrameSchema

df['transaction_time'] = pd.to_datetime(df['transaction_time'])

schema = DataFrameSchema({
    "transaction_id": Column(str, Check.str_matches(r"^TXN\d{4,}$"), nullable=False),
    "user_id": Column(int, Check.ge(1), nullable=False),
    "transaction_time": Column(pa.Timestamp, Check(lambda s: s < datetime.now()), nullable=False),
    "model_score": Column(float, Check.in_range(0.0, 1.0), nullable=False),
    "model_version": Column(str, nullable=False),
    "flagged_for_review": Column(bool, nullable=False),
}, checks=[
    Check(
        lambda df: (df['model_score'] <= 0.9) | (df['flagged_for_review'] == True),
        element_wise=False,
        error="flagged_for_review must be True when model_score > 0.9"
    )
], strict=True)

if df.empty:
    raise pa.errors.SchemaError("DataFrame is unexpectedly empty")

if not df[df['model_score'] == 0.0].empty:
    print("Warning: model_score == 0.0 found")

try:
    validated_df = schema.validate(df)
except pa.errors.SchemaErrors as e:
    print("Pandera errors:")
    print(e.failure_cases[['column', 'failure_case', 'index']])

Warning: model_score == 0.0 found
---------------------------------------------------------------------------
SchemaError                               Traceback (most recent call last)
<ipython-input-25-d8d5f408d13b> in <cell line: 0>()
     26
     27 try:
---> 28     validated_df = schema.validate(df)
     29 except pa.errors.SchemaErrors as e:
     30     print("Pandera errors:")

13 frames
/usr/local/lib/python3.11/dist-packages/pandera/api/base/error_handler.py in collect_error(self, error_type, reason_code, schema_error, original_exc)
     52         """
     53         if not self._lazy:
---> 54             raise schema_error from original_exc
     55
     56         # delete data of validated object from SchemaError object to prevent

SchemaError: Column 'transaction_id' failed element-wise validator number 0: str_matches('^TXN\\d{4,}$') failure cases: NUM9999

Note

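By default, only the first failure encountered during validation is raised by Pandera. In this case, transaction_id='NUM9999' violates the regex constraint and halts validation with a SchemaError (singular), which the except pa.errors.SchemaErrors clause above does not catch, hence the traceback. Other issues, like user_id=-1, are not reported until the first error is resolved, unless schema.validate(df, lazy=True) is used, in which case Pandera collects all failures and raises SchemaErrors. This differs from FrameCheck, which collects all validation issues in a single pass.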

—

Pydantic (manual row iteration)

from pydantic import BaseModel, field_validator, model_validator
from typing import ClassVar
import re, logging

logger = logging.getLogger()

class ModelOutput(BaseModel):
    """Row-level schema for one record of the model-output DataFrame.

    Each field mirrors one DataFrame column. Per-field rules live in the
    ``field_validator`` methods, the cross-field rule in ``check_flagged``,
    and the DataFrame-level checks (emptiness, column set, per-row
    validation) in ``validate_df``.
    """

    transaction_id: str
    user_id: int
    transaction_time: datetime
    model_score: float
    model_version: str
    flagged_for_review: bool

    # Column names that validate_df compares against df.columns. Annotated as
    # ClassVar so it is a class-level constant rather than a per-row value.
    expected_columns: ClassVar[set] = {
        'transaction_id', 'user_id', 'transaction_time',
        'model_score', 'model_version', 'flagged_for_review'
    }

    @field_validator('transaction_id')
    @classmethod
    def validate_txn(cls, v):
        """Return ``v`` unchanged; raise ValueError unless it matches ``^TXN\\d{4,}$``."""
        if not re.match(r'^TXN\d{4,}$', v):
            raise ValueError("transaction_id must match TXN format")
        return v

    @field_validator('user_id')
    @classmethod
    def validate_uid(cls, v):
        """Return ``v`` unchanged; raise ValueError when it is less than 1."""
        if v < 1:
            raise ValueError("user_id must be positive")
        return v

    @field_validator('transaction_time')
    @classmethod
    def validate_time(cls, v):
        """Return ``v`` unchanged; raise ValueError when it is later than the current time."""
        # NOTE(review): datetime.now() is a naive local timestamp; this assumes
        # v is naive as well -- confirm against the incoming data.
        if v > datetime.now():
            raise ValueError("transaction_time must be before now")
        return v

    @field_validator('model_score')
    @classmethod
    def validate_score(cls, v):
        """Return ``v`` unchanged; raise ValueError when it falls outside [0.0, 1.0].

        A score of exactly 0.0 is accepted but logged as a warning.
        """
        if not (0.0 <= v <= 1.0):
            raise ValueError("model_score must be in [0,1]")
        # Warning only: 0.0 is in range, so the value is still returned below.
        if v == 0.0:
            logger.warning("model_score == 0.0 found")
        return v

    @model_validator(mode='after')
    def check_flagged(self):
        """Cross-field rule: a score above 0.9 requires ``flagged_for_review`` to be truthy.

        Returns ``self`` when the rule holds; raises ValueError otherwise.
        """
        if self.model_score > 0.9 and not self.flagged_for_review:
            raise ValueError("flagged_for_review must be True when score > 0.9")
        return self

    @classmethod
    def validate_df(cls, df):
        """Validate ``df`` as a whole and row by row, collecting every problem found.

        Args:
            df: DataFrame whose rows are validated against this model.

        Returns:
            A list of error message strings; empty when nothing failed.
        """
        errors = []
        if df.empty:
            errors.append("DataFrame is empty")
        # Set inequality: fires for missing columns as well as extra ones,
        # even though the message only says "Unexpected columns".
        if set(df.columns) != cls.expected_columns:
            errors.append("Unexpected columns")
        for idx, row in df.iterrows():
            try:
                cls.model_validate(row.to_dict())
            # Any exception is recorded against the row label so the scan
            # continues with the remaining rows instead of stopping here.
            except Exception as e:
                errors.append(f"Row {idx}: {e}")
        return errors

errors = ModelOutput.validate_df(df)
if errors:
    print("Pydantic validation errors:")
    for e in errors:
        print(e)

WARNING:root:model_score == 0.0 found
Pydantic validation errors:
Row 2: 1 validation error for ModelOutput
__root__
  flagged_for_review must be True when score > 0.9 (type=value_error)
Row 3: 1 validation error for ModelOutput
transaction_id
  transaction_id must match TXN format (type=value_error)
Row 2: 1 validation error for ModelOutput
user_id
  user_id must be positive (type=value_error)

Serialization and Persistence

Saving/Loading Schemas

import pandas as pd
from framecheck import FrameCheck, register_check_function

# Create a validator
validator = (
   FrameCheck()
   .column('user_id', type='string', regex=r'^U\d+$')
   .column('age', type='int', min=18)
   .not_null()
)

# Save to a file
validator.save('user_schema.json')

# Later, load it back
loaded_validator = FrameCheck.load('user_schema.json')
result = loaded_validator.validate(df)

to_dict() and from_dict()

# Convert to a dictionary representation
schema_dict = validator.to_dict()
print(schema_dict)  # Dictionary with all check information

# Create a validator from a dictionary
restored = FrameCheck.from_dict(schema_dict)
result = restored.validate(df)

to_json() and from_json()

# Convert to a JSON string
schema_json = validator.to_json()
print(schema_json)  # JSON string with all check information

# Create a validator from JSON
restored = FrameCheck.from_json(schema_json)
result = restored.validate(df)

info()

# Get a dictionary with all checks
details = validator.info()
print(f"Number of column checks: {len(details['column_checks'])}")
print(f"Number of dataframe checks: {len(details['dataframe_checks'])}")

Registered Check Functions

from framecheck import FrameCheck, register_check_function

# Register a reusable check function
@register_check_function(name="custom_age_check")
def validate_age_vs_income(row):
    """Row-level check: pass unless age is under 25 while income exceeds 100000.

    Returns False for the suspicious combination (very young with very high
    income) and True for every other row.
    """
    # `and` short-circuits, so 'income' is only read when the age test is met.
    return not (row['age'] < 25 and row['income'] > 100000)

# Use the registered check by name
validator = (
    FrameCheck()
    .column('age', type='int')
    .column('income', type='float')
    .registered_check('custom_age_check', "Age/income relationship check")
)

result = validator.validate(df)