Usage Examples

All examples assume you have:

import pandas as pd
from framecheck import FrameCheck

column(…) – Core Behaviors

Ensures column exists

df = pd.DataFrame({'x': [1, 2, 3]})
schema = FrameCheck().column('x')
result = schema.validate(df)
FrameCheck validation passed.

Type enforcement

df = pd.DataFrame({'x': [1, 2, 'bad']})
schema = FrameCheck().column('x', type='int')
result = schema.validate(df)
FrameCheck validation errors:
- Column 'x' contains values that are not integer-like (e.g., decimals or strings): ['bad'].

in_set: Allowed values

df = pd.DataFrame({'status': ['new', 'active', 'archived']})
schema = FrameCheck().column('status', in_set=['new', 'active'])
result = schema.validate(df)
FrameCheck validation errors:
- Column 'status' contains values not in allowed set: ['archived'].

equals: All values must match

df = pd.DataFrame({'is_active': [True, False, True]})
schema = FrameCheck().column('is_active', type='bool', equals=True)
result = schema.validate(df)
FrameCheck validation errors:
- Column 'is_active' must equal True, but found values: [False].

not_null=True: Non-null required

df = pd.DataFrame({'is_active': [True, False, None]})
schema = FrameCheck().column('is_active', type='bool', not_null=True)
result = schema.validate(df)
FrameCheck validation errors:
- Column 'is_active' contains missing values.

regex: Pattern match

df = pd.DataFrame({'email': ['x@example.com', 'bademail']})
schema = FrameCheck().column('email', type='string', regex=r'.+@.+\..+')
result = schema.validate(df)
FrameCheck validation errors:
- Column 'email' has values not matching regex '.+@.+\..+': ['bademail'].

Range & Bound Checks

df = pd.DataFrame({
    'age': [25, 17, 101],
    'score': [0.9, 0.5, 1.2],
    'signup_date': ['2021-01-01', '2019-12-31', '2023-05-01'],
    'last_login': ['2020-01-01', '2026-01-01', '2023-06-15']
})

schema = (
    FrameCheck()
    .column('age', type='int', min=18, max=99)
    .column('score', type='float', min=0.0, max=1.0)
    .column('signup_date', type='datetime', after='2020-01-01', before='2025-01-01')
    .column('last_login', type='datetime', min='2020-01-01', max='2025-01-01')
)
result = schema.validate(df)
FrameCheck validation errors:
- Column 'age' has values less than 18.
- Column 'age' has values greater than 99.
- Column 'score' has values greater than 1.0.
- Column 'signup_date' violates 'after' constraint: 2020-01-01.
- Column 'last_login' violates 'max' constraint: 2025-01-01.

columns(…) and columns_are(…)

Multiple column validation

df = pd.DataFrame({
    'a': [0, 1, 2],
    'b': [1, 0, 3],
    'c': [1, 1, 1]
})

schema = FrameCheck().columns(['a', 'b'], type='int', in_set=[0, 1])
result = schema.validate(df)
FrameCheck validation errors:
- Column 'a' contains values not in allowed set: [2].
- Column 'b' contains values not in allowed set: [3].

Column order match

df = pd.DataFrame({'b': [1], 'a': [2]})
schema = FrameCheck().columns_are(['a', 'b'])
result = schema.validate(df)
FrameCheck validation errors:
Expected columns in order: ['a', 'b']
Found columns in order: ['b', 'a']

custom_check(…)

df = pd.DataFrame({
    'score': [0.2, 0.95, 0.6],
    'flagged': [False, False, True]
})

schema = (
    FrameCheck()
    .column('score', type='float')
    .column('flagged', type='bool')
    .custom_check(
        lambda row: row['score'] <= 0.9 or row['flagged'] is True,
        description="flagged must be True when score > 0.9"
    )
)
result = schema.validate(df)
FrameCheck validation errors:
flagged must be True when score > 0.9 (failed on 1 row(s))

Other Checks

empty()

df = pd.DataFrame({'x': [1, 2]})
schema = FrameCheck().empty()
result = schema.validate(df)
FrameCheck validation errors:
DataFrame is expected to be empty but contains rows.

not_empty()

df = pd.DataFrame(columns=['a', 'b'])
schema = FrameCheck().not_empty()
result = schema.validate(df)
FrameCheck validation errors:
DataFrame is unexpectedly empty.

only_defined_columns()

df = pd.DataFrame({'a': [1], 'b': [2], 'extra': [999]})
schema = FrameCheck().column('a').column('b').only_defined_columns()
result = schema.validate(df)
FrameCheck validation errors:
Unexpected columns in DataFrame: ['extra']

row_count()

df = pd.DataFrame({'x': [1, 2]})
schema = FrameCheck().row_count(min=5)
result = schema.validate(df)
FrameCheck validation errors:
DataFrame must have at least 5 rows (found 2).

unique(…)

df = pd.DataFrame({
    'user_id': [1, 2, 2],
    'email': ['a@example.com', 'b@example.com', 'b@example.com']
})
schema = FrameCheck().unique()
result = schema.validate(df)
FrameCheck validation errors:
Rows are not unique.

Unique based on columns

df = pd.DataFrame({
    'user_id': [1, 2, 2],
    'email': ['a@example.com', 'b@example.com', 'c@example.com']
})
schema = FrameCheck().unique(columns=['user_id'])
result = schema.validate(df)
FrameCheck validation errors:
Rows are not unique based on columns: ['user_id']

validate()

df = pd.DataFrame({
    'score': [0.1, 0.5, 1.2]  # 1.2 exceeds the max
})

schema = FrameCheck().column('score', type='float', max=1.0)
result = schema.validate(df)

if not result.is_valid:
    print(result.summary())
FrameCheck validation errors:
- Column 'score' has values greater than 1.0.

get_invalid_rows()

df = pd.DataFrame({
    'a': [1, 2, -1],
    'b': [10, 20, 30]
})

schema = FrameCheck().column('a', type='int', min=0)
result = schema.validate(df)

if not result.is_valid:
    invalid_df = result.get_invalid_rows(df)
    print(invalid_df)
   a   b
2 -1  30

This is useful when you want to log, inspect, or export failing rows for debugging or downstream review.

Validation Comparison

This section compares how the same validation logic is expressed using three tools:

  • FrameCheck (concise, purpose-built for DataFrames)

  • Pandera (powerful, flexible, but not optimized for logging or row capture)

  • Pydantic (designed for model schemas, not native to pandas)

Shared Setup

import pandas as pd
from datetime import datetime

df = pd.DataFrame({
    'transaction_id': ['TXN1001', 'TXN1002', 'TXN1003', 'NUM9999'],
    'user_id': [501, 502, -1, 504],
    'transaction_time': ['2024-04-15 08:23:11', '2024-04-15 08:45:22', '2024-04-15 09:01:37', '2024-04-17 12:01:42'],
    'model_score': [0.03, 0.92, 0.95, 0.0],
    'model_version': ['v2.1.0', 'v2.1.0', 'v2.1.0', 'v2.1.0'],
    'flagged_for_review': [False, True, False, False]
})

FrameCheck (19 lines)

from framecheck import FrameCheck

result = (
    FrameCheck()
    .column('transaction_id', type='string', regex=r'^TXN\d{4,}$')
    .column('user_id', type='int', min=1)
    .column('transaction_time', type='datetime', before='now')
    .column('model_score', type='float', min=0.0, max=1.0)
    .column('model_score', type='float', not_in_set=[0.0], warn_only=True)
    .column('model_version', type='string')
    .column('flagged_for_review', type='bool')
    .custom_check(
        lambda row: row['model_score'] <= 0.9 or row['flagged_for_review'] is True,
        "flagged_for_review must be True when model_score > 0.9"
    )
    .not_null()
    .not_empty()
    .only_defined_columns()
    .validate(df)
)

print(result.summary())
Validation FAILED
3 error(s), 1 warning(s)
Errors:
  - Column 'user_id' has values less than 1.
  - Column 'transaction_id' has values not matching regex '^TXN\d{4,}$'.
  - flagged_for_review must be True when model_score > 0.9 (failed on 1 row(s))
Warnings:
  - Column 'model_score' contains disallowed values: [0.0].

Pandera (with row capture added manually)

import pandera as pa
from pandera import Column, Check, DataFrameSchema

df['transaction_time'] = pd.to_datetime(df['transaction_time'])

schema = DataFrameSchema({
    "transaction_id": Column(str, Check.str_matches(r"^TXN\d{4,}$"), nullable=False),
    "user_id": Column(int, Check.ge(1), nullable=False),
    "transaction_time": Column(pa.Timestamp, Check(lambda s: s < datetime.now()), nullable=False),
    "model_score": Column(float, Check.in_range(0.0, 1.0), nullable=False),
    "model_version": Column(str, nullable=False),
    "flagged_for_review": Column(bool, nullable=False),
}, checks=[
    Check(
        lambda df: (df['model_score'] <= 0.9) | (df['flagged_for_review'] == True),
        element_wise=False,
        error="flagged_for_review must be True when model_score > 0.9"
    )
], strict=True)

if df.empty:
    raise pa.errors.SchemaError("DataFrame is unexpectedly empty")

if not df[df['model_score'] == 0.0].empty:
    print("Warning: model_score == 0.0 found")

try:
    validated_df = schema.validate(df)
except pa.errors.SchemaErrors as e:
    print("Pandera errors:")
    print(e.failure_cases[['column', 'failure_case', 'index']])
Warning: model_score == 0.0 found
---------------------------------------------------------------------------
SchemaError                               Traceback (most recent call last)
<ipython-input-25-d8d5f408d13b> in <cell line: 0>()
     26
     27 try:
---> 28     validated_df = schema.validate(df)
     29 except pa.errors.SchemaErrors as e:
     30     print("Pandera errors:")

13 frames
/usr/local/lib/python3.11/dist-packages/pandera/api/base/error_handler.py in collect_error(self, error_type, reason_code, schema_error, original_exc)
     52         """
     53         if not self._lazy:
---> 54             raise schema_error from original_exc
     55
     56         # delete data of validated object from SchemaError object to prevent

SchemaError: Column 'transaction_id' failed element-wise validator number 0: str_matches('^TXN\\d{4,}$') failure cases: NUM9999

Note

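By default, Pandera raises only the first failure it encounters during validation. In this case, transaction_id='NUM9999' violates the regex constraint and halts validation, so other issues, like user_id=-1, are not reported until the first error is resolved. Because the exception raised is a SchemaError rather than the SchemaErrors caught in the except clause above, the handler never runs and the traceback is shown instead; Pandera only collects every failure into a SchemaErrors exception when schema.validate is called with lazy=True. This differs from FrameCheck, which collects all validation issues in a single pass.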

Pydantic (manual row iteration)

from pydantic import BaseModel, field_validator, model_validator
from typing import ClassVar
import re, logging

logger = logging.getLogger()

class ModelOutput(BaseModel):
    """Row-level schema for one scored transaction, plus a DataFrame driver.

    Field validators each enforce a single-column rule, ``check_flagged``
    enforces the cross-field rule, and ``validate_df`` applies the model to
    every row of a DataFrame and gathers the failures.
    """

    transaction_id: str
    user_id: int
    transaction_time: datetime
    model_score: float
    model_version: str
    flagged_for_review: bool

    # Column names the DataFrame must carry; compared as a set in validate_df.
    expected_columns: ClassVar[set] = {
        'transaction_id', 'user_id', 'transaction_time',
        'model_score', 'model_version', 'flagged_for_review'
    }

    @field_validator('transaction_id')
    @classmethod
    def validate_txn(cls, v):
        """Raise ValueError unless the id matches the TXN pattern."""
        matched = re.match(r'^TXN\d{4,}$', v)
        if matched is None:
            raise ValueError("transaction_id must match TXN format")
        return v

    @field_validator('user_id')
    @classmethod
    def validate_uid(cls, v):
        """Raise ValueError for user ids below 1."""
        if v >= 1:
            return v
        raise ValueError("user_id must be positive")

    @field_validator('transaction_time')
    @classmethod
    def validate_time(cls, v):
        """Raise ValueError for timestamps later than the current time."""
        current_time = datetime.now()
        if v > current_time:
            raise ValueError("transaction_time must be before now")
        return v

    @field_validator('model_score')
    @classmethod
    def validate_score(cls, v):
        """Require a score within [0, 1]; log a warning when it is exactly 0.0."""
        within_bounds = 0.0 <= v <= 1.0
        if not within_bounds:
            raise ValueError("model_score must be in [0,1]")
        if v == 0.0:
            # A zero score is allowed but reported as a warning, not an error.
            logger.warning("model_score == 0.0 found")
        return v

    @model_validator(mode='after')
    def check_flagged(self):
        """Require flagged_for_review whenever model_score exceeds 0.9."""
        high_score = self.model_score > 0.9
        if high_score and not self.flagged_for_review:
            raise ValueError("flagged_for_review must be True when score > 0.9")
        return self

    @classmethod
    def validate_df(cls, df):
        """Return a list of error strings for the frame and each failing row."""
        problems = []

        # Frame-level checks first: emptiness and the exact column set.
        if df.empty:
            problems.append("DataFrame is empty")
        if cls.expected_columns != set(df.columns):
            problems.append("Unexpected columns")

        # Row-level checks: validate each row and record any failure by index.
        for row_label, record in df.iterrows():
            try:
                cls.model_validate(record.to_dict())
            except Exception as exc:
                problems.append(f"Row {row_label}: {exc}")
        return problems

errors = ModelOutput.validate_df(df)
if errors:
    print("Pydantic validation errors:")
    for e in errors:
        print(e)
WARNING:root:model_score == 0.0 found
Pydantic validation errors:
Row 2: 1 validation error for ModelOutput
__root__
  flagged_for_review must be True when score > 0.9 (type=value_error)
Row 3: 1 validation error for ModelOutput
transaction_id
  transaction_id must match TXN format (type=value_error)
Row 2: 1 validation error for ModelOutput
user_id
  user_id must be positive (type=value_error)

Serialization and Persistence

Saving/Loading Schemas

import pandas as pd
from framecheck import FrameCheck, register_check_function

# Create a validator
validator = (
   FrameCheck()
   .column('user_id', type='string', regex=r'^U\d+$')
   .column('age', type='int', min=18)
   .not_null()
)

# Save to a file
validator.save('user_schema.json')

# Later, load it back
loaded_validator = FrameCheck.load('user_schema.json')
result = loaded_validator.validate(df)

to_dict() and from_dict()

# Convert to a dictionary representation
schema_dict = validator.to_dict()
print(schema_dict)  # Dictionary with all check information

# Create a validator from a dictionary
restored = FrameCheck.from_dict(schema_dict)
result = restored.validate(df)

to_json() and from_json()

# Convert to a JSON string
schema_json = validator.to_json()
print(schema_json)  # JSON string with all check information

# Create a validator from JSON
restored = FrameCheck.from_json(schema_json)
result = restored.validate(df)

info()

# Get a dictionary with all checks
details = validator.info()
print(f"Number of column checks: {len(details['column_checks'])}")
print(f"Number of dataframe checks: {len(details['dataframe_checks'])}")

Registered Check Functions

from framecheck import FrameCheck, register_check_function

# Register a reusable check function
@register_check_function(name="custom_age_check")
def validate_age_vs_income(row):
    """Return False when age is under 25 and income is above 100000, else True."""
    # Suspicious combination: very young with very high income.
    is_suspicious = row['age'] < 25 and row['income'] > 100000
    return not is_suspicious

# Use the registered check by name
validator = (
    FrameCheck()
    .column('age', type='int')
    .column('income', type='float')
    .registered_check('custom_age_check', "Age/income relationship check")
)

result = validator.validate(df)