Other examples

This section has some further example parsers that you can study. There are also examples in the Tutorial and in Generating a parser.

SQL SELECT statement parser

This shows a very simplified parser for a SQL SELECT statement, using custom data structures, and the convenient keyword argument syntax for seq(), followed by Parser.combine_dict().

# A very limited parser for SQL SELECT statements,
# for demo purposes. Supports:
# 1. A simple list of columns (or number/string literals)
# 2. A simple table name
# 3. An optional where condition,
#    which has the form of 'A op B' where A and B are columns, strings or number,
#    and op is a comparison operator
#
# We demonstrate the use of `map` to create AST nodes with a single arg,
# and `seq` for AST nodes with more than one arg.

import enum
from dataclasses import dataclass
from typing import List, Optional, Union

from parsy import from_enum, regex, seq, string

# -- AST nodes:


class Operator(enum.Enum):
    EQ = "="
    LT = "<"
    GT = ">"
    LTE = "<="
    GTE = ">="


@dataclass
class Number:
    value: int


@dataclass
class String:
    value: str


@dataclass
class Field:
    name: str


@dataclass
class Table:
    name: str


ColumnExpression = Union[Field, String, Number]


@dataclass
class Comparison:
    left: ColumnExpression
    operator: Operator
    right: ColumnExpression


@dataclass
class Select:
    columns: List[ColumnExpression]
    table: Table
    where: Optional[Comparison]


# -- Parsers:

number_literal = regex(r"-?[0-9]+").map(int).map(Number)

# We don't support ' in strings or escaping for simplicity
string_literal = regex(r"'[^']*'").map(lambda s: String(s[1:-1]))

identifier = regex("[a-zA-Z][a-zA-Z0-9_]*")

field = identifier.map(Field)

table = identifier.map(Table)

space = regex(r"\s+")  # non-optional whitespace
padding = regex(r"\s*")  # optional whitespace

column_expr = field | string_literal | number_literal

operator = from_enum(Operator)

comparison = seq(
    left=column_expr << padding,
    operator=operator,
    right=padding >> column_expr,
).combine_dict(Comparison)

SELECT = string("SELECT")
FROM = string("FROM")
WHERE = string("WHERE")

# Here we demonstrate use of leading underscore to discard parts we don't want,
# which is more readable and convenient than `<<` and `>>` sometimes.
select = seq(
    _select=SELECT + space,
    columns=column_expr.sep_by(padding + string(",") + padding, min=1),
    _from=space + FROM + space,
    table=table,
    where=(space >> WHERE >> space >> comparison).optional(),
    _end=padding + string(";"),
).combine_dict(Select)


# Run these tests with pytest:


def test_select():
    assert select.parse("SELECT thing, stuff, 123, 'hello' FROM my_table WHERE id = 1;") == Select(
        columns=[
            Field("thing"),
            Field("stuff"),
            Number(123),
            String("hello"),
        ],
        table=Table("my_table"),
        where=Comparison(
            left=Field("id"),
            operator=Operator.EQ,
            right=Number(1),
        ),
    )


def test_optional_where():
    assert select.parse("SELECT 1 FROM x;") == Select(
        columns=[Number(1)],
        table=Table("x"),
        where=None,
    )

JSON parser

A full parser for JSON. (This will not be competitive in terms of performance with other implementations!)

This demonstrates the use of forward_declaration, needed due to the circular definition of json_value.

from parsy import forward_declaration, regex, seq, string

# Utilities
whitespace = regex(r"\s*")
lexeme = lambda p: p << whitespace

# Punctuation
lbrace = lexeme(string("{"))
rbrace = lexeme(string("}"))
lbrack = lexeme(string("["))
rbrack = lexeme(string("]"))
colon = lexeme(string(":"))
comma = lexeme(string(","))

# Primitives
true = lexeme(string("true")).result(True)
false = lexeme(string("false")).result(False)
null = lexeme(string("null")).result(None)
number = lexeme(regex(r"-?(0|[1-9][0-9]*)([.][0-9]+)?([eE][+-]?[0-9]+)?")).map(float)
string_part = regex(r'[^"\\]+')
string_esc = string("\\") >> (
    string("\\")
    | string("/")
    | string('"')
    | string("b").result("\b")
    | string("f").result("\f")
    | string("n").result("\n")
    | string("r").result("\r")
    | string("t").result("\t")
    | regex(r"u[0-9a-fA-F]{4}").map(lambda s: chr(int(s[1:], 16)))
)
quoted = lexeme(string('"') >> (string_part | string_esc).many().concat() << string('"'))

# Data structures
json_value = forward_declaration()
object_pair = seq(quoted << colon, json_value).map(tuple)
json_object = lbrace >> object_pair.sep_by(comma).map(dict) << rbrace
array = lbrack >> json_value.sep_by(comma) << rbrack

# Everything
json_value.become(quoted | number | json_object | array | true | false | null)
json_doc = whitespace >> json_value


def test():
    assert (
        json_doc.parse(
            r"""
    {
        "int": 1,
        "string": "hello",
        "a list": [1, 2, 3],
        "escapes": "\n \u24D2",
        "nested": {"x": "y"},
        "other": [true, false, null]
    }
"""
        )
        == {
            "int": 1,
            "string": "hello",
            "a list": [1, 2, 3],
            "escapes": "\n ⓒ",
            "nested": {"x": "y"},
            "other": [True, False, None],
        }
    )


if __name__ == "__main__":
    from sys import stdin

    print(repr(json_doc.parse(stdin.read())))