Skip to content

[Data Liberation] EBNF processor #1981

@adamziel

Description

@adamziel

I started drafting an EBNF notation processor that could enable building parsers from a grammar file, similar to the MySQL parser (although that one is based on an ANTLR grammar). I don't have any specific action for this issue; I only wanted to dump the code somewhere it would be searchable in the future. CC @JanJakes

<?php

/**
 * Pull-style processor for a grammar written in an EBNF notation that uses
 * `::=` between a rule name and its definition and `//` for line comments.
 *
 * This is an unfinished draft: rule names are recognized, but rule bodies are
 * not parsed yet. Until that is implemented, next_rule() reports the missing
 * feature through get_last_error() instead of terminating the PHP process.
 *
 * Failures follow one convention throughout the class: the failing method
 * stores a message in $last_error and returns false.
 */
class EBNF_Processor {
    const STATE_READY = 0;
    const STATE_FINISHED = 1;

    /** The complete grammar source, as passed to the constructor. */
    private $ebnf;

    /** Byte offset into $ebnf of the next byte to examine. */
    private $bytes_already_parsed = 0;

    /** Message describing the most recent failure, or null when none occurred. */
    private $last_error;

    /** Name of the rule most recently recognized by parse_rule_name(). */
    private $rule_name;

    /** Not populated by any code in this draft. */
    private $rules = [];

    /** One of the STATE_* constants. */
    private $state = self::STATE_READY;

    /**
     * @param string $ebnf Grammar source to process.
     */
    public function __construct($ebnf) {
        $this->ebnf = $ebnf;
    }

    /**
     * @return string|null The most recent error message, or null if no error occurred.
     */
    public function get_last_error() {
        return $this->last_error;
    }

    /**
     * @return bool Whether the end of the input was reached without an error.
     */
    public function is_finished() {
        return $this->state === self::STATE_FINISHED;
    }

    /**
     * @return string|null Name of the most recently recognized rule, or null
     *                     if no rule name has been recognized yet.
     */
    public function get_rule_name() {
        return $this->rule_name;
    }

    /**
     * Advances to the next rule in the grammar.
     *
     * NOTE: parse_branches() is still a stub that always fails, so in this
     * draft the method never returns true. The success path is kept so it
     * starts working as soon as body parsing is implemented.
     *
     * @return bool True when a rule was parsed. False at the end of the input
     *              (is_finished() becomes true) or on failure (get_last_error()
     *              returns the reason).
     */
    public function next_rule() {
        if ($this->is_finished() || null !== $this->last_error) {
            return false;
        }

        if (false === $this->skip_comments()) {
            return false;
        }
        if (false === $this->parse_rule_name()) {
            return false;
        }
        if (false === $this->parse_branches()) {
            return false;
        }

        return true;
    }

    /**
     * Parses the body of a rule, starting right after its `::=`.
     *
     * Unfinished stub: it only detects a missing body and otherwise reports
     * that body parsing is not implemented.
     *
     * @return bool Always false in this draft; $last_error holds the reason.
     */
    private function parse_branches() {
        $at = $this->bytes_already_parsed;
        $at += $this->skip_whitespace($at);
        if ($at >= strlen($this->ebnf)) {
            $this->last_error = 'Unexpected end of input';
            return false;
        }

        switch ($this->ebnf[$at]) {
            case '(':
                // TODO: presumably the start of a group; not handled yet.
                break;
            case '"':
            case "'":
                // TODO: presumably the start of a quoted literal; not handled yet.
                break;
        }

        $this->last_error = 'Parsing rule bodies is not implemented yet';
        return false;
    }

    /**
     * Measures the run of spaces and tabs that starts at the given offset.
     *
     * Despite the name, this does not move the cursor; callers add the
     * returned length to their own offset.
     *
     * @param int $at Byte offset into the grammar source.
     * @return int Number of consecutive space/tab bytes found at $at.
     */
    private function skip_whitespace($at) {
        return strspn($this->ebnf, " \t", $at);
    }

    /**
     * Moves the cursor past any whitespace, blank lines, and `//` comments.
     *
     * @return bool True when more input follows. False when the end of the
     *              input was reached, in which case the processor is marked
     *              as finished.
     */
    private function skip_comments() {
        $length = strlen($this->ebnf);

        while (true) {
            // Comments and rules may be separated by indentation and blank lines.
            $this->bytes_already_parsed += strspn($this->ebnf, " \t\r\n", $this->bytes_already_parsed);
            $at = $this->bytes_already_parsed;

            if ($at >= $length) {
                $this->state = self::STATE_FINISHED;
                return false;
            }

            $is_comment = (
                $at + 1 < $length &&
                '/' === $this->ebnf[$at] &&
                '/' === $this->ebnf[$at + 1]
            );
            if (!$is_comment) {
                return true;
            }

            // A comment on the last line may lack a trailing newline. strpos()
            // returns false then, and "false + 1" would rewind the cursor to
            // byte 1, so jump to the end of the input explicitly instead.
            $newline_at = strpos($this->ebnf, "\n", $at);
            $this->bytes_already_parsed = false === $newline_at ? $length : $newline_at + 1;
        }
    }

    /**
     * Parses a rule name followed by `::=` at the cursor.
     *
     * A rule name is a non-empty run of ASCII letters and digits. On success
     * the name is stored in $rule_name and the cursor is moved right after
     * the `::=`. On failure the cursor is left untouched.
     *
     * @return bool True on success, false on failure ($last_error is set).
     */
    private function parse_rule_name() {
        $at = $this->bytes_already_parsed;
        // Skip whitespace at the beginning of the rule.
        $at += $this->skip_whitespace($at);

        $rule_name_length = strspn(
            $this->ebnf,
            'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789',
            $at
        );
        if (0 === $rule_name_length) {
            $this->last_error = 'Invalid rule name';
            return false;
        }
        $this->rule_name = substr($this->ebnf, $at, $rule_name_length);
        $at += $rule_name_length;
        $at += $this->skip_whitespace($at);

        // The three bytes of '::=' occupy offsets $at .. $at + 2, so the
        // input is only too short when fewer than $at + 3 bytes exist.
        if (strlen($this->ebnf) < $at + 3) {
            $this->last_error = 'Unexpected end of input';
            return false;
        }
        if ('::=' !== substr($this->ebnf, $at, 3)) {
            $this->last_error = 'Expected "::="';
            return false;
        }

        $this->bytes_already_parsed = $at + 3;
        return true;
    }
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions