Skip to content

Commit

Permalink
Update parsers
Browse files Browse the repository at this point in the history
This includes the updates from WordPress/wordpress.org#90, a new AttributeParser for attribute-only blocks, and a fix for the ListItem block to allow child lists.
Fixes #211
  • Loading branch information
ryelle committed Apr 3, 2023
1 parent 22e2b25 commit 2b9149d
Show file tree
Hide file tree
Showing 10 changed files with 300 additions and 144 deletions.
165 changes: 131 additions & 34 deletions env/export-content/includes/parser.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,54 +3,119 @@
namespace WordPress_org\Main_2022\ExportToPatterns;

require_once __DIR__ . '/parsers/BlockParser.php';
require_once __DIR__ . '/parsers/AttributeParser.php';
require_once __DIR__ . '/parsers/BasicText.php';
require_once __DIR__ . '/parsers/Button.php';
require_once __DIR__ . '/parsers/Heading.php';
require_once __DIR__ . '/parsers/HTMLParser.php';
require_once __DIR__ . '/parsers/ListItem.php';
require_once __DIR__ . '/parsers/Noop.php';
require_once __DIR__ . '/parsers/Paragraph.php';

// Unused.
require_once __DIR__ . '/parsers/ShortcodeBlock.php';
require_once __DIR__ . '/parsers/TextNode.php';

class BlockParser {
public $pattern;
public $content;
public $parsers = [];
public $fallback;

public function __construct() {
$this->parsers = [
// Core blocks that have custom parsers.
'core/paragraph' => new Parsers\Paragraph(),
public function __construct( string $content = '' ) {
$this->content = $content;
$this->fallback = new Parsers\BasicText();
$this->parsers = [
// Blocks that have custom parsers.
'core/paragraph' => new Parsers\HTMLParser( 'p' ),
'core/image' => new Parsers\HTMLParser( 'figcaption', [ 'alt', 'title' ] ),
'core/heading' => new Parsers\HTMLRegexParser( '/h[1-6]/' ),

'core/list-item' => new Parsers\ListItem(),
'core/heading' => new Parsers\Heading(),
'core/button' => new Parsers\Button(),
//'core/button' => new Parsers\Button(),
//'core/buttons' => new Parsers\BasicText(),
'core/button' => new Parsers\HTMLParser( 'a', [ 'title' ] ),

// Attributes handler.
'core/navigation-link' => new Parsers\AttributeParser( [ 'label' ] ),
'core/social-link' => new Parsers\AttributeParser( [ 'label' ] ),

// Generic shortcode handler.
'core/shortcode' => new Parsers\ShortcodeBlock(),

'core/spacer' => new Parsers\Noop(),
// These contain other blocks to be parsed.
'core/column' => new Parsers\Noop(),
'core/columns' => new Parsers\Noop(),
'core/group' => new Parsers\Noop(),
'core/quote' => new Parsers\Noop(),

// Common core blocks that use the default parser.
'core/buttons' => new Parsers\BasicText(),
'core/list' => new Parsers\BasicText(),
'core/column' => new Parsers\BasicText(),
'core/columns' => new Parsers\BasicText(),
'core/cover' => new Parsers\BasicText(),
'core/group' => new Parsers\BasicText(),
'core/image' => new Parsers\BasicText(),
'core/media-text' => new Parsers\BasicText(),
'core/separator' => new Parsers\BasicText(),
'core/social-link' => new Parsers\BasicText(),
];
}

$this->fallback = new Parsers\BasicText();
/**
 * Collect the translatable strings for a post.
 *
 * Starts with the strings found in the post's block content, then appends the
 * post title, the excerpt, and the values of any post meta keys supplied through
 * the `translatable_post_meta` filter.
 *
 * @param object $post Post object; `post_content`, `post_title`, `post_excerpt` and `ID` are read.
 * @return array List of strings. Title, excerpt and meta values are appended after the
 *               content strings and are not de-duplicated against them.
 */
public static function post_to_strings( $post ) {
	// TODO: Detect post using a block template, pull strings from there.
	$self    = new self( $post->post_content );
	$strings = $self->to_strings();

	if ( $post->post_title ) {
		$strings[] = $post->post_title;
	}
	if ( $post->post_excerpt ) {
		$strings[] = $post->post_excerpt;
	}

	$post_meta_to_include = apply_filters( 'translatable_post_meta', [] );
	foreach ( (array) $post_meta_to_include as $meta_key ) {
		$meta_value = get_post_meta( $post->ID, $meta_key, true );

		// get_post_meta() yields '' for a missing key and can yield arrays for
		// serialized meta; only non-empty strings are usable as translation sources.
		if ( is_string( $meta_value ) && '' !== $meta_value ) {
			$strings[] = $meta_value;
		}
	}

	return $strings;
}

/**
 * Translate a post's content, title and excerpt.
 *
 * Each field is passed through the translation step and only overwritten when
 * that step produces a truthy result; otherwise the existing value is kept.
 *
 * @param object   $post               Post object; modified and returned.
 * @param callable $callback_translate Receives a source string, returns its translation.
 * @return object The same post object.
 */
public static function translate_post( $post, callable $callback_translate ) {
	$translated_content = self::translate_blocks( $post->post_content, $callback_translate );
	if ( $translated_content ) {
		$post->post_content = $translated_content;
	}

	// Title first, then excerpt: same call order as the content/title/excerpt sequence above.
	foreach ( [ 'post_title', 'post_excerpt' ] as $field ) {
		$translated_field = $callback_translate( $post->$field );
		if ( $translated_field ) {
			$post->$field = $translated_field;
		}
	}

	return $post;
}

public function replace_with_i18n( string $content ) : string {
$strings = $this->to_strings( $content );
$i18n_strings = [];
public static function translate_blocks( string $content, callable $callback_translate ) /*: bool|string*/ {
$self = new self( $content );

$translations = [];
$translated = false;
$strings = $self->to_strings();

foreach ( $strings as $string ) {
$i18n_strings[ $string ] = sprintf( "<?php _e( '%s', 'wporg' ); ?>", str_replace( "'", '&#039;', $string ) );
$translations[ $string ] = $callback_translate( $string );

$translated = $translated || ( $string !== $translations[ $string ] );
}
return $this->replace_strings( $content, $i18n_strings );

// Are there any translations?
if ( ! $translated ) {
return false;
}

return $self->replace_strings_with_kses( $translations );
}

/**
 * Translate the strings of a single parsed block.
 *
 * Picks the parser registered for the block's name (or the fallback parser),
 * extracts that block's own strings, translates each through the callback and
 * returns the first `innerContent` entry of the rewritten block.
 *
 * @param string   $content            Original block markup; returned when nothing can be translated.
 * @param array    $block              Parsed block array (`blockName`, `innerContent`, ...).
 * @param callable $callback_translate Receives a source string, returns its translation.
 * @return string Translated markup, or `$content` when there is nothing to replace.
 */
public static function translate_block( string $content, $block, callable $callback_translate ) /* :bool|string */ {
	$self = new self();

	// Blocks without a name (e.g. freeform content) carry a null blockName; look
	// them up under '' explicitly instead of using null as an array key.
	$parser  = $self->parsers[ $block['blockName'] ?? '' ] ?? $self->fallback;
	$strings = $parser->to_strings( $block ); // does not do innerBlocks, intentionally.

	if ( ! $strings ) {
		return $content;
	}

	$replacements = [];
	foreach ( $strings as $string ) {
		// Keep the source string when the callback has no translation for it.
		$replacements[ $string ] = $callback_translate( $string ) ?: $string;
	}

	$block = $parser->replace_strings( $block, $replacements );

	// innerContent may be empty or start with a null placeholder; fall back to the
	// original markup rather than reading an undefined offset.
	return ( $block['innerContent'][0] ?? '' ) ?: $content;
}

public function block_parser_to_strings( array $block ) : array {
Expand All @@ -62,7 +127,7 @@ public function block_parser_to_strings( array $block ) : array {
$strings = array_merge( $strings, $this->block_parser_to_strings( $inner_block ) );
}

return $strings;
return array_unique( $strings );
}

public function block_parser_replace_strings( array &$block, array $replacements ) : array {
Expand All @@ -76,21 +141,31 @@ public function block_parser_replace_strings( array &$block, array $replacements
return $block;
}

public function to_strings( string $content ) : array {
$blocks = parse_blocks( $content );

/**
 * Extract the unique translatable strings from this parser's content.
 *
 * The content is parsed into blocks and each top-level block is handed to
 * block_parser_to_strings(); the per-block results are merged in block order
 * and de-duplicated.
 *
 * @return array Unique strings (keys are whatever array_unique() leaves behind).
 */
public function to_strings() : array {
	$append_block_strings = function ( $carry, $block ) {
		return array_merge( $carry, $this->block_parser_to_strings( $block ) );
	};

	$merged = array_reduce( parse_blocks( $this->content ), $append_block_strings, [] );

	return array_unique( $merged );
}

public function replace_strings( string $content, array $replacements ) : string {
$blocks = parse_blocks( $content );
/**
 * Replace strings in the content after sanitizing every replacement value.
 *
 * Each replacement is run through wp_kses_post() before being injected into
 * blocks and block attributes; the keys (source strings) are left untouched.
 *
 * @param array $replacements Map of source string => replacement string.
 * @return string The content with replacements applied.
 */
public function replace_strings_with_kses( array $replacements ) : string {
	$sanitized_replacements = array_map(
		static function ( $replacement ) {
			return wp_kses_post( $replacement );
		},
		$replacements
	);

	return $this->replace_strings( $sanitized_replacements );
}

public function replace_strings( array $replacements ) : string {
$translated = $this->content;

$blocks = parse_blocks( $translated );
foreach ( $blocks as &$block ) {
$block = $this->block_parser_replace_strings( $block, $replacements );
}
Expand All @@ -100,9 +175,9 @@ public function replace_strings( string $content, array $replacements ) : string
// "subscribePlaceholder":"😀" becomes "subscribePlaceholder":"\ud83d\ude00".
// After we get the serialized blocks back from `serialize_blocks` we need to convert these
// characters back to their unicode form so that we don't break blocks in the editor.
$new_content = $this->decode_unicode_characters( serialize_blocks( $blocks ) );
$translated = $this->decode_unicode_characters( serialize_blocks( $blocks ) );

return $new_content;
return $translated;
}

/**
Expand Down Expand Up @@ -144,6 +219,28 @@ function ( $matches ) use ( $excluded_characters ) {
$string
);

// Decode < & > if they're part of a PHP tag.
$decoded_string = str_replace( [ '\\u003c?', '?\\u003e' ], [ '<?', '?>' ], $decoded_string );

return $decoded_string;
}
}

/**
 * Helper function to replace all strings in content with i18n-wrapped strings.
 *
 * Every string found by the BlockParser is swapped for a `<?php _e( '...', '...' ); ?>`
 * call so the content can be used as a translatable PHP template.
 *
 * @param string $content    Block markup to process.
 * @param string $textdomain Text domain written into each `_e()` call.
 * @return string The content with each string wrapped in an `_e()` call.
 */
function replace_with_i18n( string $content, string $textdomain = 'wporg' ) : string {
	$parser  = new BlockParser( $content );
	$strings = $parser->to_strings();

	// The text domain ends up inside a single-quoted PHP literal as well, so both
	// backslashes and single quotes must be escaped.
	$quoted_domain = addcslashes( $textdomain, "'\\" );

	$i18n_strings = [];
	foreach ( $strings as $string ) {
		// Backslashes are doubled first: in a single-quoted literal a trailing `\`
		// would otherwise escape the closing quote and emit invalid PHP. Single
		// quotes are still converted to the `&#039;` entity, as before.
		$quoted_string = str_replace( [ '\\', "'" ], [ '\\\\', '&#039;' ], $string );

		$i18n_strings[ $string ] = sprintf(
			"<?php _e( '%s', '%s' ); ?>",
			$quoted_string,
			$quoted_domain
		);
	}

	return $parser->replace_strings( $i18n_strings );
}
31 changes: 31 additions & 0 deletions env/export-content/includes/parsers/AttributeParser.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<?php

namespace WordPress_org\Main_2022\ExportToPatterns\Parsers;

/**
 * Parser for blocks whose translatable text lives in block attributes only.
 *
 * Reading and writing of attributes is delegated to get_attribute() and
 * set_attribute() from the GetSetAttribute trait (not shown in this file).
 */
class AttributeParser implements BlockParser {
	use GetSetAttribute;

	/**
	 * Names of the block attributes this parser handles.
	 *
	 * @var array
	 */
	public $attributes = [];

	/**
	 * @param array|string $attributes One attribute name or a list of names; cast to an array.
	 */
	public function __construct( $attributes = array() ) {
		$this->attributes = (array) $attributes;
	}

	/**
	 * Gather the strings held in the configured attributes of a block.
	 *
	 * @param array $block Parsed block array.
	 * @return array Merged results of get_attribute() for each configured attribute.
	 */
	public function to_strings( array $block ) : array {
		return array_reduce(
			$this->attributes,
			function ( array $found, $name ) use ( $block ) {
				return array_merge( $found, $this->get_attribute( $name, $block ) );
			},
			[]
		);
	}

	/**
	 * Apply replacements to each configured attribute of a block.
	 *
	 * NOTE(review): set_attribute() presumably updates $block by reference, since its
	 * return value is not used here — confirm against the GetSetAttribute trait.
	 *
	 * @param array $block        Parsed block array.
	 * @param array $replacements Map of source string => replacement string.
	 * @return array The block after all attributes were processed.
	 */
	public function replace_strings( array $block, array $replacements ) : array {
		foreach ( $this->attributes as $name ) {
			$this->set_attribute( $name, $block, $replacements );
		}

		return $block;
	}
}
2 changes: 1 addition & 1 deletion env/export-content/includes/parsers/BasicText.php
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public function replace_strings( array $block, array $replacements ) : array {
// unclosed HTML tags, and saveHTML adds extra closed tags.
foreach ( $text_nodes as $text ) {
if ( trim( $text->nodeValue ) && isset( $replacements[ $text->nodeValue ] ) ) {
$regex = '#(<([^>]*)>)?' . preg_quote( $text->nodeValue, '/' ) . '(<([^>]*)>)?#is';
$regex = '#(<([^>]*)>)?' . preg_quote( $text->nodeValue, '#' ) . '(<([^>]*)>)?#s';
$inner_content = preg_replace( $regex, '${1}' . $replacements[ $text->nodeValue ] . '${3}', $inner_content );
}
}
Expand Down
19 changes: 1 addition & 18 deletions env/export-content/includes/parsers/Button.php
Original file line number Diff line number Diff line change
Expand Up @@ -46,24 +46,7 @@ public function replace_strings( array $block, array $replacements ) : array {
}
}

// Replace shortcodes in `href`, as these are url-encoded by `saveHTML()`.
// This is probably overkill since there should only be one `a` per button,
// but just to be safe, we loop over any `a`s found.
$elements = $dom->getElementsByTagName( 'a' );
$replacements = [];
foreach ( $elements as $element ) {
$link = $element->getAttribute( 'href' );
// If we find a shortcode in the URL, save it.
if ( $link && preg_match( '/\[.*\]/', $link ) ) {
$replacements[ urlencode( $link ) ] = $link;
}
}

$html = $this->removeHtml( $dom->saveHTML() );
if ( count( $replacements ) ) {
$html = str_replace( array_keys( $replacements ), array_values( $replacements ), $html );
}
$decoded_html = $this->decode_tags( $html );
$decoded_html = $this->decode_tags( $this->removeHtml( $dom->saveHTML() ) );
$block['innerHTML'] = $decoded_html;
$block['innerContent'] = [ $decoded_html ];

Expand Down
95 changes: 95 additions & 0 deletions env/export-content/includes/parsers/HTMLParser.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
<?php

namespace WordPress_org\Main_2022\ExportToPatterns\Parsers;

/**
 * Regex-based parser that pulls translatable strings out of a block's innerHTML.
 *
 * Strings come from three places: the block's `placeholder` attribute, the
 * contents of the configured HTML tags, and the values of the configured HTML
 * attributes (single- or double-quoted).
 */
class HTMLParser implements BlockParser {
	use GetSetAttribute;

	/**
	 * Tag names whose contents are extracted.
	 *
	 * @var array
	 */
	public $tags = [];

	/**
	 * HTML attribute names whose values are extracted.
	 *
	 * @var array
	 */
	public $attributes = [];

	/**
	 * @param array|string $tags       One tag name or a list of names; cast to an array.
	 * @param array|string $attributes One attribute name or a list of names; cast to an array.
	 */
	public function __construct( $tags = array(), $attributes = array() ) {
		$this->tags       = (array) $tags;
		$this->attributes = (array) $attributes;
	}

	/**
	 * Extract the translatable strings of a block.
	 *
	 * @param array $block Parsed block array; `innerHTML` is read.
	 * @return array Strings found, in the order: placeholder, tag contents, attribute values.
	 */
	public function to_strings( array $block ) : array {
		$strings = $this->get_attribute( 'placeholder', $block );
		$html    = $block['innerHTML'];

		foreach ( $this->tags as $tag ) {
			$tag = $this->escape_tag( $tag, '#' );

			// `\b` stops a short tag name from matching a longer one (`p` vs `<pre>`,
			// `a` vs `<abbr>`); the back-reference makes the closing tag match the
			// opening one, which matters for regex tags such as `h[1-6]`.
			$pattern = sprintf( '#<(?P<tag>%1$s)\b[^>]*>\s*(?P<string>.+?)\s*</(?P=tag)>#is', $tag );

			if ( preg_match_all( $pattern, $html, $matches ) ) {
				$strings = array_merge( $strings, $matches['string'] );
			}
		}

		foreach ( $this->attributes as $attr ) {
			// The look-behind keeps `title` from matching inside `data-title`,
			// `alt` inside `data-alt`, and so on.
			$attr = '(?<![\w-])' . $this->escape_attr( $attr, '#' );

			if (
				str_contains( $html, "='" ) &&
				preg_match_all( "#{$attr}='(?P<string>[^']+?)'#is", $html, $matches )
			) {
				$strings = array_merge( $strings, $matches['string'] );
			}

			if (
				str_contains( $html, '="' ) &&
				preg_match_all( "#{$attr}=\"(?P<string>[^\"]+?)\"#is", $html, $matches )
			) {
				$strings = array_merge( $strings, $matches['string'] );
			}
		}

		return $strings;
	}

	// todo: this needs a fix to properly rebuild innerContent - see ParagraphParserTest
	/**
	 * Replace the block's strings with their replacements.
	 *
	 * The `placeholder` attribute is handled through set_attribute(); everything
	 * else is replaced textually in innerHTML wherever the original string sits
	 * between a `>`/quote and a quote/`<`.
	 *
	 * @param array $block        Parsed block array.
	 * @param array $replacements Map of source string => replacement string.
	 * @return array The block with `innerHTML` and `innerContent` rewritten.
	 */
	public function replace_strings( array $block, array $replacements ) : array {
		$this->set_attribute( 'placeholder', $block, $replacements );

		$html = $block['innerHTML'];

		foreach ( $this->to_strings( $block ) as $original ) {
			if ( empty( $original ) || ! isset( $replacements[ $original ] ) ) {
				continue;
			}

			// TODO: Potentially this should be more specific for tags/attribute replacements as needed.
			$regex = '#([>"\'])\s*' . preg_quote( $original, '#' ) . '\s*([\'"<])#s';

			// Escape `\` and `$` so the replacement text is never read as a back-reference.
			$updated = preg_replace( $regex, '$1' . addcslashes( $replacements[ $original ], '\\$' ) . '$2', $html );

			// preg_replace() returns null on a PCRE failure; keep the last good HTML
			// instead of wiping the block's markup.
			if ( null !== $updated ) {
				$html = $updated;
			}
		}

		$block['innerHTML']    = $html;
		$block['innerContent'] = [ $html ];

		return $block;
	}

	/**
	 * Escape a tag/attribute to use in a regex.
	 */
	protected function escape_tag( $string, $delim ) {
		return $this->escape( $string, $delim );
	}

	/**
	 * Escape an attribute name to use in a regex.
	 */
	protected function escape_attr( $string, $delim ) {
		return $this->escape( $string, $delim );
	}

	/**
	 * Shared escaping: quote regex metacharacters and the given delimiter.
	 */
	protected function escape( $string, $delim ) {
		return preg_quote( $string, $delim );
	}
}

/**
 * HTMLParser variant whose tag names may be given as regex fragments.
 */
class HTMLRegexParser extends HTMLParser {
	/**
	 * Prepare a tag for use inside a regex.
	 *
	 * A value wrapped in slashes (e.g. `/h[1-6]/`) is treated as a regex fragment:
	 * the surrounding slashes are trimmed and the rest is used unescaped. Any other
	 * value goes through the parent's escaping.
	 */
	protected function escape_tag( $string, $delim ) {
		$looks_like_regex = str_starts_with( $string, '/' ) && str_ends_with( $string, '/' );

		return $looks_like_regex
			? trim( $string, '/' )
			: parent::escape_tag( $string, $delim );
	}
}
Loading

0 comments on commit 2b9149d

Please sign in to comment.