Skip to content

Commit 2c878b1

Browse files
authored
Clean up qmk generate-autocorrect-data (#19710)
1 parent 328279a commit 2c878b1

File tree

2 files changed

+42
-46
lines changed

2 files changed

+42
-46
lines changed

docs/feature_autocorrect.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ The `qmk generate-autocorrect-data` commands can make an effort to check for ent
8686
8787
## Overriding Autocorrect
8888
89-
Occasionally you might actually want to type a typo (for instance, while editing autocorrection_dict.txt) without being autocorrected. There are a couple of ways to do this:
89+
Occasionally you might actually want to type a typo (for instance, while editing autocorrect_dict.txt) without being autocorrected. There are a couple of ways to do this:
9090
9191
1. Begin typing the typo.
9292
2. Before typing the last letter, press and release the Ctrl or Alt key.
@@ -238,13 +238,13 @@ bool apply_autocorrect(uint8_t backspaces, const char *str) {
238238

239239
## Appendix: Trie binary data format :id=appendix
240240

241-
This section details how the trie is serialized to byte data in autocorrection_data. You don’t need to care about this to use this autocorrection implementation. But it is documented for the record in case anyone is interested in modifying the implementation, or just curious how it works.
241+
This section details how the trie is serialized to byte data in autocorrect_data. You don’t need to understand this to use this autocorrection implementation, but it is documented for the record in case anyone is interested in modifying the implementation or is just curious how it works.
242242

243243
What I did here is fairly arbitrary, but it is simple to decode and gets the job done.
244244

245245
### Encoding :id=encoding
246246

247-
All autocorrection data is stored in a single flat array autocorrection_data. Each trie node is associated with a byte offset into this array, where data for that node is encoded, beginning with root at offset 0. There are three kinds of nodes. The highest two bits of the first byte of the node indicate what kind:
247+
All autocorrection data is stored in a single flat array autocorrect_data. Each trie node is associated with a byte offset into this array, where data for that node is encoded, beginning with root at offset 0. There are three kinds of nodes. The highest two bits of the first byte of the node indicate what kind:
248248

249249
* 00 ⇒ chain node: a trie node with a single child.
250250
* 01 ⇒ branching node: a trie node with multiple children.

lib/python/qmk/cli/generate/autocorrect_data.py

+39-43
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,11 @@
3333

3434
from milc import cli
3535

36-
import qmk.path
36+
from qmk.commands import dump_lines
37+
from qmk.constants import GPL2_HEADER_C_LIKE, GENERATED_HEADER_C_LIKE
3738
from qmk.keyboard import keyboard_completer, keyboard_folder
3839
from qmk.keymap import keymap_completer, locate_keymap
40+
from qmk.path import normpath
3941

4042
KC_A = 4
4143
KC_SPC = 0x2c
@@ -63,9 +65,10 @@ def parse_file(file_name: str) -> List[Tuple[str, str]]:
6365
try:
6466
from english_words import english_words_lower_alpha_set as correct_words
6567
except ImportError:
66-
cli.echo('Autocorrection will falsely trigger when a typo is a substring of a correctly spelled word.')
67-
cli.echo('To check for this, install the english_words package and rerun this script:')
68-
cli.echo(' {fg_cyan}python3 -m pip install english_words')
68+
if not cli.args.quiet:
69+
cli.echo('Autocorrection will falsely trigger when a typo is a substring of a correctly spelled word.')
70+
cli.echo('To check for this, install the english_words package and rerun this script:')
71+
cli.echo(' {fg_cyan}python3 -m pip install english_words')
6972
# Use a minimal word list as a fallback.
7073
correct_words = ('information', 'available', 'international', 'language', 'loosest', 'reference', 'wealthier', 'entertainment', 'association', 'provides', 'technology', 'statehood')
7174

@@ -232,58 +235,51 @@ def encode_link(link: Dict[str, Any]) -> List[int]:
232235
return [byte_offset & 255, byte_offset >> 8]
233236

234237

235-
def write_generated_code(autocorrections: List[Tuple[str, str]], data: List[int], file_name: str) -> None:
236-
"""Writes autocorrection data as generated C code to `file_name`.
237-
Args:
238-
autocorrections: List of (typo, correction) tuples.
239-
data: List of ints in 0-255, the serialized trie.
240-
file_name: String, path of the output C file.
241-
"""
242-
assert all(0 <= b <= 255 for b in data)
243-
244-
def typo_len(e: Tuple[str, str]) -> int:
245-
return len(e[0])
238+
def typo_len(e: Tuple[str, str]) -> int:
239+
return len(e[0])
246240

247-
min_typo = min(autocorrections, key=typo_len)[0]
248-
max_typo = max(autocorrections, key=typo_len)[0]
249-
generated_code = ''.join([
250-
'// Generated code.\n\n', f'// Autocorrection dictionary ({len(autocorrections)} entries):\n', ''.join(sorted(f'// {typo:<{len(max_typo)}} -> {correction}\n' for typo, correction in autocorrections)),
251-
f'\n#define AUTOCORRECT_MIN_LENGTH {len(min_typo)} // "{min_typo}"\n', f'#define AUTOCORRECT_MAX_LENGTH {len(max_typo)} // "{max_typo}"\n\n', f'#define DICTIONARY_SIZE {len(data)}\n\n',
252-
textwrap.fill('static const uint8_t autocorrect_data[DICTIONARY_SIZE] PROGMEM = {%s};' % (', '.join(map(str, data))), width=120, subsequent_indent=' '), '\n\n'
253-
])
254241

255-
with open(file_name, 'wt') as f:
256-
f.write(generated_code)
242+
def to_hex(b: int) -> str:
243+
return f'0x{b:02X}'
257244

258245

259-
@cli.argument('filename', default='autocorrect_dict.txt', help='The autocorrection database file')
246+
@cli.argument('filename', type=normpath, help='The autocorrection database file')
260247
@cli.argument('-kb', '--keyboard', type=keyboard_folder, completer=keyboard_completer, help='The keyboard to build a firmware for. Ignored when a configurator export is supplied.')
261248
@cli.argument('-km', '--keymap', completer=keymap_completer, help='The keymap to build a firmware for. Ignored when a configurator export is supplied.')
262-
@cli.argument('-o', '--output', arg_only=True, type=qmk.path.normpath, help='File to write to')
249+
@cli.argument('-o', '--output', arg_only=True, type=normpath, help='File to write to')
250+
@cli.argument('-q', '--quiet', arg_only=True, action='store_true', help="Quiet mode, only output error messages")
263251
@cli.subcommand('Generate the autocorrection data file from a dictionary file.')
264252
def generate_autocorrect_data(cli):
265253
autocorrections = parse_file(cli.args.filename)
266254
trie = make_trie(autocorrections)
267255
data = serialize_trie(autocorrections, trie)
268-
# Environment processing
269-
if cli.args.output == '-':
270-
cli.args.output = None
271256

272-
if cli.args.output:
273-
cli.args.output.parent.mkdir(parents=True, exist_ok=True)
274-
cli.log.info('Creating autocorrect database at {fg_cyan}%s', cli.args.output)
275-
write_generated_code(autocorrections, data, cli.args.output)
257+
current_keyboard = cli.args.keyboard or cli.config.user.keyboard or cli.config.generate_autocorrect_data.keyboard
258+
current_keymap = cli.args.keymap or cli.config.user.keymap or cli.config.generate_autocorrect_data.keymap
259+
260+
if current_keyboard and current_keymap:
261+
cli.args.output = locate_keymap(current_keyboard, current_keymap).parent / 'autocorrect_data.h'
276262

277-
else:
278-
current_keyboard = cli.args.keyboard or cli.config.user.keyboard or cli.config.generate_autocorrect_data.keyboard
279-
current_keymap = cli.args.keymap or cli.config.user.keymap or cli.config.generate_autocorrect_data.keymap
263+
assert all(0 <= b <= 255 for b in data)
280264

281-
if current_keyboard and current_keymap:
282-
filename = locate_keymap(current_keyboard, current_keymap).parent / 'autocorrect_data.h'
283-
cli.log.info('Creating autocorrect database at {fg_cyan}%s', filename)
284-
write_generated_code(autocorrections, data, filename)
265+
min_typo = min(autocorrections, key=typo_len)[0]
266+
max_typo = max(autocorrections, key=typo_len)[0]
285267

286-
else:
287-
write_generated_code(autocorrections, data, 'autocorrect_data.h')
268+
# Build the autocorrect_data.h file.
269+
autocorrect_data_h_lines = [GPL2_HEADER_C_LIKE, GENERATED_HEADER_C_LIKE, '#pragma once', '']
288270

289-
cli.log.info('Processed %d autocorrection entries to table with %d bytes.', len(autocorrections), len(data))
271+
autocorrect_data_h_lines.append(f'// Autocorrection dictionary ({len(autocorrections)} entries):')
272+
for typo, correction in autocorrections:
273+
autocorrect_data_h_lines.append(f'// {typo:<{len(max_typo)}} -> {correction}')
274+
275+
autocorrect_data_h_lines.append('')
276+
autocorrect_data_h_lines.append(f'#define AUTOCORRECT_MIN_LENGTH {len(min_typo)} // "{min_typo}"')
277+
autocorrect_data_h_lines.append(f'#define AUTOCORRECT_MAX_LENGTH {len(max_typo)} // "{max_typo}"')
278+
autocorrect_data_h_lines.append(f'#define DICTIONARY_SIZE {len(data)}')
279+
autocorrect_data_h_lines.append('')
280+
autocorrect_data_h_lines.append('static const uint8_t autocorrect_data[DICTIONARY_SIZE] PROGMEM = {')
281+
autocorrect_data_h_lines.append(textwrap.fill(' %s' % (', '.join(map(to_hex, data))), width=100, subsequent_indent=' '))
282+
autocorrect_data_h_lines.append('};')
283+
284+
# Show the results
285+
dump_lines(cli.args.output, autocorrect_data_h_lines, cli.args.quiet)

0 commit comments

Comments
 (0)