Commit 88f60fe: Fix a904a7f

pukkandan committed Jul 15, 2022
1 parent: a904a7f
Showing 3 changed files with 16 additions and 44 deletions.
README.md (9 changes: 3 additions & 6 deletions)
@@ -1161,14 +1161,11 @@ Note that options in configuration file are just the same options aka switches u

You can use `--ignore-config` if you want to disable all configuration files for a particular yt-dlp run. If `--ignore-config` is found inside any configuration file, no further configuration will be loaded. For example, having the option in the portable configuration file prevents loading of home, user, and system configurations. Additionally, (for backward compatibility) if `--ignore-config` is found inside the system configuration file, the user configuration is not loaded.

### Specifying encoding of config files
### Config file encoding

By default, config files are read in the encoding from system locale.
If you saved your config file in a different encoding than that, you may write `# coding: ENCODING` to the beginning of the file. (e.g. `# coding: shift-jis`)
The config files are decoded according to the UTF BOM if present, and in the encoding from system locale otherwise.

There must not be any characters before that, including spaces.

If you have BOM enabled, it will be used instead.
If you want your file to be decoded differently, add `# coding: ENCODING` to the beginning of the file (e.g. `# coding: shift-jis`). There must be no characters before that, even spaces or BOM.
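
As an illustration of the new wording (the option lines below are arbitrary examples, not part of this commit), a config file saved in Shift JIS would start with the declaration as its very first bytes:

```
# coding: shift-jis
-o ~/Downloads/%(title)s.%(ext)s
--write-subs
```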

### Authentication with `.netrc` file

test/test_utils.py (20 changes: 6 additions & 14 deletions)
@@ -1831,24 +1831,16 @@ def test_determine_file_encoding(self):
self.assertEqual(determine_file_encoding(b'\x00\x00\xfe\xff'), ('utf-32-be', 4))
self.assertEqual(determine_file_encoding(b'\xff\xfe'), ('utf-16-le', 2))

self.assertEqual(determine_file_encoding(b'# -*- coding: cp932 -*-'), ('cp932', 0))
self.assertEqual(determine_file_encoding(b'# -*- coding: cp932 -*-\n'), ('cp932', 0))
self.assertEqual(determine_file_encoding(b'# -*- coding: cp932 -*-\r\n'), ('cp932', 0))
self.assertEqual(determine_file_encoding(b'\xff\xfe# coding: utf-8\n--verbose'), ('utf-16-le', 2))

self.assertEqual(determine_file_encoding(b'# coding: utf-8\n--verbose'), ('utf-8', 0))
self.assertEqual(determine_file_encoding(b'# coding: someencodinghere-12345\n--verbose'), ('someencodinghere-12345', 0))

self.assertEqual(determine_file_encoding(b'# vi: set fileencoding=cp932'), ('cp932', 0))
self.assertEqual(determine_file_encoding(b'# vi: set fileencoding=cp932\n'), ('cp932', 0))
self.assertEqual(determine_file_encoding(b'# vi: set fileencoding=cp932\r\n'), ('cp932', 0))
self.assertEqual(determine_file_encoding(b'# vi: set fileencoding=cp932,euc-jp\r\n'), ('cp932', 0))

self.assertEqual(determine_file_encoding(
b'\0\0\0#\0\0\0 \0\0\0c\0\0\0o\0\0\0d\0\0\0i\0\0\0n\0\0\0g\0\0\0:\0\0\0 \0\0\0u\0\0\0t\0\0\0f\0\0\0-\0\0\x003\0\0\x002\0\0\0-\0\0\0b\0\0\0e'),
('utf-32-be', 0))
self.assertEqual(determine_file_encoding(
b'#\0 \0c\0o\0d\0i\0n\0g\0:\0 \0u\0t\0f\0-\x001\x006\0-\0l\0e\0'),
('utf-16-le', 0))
self.assertEqual(determine_file_encoding(b'#coding:utf-8\n--verbose'), ('utf-8', 0))
self.assertEqual(determine_file_encoding(b'# coding: utf-8 \r\n--verbose'), ('utf-8', 0))

self.assertEqual(determine_file_encoding('# coding: utf-32-be'.encode('utf-32-be')), ('utf-32-be', 0))
self.assertEqual(determine_file_encoding('# coding: utf-16-le'.encode('utf-16-le')), ('utf-16-le', 0))


if __name__ == '__main__':
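A minimal sketch of the behaviour the updated tests above pin down, assuming `determine_file_encoding` is imported from `yt_dlp.utils` as in the test file:

```python
from yt_dlp.utils import determine_file_encoding

# A UTF-16-LE BOM beats the in-file declaration, and the 2 BOM bytes are skipped
assert determine_file_encoding(b'\xff\xfe# coding: utf-8\n--verbose') == ('utf-16-le', 2)

# Without a BOM, the "# coding:" line decides the encoding and nothing is skipped
assert determine_file_encoding(b'# coding: utf-8 \r\n--verbose') == ('utf-8', 0)

# The declaration is still found when the file itself is UTF-16/UTF-32 encoded
assert determine_file_encoding('# coding: utf-16-le'.encode('utf-16-le')) == ('utf-16-le', 0)
```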
yt_dlp/utils.py (31 changes: 7 additions & 24 deletions)
@@ -3485,14 +3485,14 @@ def age_restricted(content_limit, age_limit):
return age_limit < content_limit


# List of known byte-order-marks (BOM)
BOMS = [
(b'\xef\xbb\xbf', 'utf-8'),
(b'\x00\x00\xfe\xff', 'utf-32-be'),
(b'\xff\xfe\x00\x00', 'utf-32-le'),
(b'\xff\xfe', 'utf-16-le'),
(b'\xfe\xff', 'utf-16-be'),
]
""" List of known byte-order-marks (BOM) """


def is_html(first_bytes):
@@ -5398,37 +5398,20 @@ def read_stdin(what):

def determine_file_encoding(data):
"""
From the first 512 bytes of a given file,
it tries to detect the encoding to be used to read as text.
Detect the text encoding used
@returns (encoding, bytes to skip)
"""

# BOM marks are given priority over declarations
for bom, enc in BOMS:
# matching BOM beats any declaration
# BOMs are skipped to prevent any errors
if data.startswith(bom):
return enc, len(bom)

# strip off all null bytes to match even when UTF-16 or UTF-32 is used
# endians don't matter
# Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
# We ignore the endianness to get a good enough match
data = data.replace(b'\0', b'')

PREAMBLES = [
# "# -*- coding: utf-8 -*-"
# "# coding: utf-8"
rb'(?m)^#(?:\s+-\*-)?\s*coding\s*:\s*(?P<encoding>\S+)(?:\s+-\*-)?\s*$',
# "# vi: set fileencoding=utf-8"
rb'^#\s+vi\s*:\s+set\s+fileencoding=(?P<encoding>[^\s,]+)'
]
for pb in PREAMBLES:
mobj = re.match(pb, data)
if not mobj:
continue
# preambles aren't skipped since they're just ignored when reading as config
return mobj.group('encoding').decode(), 0

return None, 0
mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
return mobj.group(1).decode() if mobj else None, 0


class Config:
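For context, a hypothetical sketch (not code from this commit) of how the returned `(encoding, bytes to skip)` pair could be consumed when reading a config file; in yt-dlp itself this is presumably handled by the `Config` class shown above:

```python
import locale

from yt_dlp.utils import determine_file_encoding


def read_config_text(path):
    """Hypothetical helper: decode a config file using determine_file_encoding()."""
    with open(path, 'rb') as f:
        raw = f.read()
    # Detection only needs the start of the file (BOM first, then a "# coding:" line)
    encoding, skip = determine_file_encoding(raw[:512])
    # Fall back to the system locale if nothing was detected, as the README describes
    encoding = encoding or locale.getpreferredencoding(False)
    # Skip the BOM bytes (if any) before decoding
    return raw[skip:].decode(encoding)
```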
