Update to ytdl-2021.01.24.1

mercuree · Jan 24, 2021 · a820dc7 · a820dc7
1 parent f74980c
commit a820dc7
Show file tree

Hide file tree

Showing 23 changed files with 988 additions and 413 deletions.
diff --git a/README.md b/README.md
@@ -814,7 +814,7 @@ Available for the media that is a track or a part of a music album:
  - `disc_number` (numeric): Number of the disc or other physical medium the track belongs to
  - `release_year` (numeric): Year (YYYY) when the album was released
 
-Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with `NA`.
+Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. Note that some of the sequences are not guaranteed to be present since they depend on the metadata obtained by a particular extractor. Such sequences will be replaced with placeholder value provided with `--output-na-placeholder` (`NA` by default).
 
 For example for `-o %(title)s-%(id)s.%(ext)s` and an mp4 video with title `youtube-dlc test video` and id `BaW_jenozKcj`, this will result in a `youtube-dlc test video-BaW_jenozKcj.mp4` file created in the current directory.
 

diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py
@@ -637,13 +637,20 @@ def test_prepare_filename(self):
             'title2': '%PATH%',
         }
 
-        def fname(templ):
-            ydl = YoutubeDL({'outtmpl': templ})
+        def fname(templ, na_placeholder='NA'):
+            params = {'outtmpl': templ}
+            if na_placeholder != 'NA':
+                params['outtmpl_na_placeholder'] = na_placeholder
+            ydl = YoutubeDL(params)
             return ydl.prepare_filename(info)
         self.assertEqual(fname('%(id)s.%(ext)s'), '1234.mp4')
         self.assertEqual(fname('%(id)s-%(width)s.%(ext)s'), '1234-NA.mp4')
-        # Replace missing fields with 'NA'
-        self.assertEqual(fname('%(uploader_date)s-%(id)s.%(ext)s'), 'NA-1234.mp4')
+        NA_TEST_OUTTMPL = '%(uploader_date)s-%(width)d-%(id)s.%(ext)s'
+        # Replace missing fields with 'NA' by default
+        self.assertEqual(fname(NA_TEST_OUTTMPL), 'NA-NA-1234.mp4')
+        # Or by provided placeholder
+        self.assertEqual(fname(NA_TEST_OUTTMPL, na_placeholder='none'), 'none-none-1234.mp4')
+        self.assertEqual(fname(NA_TEST_OUTTMPL, na_placeholder=''), '--1234.mp4')
         self.assertEqual(fname('%(height)d.%(ext)s'), '1080.mp4')
         self.assertEqual(fname('%(height)6d.%(ext)s'), '  1080.mp4')
         self.assertEqual(fname('%(height)-6d.%(ext)s'), '1080  .mp4')

diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py
@@ -181,9 +181,12 @@ class YoutubeDL(object):
     allow_multiple_video_streams:   Allow multiple video streams to be merged into a single file
     allow_multiple_audio_streams:   Allow multiple audio streams to be merged into a single file
     outtmpl:           Template for output names.
-    restrictfilenames: Do not allow "&" and spaces in file names.
-    trim_file_name:    Limit length of filename (extension excluded).
-    ignoreerrors:      Do not stop on download errors. (Default True when running youtube-dlc, but False when directly accessing YoutubeDL class)
+    outtmpl_na_placeholder: Placeholder for unavailable meta fields.
+    restrictfilenames: Do not allow "&" and spaces in file names
+    trim_file_name:    Limit length of filename (extension excluded)
+    ignoreerrors:      Do not stop on download errors
+                       (Default True when running youtube-dlc,
+                       but False when directly accessing YoutubeDL class)
     force_generic_extractor: Force downloader to use the generic extractor
     overwrites:        Overwrite all video and metadata files if True,
                        overwrite only non-video files if None
@@ -741,7 +744,7 @@ def prepare_filename(self, info_dict, warn=False):
             template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
                                  for k, v in template_dict.items()
                                  if v is not None and not isinstance(v, (list, tuple, dict)))
-            template_dict = collections.defaultdict(lambda: 'NA', template_dict)
+            template_dict = collections.defaultdict(lambda: self.params.get('outtmpl_na_placeholder', 'NA'), template_dict)
 
             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
 
@@ -761,8 +764,8 @@ def prepare_filename(self, info_dict, warn=False):
 
             # Missing numeric fields used together with integer presentation types
             # in format specification will break the argument substitution since
-            # string 'NA' is returned for missing fields. We will patch output
-            # template for missing fields to meet string presentation type.
+            # string NA placeholder is returned for missing fields. We will patch
+            # output template for missing fields to meet string presentation type.
             for numeric_field in self._NUMERIC_FIELDS:
                 if numeric_field not in template_dict:
                     # As of [1] format syntax is:

diff --git a/youtube_dlc/__init__.py b/youtube_dlc/__init__.py
@@ -373,6 +373,7 @@ def parse_retries(retries):
         'listformats': opts.listformats,
         'listformats_table': opts.listformats_table,
         'outtmpl': outtmpl,
+        'outtmpl_na_placeholder': opts.outtmpl_na_placeholder,
         'paths': opts.paths,
         'autonumber_size': opts.autonumber_size,
         'autonumber_start': opts.autonumber_start,

diff --git a/youtube_dlc/extractor/aenetworks.py b/youtube_dlc/extractor/aenetworks.py
@@ -256,7 +256,7 @@ class AENetworksShowIE(AENetworksListBaseIE):
             'title': 'Ancient Aliens',
             'description': 'md5:3f6d74daf2672ff3ae29ed732e37ea7f',
         },
-        'playlist_mincount': 168,
+        'playlist_mincount': 150,
     }]
     _RESOURCE = 'series'
     _ITEMS_KEY = 'episodes'

diff --git a/youtube_dlc/extractor/aljazeera.py b/youtube_dlc/extractor/aljazeera.py
@@ -1,13 +1,16 @@
 from __future__ import unicode_literals
 
+import json
+import re
+
 from .common import InfoExtractor
 
 
 class AlJazeeraIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P<id>[^/]+)\.html'
+    _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?P<type>program/[^/]+|(?:feature|video)s)/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)'
 
     _TESTS = [{
-        'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html',
+        'url': 'https://www.aljazeera.com/program/episode/2014/9/19/deliverance',
         'info_dict': {
             'id': '3792260579001',
             'ext': 'mp4',
@@ -20,14 +23,34 @@ class AlJazeeraIE(InfoExtractor):
         'add_ie': ['BrightcoveNew'],
         'skip': 'Not accessible from Travis CI server',
     }, {
-        'url': 'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html',
+        'url': 'https://www.aljazeera.com/videos/2017/5/11/sierra-leone-709-carat-diamond-to-be-auctioned-off',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.aljazeera.com/features/2017/8/21/transforming-pakistans-buses-into-art',
         'only_matching': True,
     }]
-    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s'
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
 
     def _real_extract(self, url):
-        program_name = self._match_id(url)
-        webpage = self._download_webpage(url, program_name)
-        brightcove_id = self._search_regex(
-            r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id')
-        return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)
+        post_type, name = re.match(self._VALID_URL, url).groups()
+        post_type = {
+            'features': 'post',
+            'program': 'episode',
+            'videos': 'video',
+        }[post_type.split('/')[0]]
+        video = self._download_json(
+            'https://www.aljazeera.com/graphql', name, query={
+                'operationName': 'SingleArticleQuery',
+                'variables': json.dumps({
+                    'name': name,
+                    'postType': post_type,
+                }),
+            }, headers={
+                'wp-site': 'aje',
+            })['data']['article']['video']
+        video_id = video['id']
+        account_id = video.get('accountId') or '665003303001'
+        player_id = video.get('playerId') or 'BkeSH5BDb'
+        return self.url_result(
+            self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id),
+            'BrightcoveNew', video_id)
diff --git a/youtube_dlc/extractor/americastestkitchen.py b/youtube_dlc/extractor/americastestkitchen.py
@@ -1,13 +1,16 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import json
 import re
 
 from .common import InfoExtractor
 from ..utils import (
     clean_html,
+    int_or_none,
     try_get,
     unified_strdate,
+    unified_timestamp,
 )
 
 
@@ -22,8 +25,8 @@ class AmericasTestKitchenIE(InfoExtractor):
             'ext': 'mp4',
             'description': 'md5:64e606bfee910627efc4b5f050de92b3',
             'thumbnail': r're:^https?://',
-            'timestamp': 1523664000,
-            'upload_date': '20180414',
+            'timestamp': 1523318400,
+            'upload_date': '20180410',
             'release_date': '20180410',
             'series': "America's Test Kitchen",
             'season_number': 18,
@@ -33,6 +36,27 @@ class AmericasTestKitchenIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
+    }, {
+        # Metadata parsing behaves differently for newer episodes (705) as opposed to older episodes (582 above)
+        'url': 'https://www.americastestkitchen.com/episode/705-simple-chicken-dinner',
+        'md5': '06451608c57651e985a498e69cec17e5',
+        'info_dict': {
+            'id': '5fbe8c61bda2010001c6763b',
+            'title': 'Simple Chicken Dinner',
+            'ext': 'mp4',
+            'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7',
+            'thumbnail': r're:^https?://',
+            'timestamp': 1610755200,
+            'upload_date': '20210116',
+            'release_date': '20210116',
+            'series': "America's Test Kitchen",
+            'season_number': 21,
+            'episode': 'Simple Chicken Dinner',
+            'episode_number': 3,
+        },
+        'params': {
+            'skip_download': True,
+        },
     }, {
         'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon',
         'only_matching': True,
@@ -60,7 +84,76 @@ def _real_extract(self, url):
             'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % video['zypeId'],
             'ie_key': 'Zype',
             'description': clean_html(video.get('description')),
+            'timestamp': unified_timestamp(video.get('publishDate')),
             'release_date': unified_strdate(video.get('publishDate')),
+            'episode_number': int_or_none(episode.get('number')),
+            'season_number': int_or_none(episode.get('season')),
             'series': try_get(episode, lambda x: x['show']['title']),
             'episode': episode.get('title'),
         }
+
+
+class AmericasTestKitchenSeasonIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?(?P<show>americastestkitchen|cookscountry)\.com/episodes/browse/season_(?P<id>\d+)'
+    _TESTS = [{
+        # ATK Season
+        'url': 'https://www.americastestkitchen.com/episodes/browse/season_1',
+        'info_dict': {
+            'id': 'season_1',
+            'title': 'Season 1',
+        },
+        'playlist_count': 13,
+    }, {
+        # Cooks Country Season
+        'url': 'https://www.cookscountry.com/episodes/browse/season_12',
+        'info_dict': {
+            'id': 'season_12',
+            'title': 'Season 12',
+        },
+        'playlist_count': 13,
+    }]
+
+    def _real_extract(self, url):
+        show_name, season_number = re.match(self._VALID_URL, url).groups()
+        season_number = int(season_number)
+
+        slug = 'atk' if show_name == 'americastestkitchen' else 'cco'
+
+        season = 'Season %d' % season_number
+
+        season_search = self._download_json(
+            'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug,
+            season, headers={
+                'Origin': 'https://www.%s.com' % show_name,
+                'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805',
+                'X-Algolia-Application-Id': 'Y1FNZXUI30',
+            }, query={
+                'facetFilters': json.dumps([
+                    'search_season_list:' + season,
+                    'search_document_klass:episode',
+                    'search_show_slug:' + slug,
+                ]),
+                'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug,
+                'attributesToHighlight': '',
+                'hitsPerPage': 1000,
+            })
+
+        def entries():
+            for episode in (season_search.get('hits') or []):
+                search_url = episode.get('search_url')
+                if not search_url:
+                    continue
+                yield {
+                    '_type': 'url',
+                    'url': 'https://www.%s.com%s' % (show_name, search_url),
+                    'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]),
+                    'title': episode.get('title'),
+                    'description': episode.get('description'),
+                    'timestamp': unified_timestamp(episode.get('search_document_date')),
+                    'season_number': season_number,
+                    'episode_number': int_or_none(episode.get('search_%s_episode_number' % slug)),
+                    'ie_key': AmericasTestKitchenIE.ie_key(),
+                }
+
+        return self.playlist_result(
+            entries(), 'season_%d' % season_number, season)
diff --git a/youtube_dlc/extractor/aol.py b/youtube_dlc/extractor/aol.py
@@ -3,7 +3,7 @@
 
 import re
 
-from .common import InfoExtractor
+from .yahoo import YahooIE
 from ..compat import (
     compat_parse_qs,
     compat_urllib_parse_urlparse,
@@ -15,9 +15,9 @@
 )
 
 
-class AolIE(InfoExtractor):
+class AolIE(YahooIE):
     IE_NAME = 'aol.com'
-    _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>[0-9a-f]+)'
+    _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>\d{9}|[0-9a-f]{24}|[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})'
 
     _TESTS = [{
         # video with 5min ID
@@ -76,10 +76,16 @@ class AolIE(InfoExtractor):
     }, {
         'url': 'https://www.aol.jp/video/playlist/5a28e936a1334d000137da0c/5a28f3151e642219fde19831/',
         'only_matching': True,
+    }, {
+        # Yahoo video
+        'url': 'https://www.aol.com/video/play/991e6700-ac02-11ea-99ff-357400036f61/24bbc846-3e30-3c46-915e-fe8ccd7fcc46/',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
+        if '-' in video_id:
+            return self._extract_yahoo_video(video_id, 'us')
 
         response = self._download_json(
             'https://feedapi.b2c.on.aol.com/v1.0/app/videos/aolon/%s/details' % video_id,

diff --git a/youtube_dlc/extractor/ard.py b/youtube_dlc/extractor/ard.py
@@ -226,13 +226,13 @@ def _real_extract(self, url):
             if doc.tag == 'rss':
                 return GenericIE()._extract_rss(url, video_id, doc)
 
-        title = self._html_search_regex(
+        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
             [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
              r'<meta name="dcterms\.title" content="(.*?)"/>',
              r'<h4 class="headline">(.*?)</h4>',
              r'<title[^>]*>(.*?)</title>'],
             webpage, 'title')
-        description = self._html_search_meta(
+        description = self._og_search_description(webpage, default=None) or self._html_search_meta(
             'dcterms.abstract', webpage, 'description', default=None)
         if description is None:
             description = self._html_search_meta(
@@ -289,18 +289,18 @@ def _real_extract(self, url):
 
 
 class ARDIE(InfoExtractor):
-    _VALID_URL = r'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
+    _VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P<display_id>[^/?#]+)-(?:video-?)?(?P<id>[0-9]+))\.html'
     _TESTS = [{
-        # available till 14.02.2019
-        'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html',
-        'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49',
+        # available till 7.01.2022
+        'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html',
+        'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1',
         'info_dict': {
-            'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video',
-            'id': '102',
+            'display_id': 'maischberger-die-woche',
+            'id': '100',
             'ext': 'mp4',
-            'duration': 4435.0,
-            'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?',
-            'upload_date': '20180214',
+            'duration': 3687.0,
+            'title': 'maischberger. die woche vom 7. Januar 2021',
+            'upload_date': '20210107',
             'thumbnail': r're:^https?://.*\.jpg$',
         },
     }, {
@@ -355,17 +355,17 @@ def _real_extract(self, url):
 class ARDBetaMediathekIE(ARDMediathekBaseIE):
     _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?P<mode>player|live|video|sendung|sammlung)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)'
     _TESTS = [{
-        'url': 'https://ardmediathek.de/ard/video/die-robuste-roswita/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
-        'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f',
+        'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
+        'md5': 'a1dc75a39c61601b980648f7c9f9f71d',
         'info_dict': {
             'display_id': 'die-robuste-roswita',
-            'id': '70153354',
+            'id': '78566716',
             'title': 'Die robuste Roswita',
-            'description': r're:^Der Mord.*trüber ist als die Ilm.',
+            'description': r're:^Der Mord.*totgeglaubte Ehefrau Roswita',
             'duration': 5316,
-            'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard',
-            'timestamp': 1577047500,
-            'upload_date': '20191222',
+            'thumbnail': 'https://img.ardmediathek.de/standard/00/78/56/67/84/575672121/16x9/960?mandant=ard',
+            'timestamp': 1596658200,
+            'upload_date': '20200805',
             'ext': 'mp4',
         },
     }, {