[viki] improve format extraction

2020-11-19 22:45:46 +01:00 · 2020-11-19 22:45:46 +01:00 · 59e583f7e8
parent daa25d4142
commit 59e583f7e8
1 changed files with 83 additions and 59 deletions
--- a/youtube_dl/extractor/viki.py
+++ b/youtube_dl/extractor/viki.py
@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import base64
 import hashlib
 import hmac
 import itertools
@ -9,6 +10,10 @@ import re
 import time
 from .common import InfoExtractor
 from ..compat import (
    compat_parse_qs,
    compat_urllib_parse_urlparse,
 )
 from ..utils import (
    ExtractorError,
    int_or_none,
@ -165,19 +170,20 @@ class VikiIE(VikiBaseIE):
    }, {
        # episode
        'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
-        'md5': '5fa476a902e902783ac7a4d615cdbc7a',
+        'md5': '94e0e34fd58f169f40c184f232356cfe',
        'info_dict': {
            'id': '44699v',
            'ext': 'mp4',
            'title': 'Boys Over Flowers - Episode 1',
            'description': 'md5:b89cf50038b480b88b5b3c93589a9076',
-            'duration': 4204,
+            'duration': 4172,
            'timestamp': 1270496524,
            'upload_date': '20100405',
            'uploader': 'group8',
            'like_count': int,
            'age_limit': 13,
-        }
+        },
        'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
    }, {
        # youtube external
        'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
@ -194,14 +200,15 @@ class VikiIE(VikiBaseIE):
            'uploader_id': 'ad14065n',
            'like_count': int,
            'age_limit': 13,
-        }
+        },
        'skip': 'Page not found!',
    }, {
        'url': 'http://www.viki.com/player/44699v',
        'only_matching': True,
    }, {
        # non-English description
        'url': 'http://www.viki.com/videos/158036v-love-in-magic',
-        'md5': '1713ae35df5a521b31f6dc40730e7c9c',
+        'md5': 'adf9e321a0ae5d0aace349efaaff7691',
        'info_dict': {
            'id': '158036v',
            'ext': 'mp4',
@ -217,8 +224,11 @@ class VikiIE(VikiBaseIE):
    def _real_extract(self, url):
        video_id = self._match_id(url)
-        video = self._call_api(
+        resp = self._download_json(
-            'videos/%s.json' % video_id, video_id, 'Downloading video JSON')
+            'https://www.viki.com/api/videos/' + video_id,
            video_id, 'Downloading video JSON',
            headers={'x-viki-app-ver': '4.0.57'})
        video = resp['video']
        self._check_errors(video)
@ -265,60 +275,74 @@ class VikiIE(VikiBaseIE):
            'subtitles': subtitles,
        }
        streams = self._call_api(
            'videos/%s/streams.json' % video_id, video_id,
            'Downloading video streams JSON')
        if 'external' in streams:
            result.update({
                '_type': 'url_transparent',
                'url': streams['external']['url'],
            })
            return result
        formats = []
-        for format_id, stream_dict in streams.items():
+
-            height = int_or_none(self._search_regex(
+        def add_format(format_id, format_dict, protocol='http'):
-                r'^(\d+)[pP]$', format_id, 'height', default=None))
+            # rtmps URLs does not seem to work
-            for protocol, format_dict in stream_dict.items():
+            if protocol == 'rtmps':
-                # rtmps URLs does not seem to work
+                return
-                if protocol == 'rtmps':
+            format_url = format_dict.get('url')
-                    continue
+            if not format_url:
-                format_url = format_dict['url']
+                return
-                if format_id == 'm3u8':
+            qs = compat_parse_qs(compat_urllib_parse_urlparse(format_url).query)
-                    m3u8_formats = self._extract_m3u8_formats(
+            stream = qs.get('stream', [None])[0]
-                        format_url, video_id, 'mp4',
+            if stream:
-                        entry_protocol='m3u8_native',
+                format_url = base64.b64decode(stream).decode()
-                        m3u8_id='m3u8-%s' % protocol, fatal=False)
+            if format_id in ('m3u8', 'hls'):
-                    # Despite CODECS metadata in m3u8 all video-only formats
+                m3u8_formats = self._extract_m3u8_formats(
-                    # are actually video+audio
+                    format_url, video_id, 'mp4',
-                    for f in m3u8_formats:
+                    entry_protocol='m3u8_native',
-                        if f.get('acodec') == 'none' and f.get('vcodec') != 'none':
+                    m3u8_id='m3u8-%s' % protocol, fatal=False)
-                            f['acodec'] = None
+                # Despite CODECS metadata in m3u8 all video-only formats
-                    formats.extend(m3u8_formats)
+                # are actually video+audio
-                elif format_id == 'mpd':
+                for f in m3u8_formats:
-                    formats.extend(self._extract_mpd_formats(
+                    if '_drm/index_' in f['url']:
                        format_url, video_id, 'mpd-%s' % protocol, fatal=False))
                elif format_url.startswith('rtmp'):
                    mobj = re.search(
                        r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$',
                        format_url)
                    if not mobj:
                        continue
-                    formats.append({
+                    if f.get('acodec') == 'none' and f.get('vcodec') != 'none':
-                        'format_id': 'rtmp-%s' % format_id,
+                        f['acodec'] = None
-                        'ext': 'flv',
+                    formats.append(f)
-                        'url': mobj.group('url'),
+            elif format_id in ('mpd', 'dash'):
-                        'play_path': mobj.group('playpath'),
+                formats.extend(self._extract_mpd_formats(
-                        'app': mobj.group('app'),
+                    format_url, video_id, 'mpd-%s' % protocol, fatal=False))
-                        'page_url': url,
+            elif format_url.startswith('rtmp'):
-                    })
+                mobj = re.search(
-                else:
+                    r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$',
-                    formats.append({
+                    format_url)
-                        'url': format_url,
+                if not mobj:
-                        'format_id': '%s-%s' % (format_id, protocol),
+                    return
-                        'height': height,
+                formats.append({
-                    })
+                    'format_id': 'rtmp-%s' % format_id,
                    'ext': 'flv',
                    'url': mobj.group('url'),
                    'play_path': mobj.group('playpath'),
                    'app': mobj.group('app'),
                    'page_url': url,
                })
            else:
                formats.append({
                    'url': format_url,
                    'format_id': '%s-%s' % (format_id, protocol),
                    'height': int_or_none(self._search_regex(
                        r'^(\d+)[pP]$', format_id, 'height', default=None)),
                })
        for format_id, format_dict in (resp.get('streams') or {}).items():
            add_format(format_id, format_dict)
        if not formats:
            streams = self._call_api(
                'videos/%s/streams.json' % video_id, video_id,
                'Downloading video streams JSON')
            if 'external' in streams:
                result.update({
                    '_type': 'url_transparent',
                    'url': streams['external']['url'],
                })
                return result
            for format_id, stream_dict in streams.items():
                for protocol, format_dict in stream_dict.items():
                    add_format(format_id, format_dict, protocol)
        self._sort_formats(formats)
        result['formats'] = formats