[servingsys] Add support

This also adds support for brightcove advertisements. Fixes #2181
2014-01-21 02:09:49 +01:00 · 2014-01-21 02:09:49 +01:00 · 7b0817e8e1
parent 9d4288b2d4
commit 7b0817e8e1
5 changed files with 121 additions and 8 deletions
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@ -151,6 +151,7 @@ class YoutubeDL(object):
    bidi_workaround:   Work around buggy terminals without bidirectional text
                       support, using fribidi
    debug_printtraffic:Print out sent and received HTTP traffic
    include_ads:       Download ads as well
    The following parameters are not used by YoutubeDL itself, they are used by
    the FileDownloader:
--- a/youtube_dl/init.py
+++ b/youtube_dl/init.py
@ -238,7 +238,10 @@ def parseOpts(overrideArguments=None):
    selection.add_option('--download-archive', metavar='FILE',
                         dest='download_archive',
                         help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.')
-
+    selection.add_option(
        '--include-ads', dest='include_ads',
        action='store_true',
        help='Download advertisements as well (experimental)')
    authentication.add_option('-u', '--username',
            dest='username', metavar='USERNAME', help='account username')
@ -716,6 +719,7 @@ def _real_main(argv=None):
        'bidi_workaround': opts.bidi_workaround,
        'debug_printtraffic': opts.debug_printtraffic,
        'prefer_ffmpeg': opts.prefer_ffmpeg,
        'include_ads': opts.include_ads,
    }
    with YoutubeDL(ydl_opts) as ydl:
--- a/youtube_dl/extractor/init.py
+++ b/youtube_dl/extractor/init.py
@ -152,6 +152,7 @@ from .rottentomatoes import RottenTomatoesIE
 from .roxwel import RoxwelIE
 from .rtlnow import RTLnowIE
 from .rutube import RutubeIE
 from .servingsys import ServingSysIE
 from .sina import SinaIE
 from .slashdot import SlashdotIE
 from .slideshare import SlideshareIE
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@ -9,9 +9,11 @@ from .common import InfoExtractor
 from ..utils import (
    compat_urllib_parse,
    find_xpath_attr,
    fix_xml_ampersands,
    compat_urlparse,
    compat_str,
    compat_urllib_request,
    compat_parse_qs,
    ExtractorError,
    unsmuggle_url,
@ -83,17 +85,30 @@ class BrightcoveIE(InfoExtractor):
                            lambda m: m.group(1) + '/>', object_str)
        # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608
        object_str = object_str.replace('<--', '<!--')
        object_str = fix_xml_ampersands(object_str)
        object_doc = xml.etree.ElementTree.fromstring(object_str)
-        assert 'BrightcoveExperience' in object_doc.attrib['class']
+
-        params = {
+        fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
-            'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'],
+        flashvars = dict(
-        }
+            (k, v[0])
            for k, v in compat_parse_qs(fv_el.attrib['value']).items())
        def find_param(name):
            # Look the parameter up by name: an entry in the parsed flashVars
            # wins; otherwise fall back to a <param name="..."> child of the
            # object element. Returns None when neither source has it.
            if name not in flashvars:
                param_el = find_xpath_attr(object_doc, './param', 'name', name)
                return None if param_el is None else param_el.attrib['value']
            return flashvars[name]
        params = {}
        playerID = find_param('playerID')
        if playerID is None:
            raise ExtractorError('Cannot find player ID')
        params['playerID'] = playerID
        playerKey = find_param('playerKey')
        # Not all pages define this value
        if playerKey is not None:
@ -114,8 +129,12 @@ class BrightcoveIE(InfoExtractor):
        if it can't be found
        """
        m_brightcove = re.search(
-            r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>',
+            r'''(?sx)<object
-            webpage, re.DOTALL)
+            (?:
                :[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1 |
                [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
            ).+?</object>''',
            webpage)
        if m_brightcove is not None:
            return cls._build_brighcove_url(m_brightcove.group())
        else:
@ -156,6 +175,7 @@ class BrightcoveIE(InfoExtractor):
        info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json')
        info = json.loads(info)['data']
        video_info = info['programmedContent']['videoPlayer']['mediaDTO']
        video_info['_youtubedl_adServerURL'] = info.get('adServerURL')
        return self._extract_video_info(video_info)
@ -193,6 +213,23 @@ class BrightcoveIE(InfoExtractor):
            info.update({
                'url': video_info['FLVFullLengthURL'],
            })
-        else:
+
        if self._downloader.params.get('include_ads', False):
            adServerURL = video_info.get('_youtubedl_adServerURL')
            if adServerURL:
                ad_info = {
                    '_type': 'url',
                    'url': adServerURL,
                }
                if 'url' in info:
                    return {
                        '_type': 'playlist',
                        'title': info['title'],
                        'entries': [ad_info, info],
                    }
                else:
                    return ad_info
        if 'url' not in info:
            raise ExtractorError('Unable to extract video url for %s' % info['id'])
        return info
--- a/youtube_dl/extractor/servingsys.py
+++ b/youtube_dl/extractor/servingsys.py
@ -0,0 +1,70 @@
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..utils import (
    int_or_none,
 )
 class ServingSysIE(InfoExtractor):
    """Extractor for serving-sys.com ``adServer.bs`` URLs.

    The placement id is read from the ``pli`` query parameter of the URL and
    the result is a playlist with one video entry per additional asset
    listed in the ad's info document.
    """

    _VALID_URL = r'https?://(?:[^.]+\.)?serving-sys\.com/BurstingPipe/adServer\.bs\?.*?&pli=(?P<id>[0-9]+)'

    _TEST = {
        'url': 'http://bs.serving-sys.com/BurstingPipe/adServer.bs?cn=is&c=23&pl=VAST&pli=5349193&PluID=0&pos=7135&ord=[timestamp]&cim=1?',
        'playlist': [{
            'file': '29955898.flv',
            'md5': 'baed851342df6846eb8677a60a011a0f',
            'info_dict': {
                'title': 'AdAPPter_Hyundai_demo (1)',
                'duration': 74,
                'tbr': 1378,
                'width': 640,
                'height': 400,
            },
        }, {
            'file': '29907998.flv',
            'md5': '979b4da2655c4bc2d81aeb915a8c5014',
            'info_dict': {
                'title': 'AdAPPter_Hyundai_demo (2)',
                'duration': 34,
                'width': 854,
                'height': 480,
                'tbr': 516,
            },
        }],
        'params': {
            'playlistend': 2,
        }
    }

    def _real_extract(self, url):
        """Build the playlist result for the given ad server URL."""
        pl_id = re.match(self._VALID_URL, url).group('id')

        # First request: the ad server URL itself returns an XML document
        # carrying the ad title and a MediaFile element.
        vast_doc = self._download_xml(url, pl_id)
        title = vast_doc.find('.//AdTitle').text
        media = vast_doc.find('.//MediaFile').text

        # The MediaFile text embeds, as its ``adData`` parameter, the address
        # of a second XML document that lists the individual assets.
        info_url = self._search_regex(r'&adData=([^&]+)&', media, 'info URL')
        doc = self._download_xml(info_url, pl_id, 'Downloading video info')

        entries = []
        for asset in doc.findall('.//AdditionalAssets/asset'):
            attrs = asset.attrib
            entries.append({
                '_type': 'video',
                'id': attrs['id'],
                'title': '%s (%s)' % (title, attrs['assetID']),
                'url': attrs['URL'],
                # The numeric attributes are optional in the asset element.
                'duration': int_or_none(attrs.get('length')),
                'tbr': int_or_none(attrs.get('bitrate')),
                'height': int_or_none(attrs.get('height')),
                'width': int_or_none(attrs.get('width')),
            })

        return {
            '_type': 'playlist',
            'id': pl_id,
            'title': title,
            'entries': entries,
        }