[cinchcast] Add new extractor (Fixes #4428)

2014-12-12 02:57:36 +01:00 · 2014-12-12 02:57:36 +01:00 · 42bdd9d051
parent 4e40de6e2a
commit 42bdd9d051
5 changed files with 88 additions and 6 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -144,6 +144,9 @@ class TestUtil(unittest.TestCase):
        self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
        self.assertEqual(unified_strdate('1968-12-10'), '19681210')
        self.assertEqual(unified_strdate('28/01/2014 21:00:00 +0100'), '20140128')
        self.assertEqual(
            unified_strdate('11/26/2014 11:30:00 AM PST', day_first=False),
            '20141126')
    def test_find_xpath_attr(self):
        testxml = '''<root>
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -51,6 +51,7 @@ from .cbsnews import CBSNewsIE
 from .ceskatelevize import CeskaTelevizeIE
 from .channel9 import Channel9IE
 from .chilloutzone import ChilloutzoneIE
 from .cinchcast import CinchcastIE
 from .clipfish import ClipfishIE
 from .cliphunter import CliphunterIE
 from .clipsyndicate import ClipsyndicateIE
--- a/youtube_dl/extractor/cinchcast.py
+++ b/youtube_dl/extractor/cinchcast.py
@@ -0,0 +1,53 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from .common import InfoExtractor
 from ..utils import (
    int_or_none,
    unified_strdate,
    xpath_text,
 )
 class CinchcastIE(InfoExtractor):
    """Extractor for media embedded through the player.cinchcast.com player.

    The asset id is taken from the ``assetId`` query parameter of the player
    URL and used to fetch an MRSS document from blogtalkradio.com, from which
    the title, upload date and media URLs are read.
    """

    _VALID_URL = r'https?://player\.cinchcast\.com/.*?assetId=(?P<id>[0-9]+)'
    _TEST = {
        # Actual test is run in generic, look for undergroundwellness
        'url': 'http://player.cinchcast.com/?platformId=1&#038;assetType=single&#038;assetId=7141703',
        'only_matching': True,
    }

    def _real_extract(self, url):
        """Return the info dict (id, title, upload_date, formats) for *url*.

        Raises ExtractorError when the feed contains no ``item`` element or
        when neither a main nor a backup media URL can be found.
        """
        # Function-scope import keeps this block independent of the
        # module-level import list.
        from ..utils import ExtractorError

        video_id = self._match_id(url)
        doc = self._download_xml(
            'http://www.blogtalkradio.com/playerasset/mrss?assetType=single&assetId=%s' % video_id,
            video_id)

        item = doc.find('.//item')
        if item is None:
            # Without this check the lookups below would fail on None with
            # an unrelated AttributeError.
            raise ExtractorError(
                'Could not find any item element in the Cinchcast feed')

        title = xpath_text(item, './title', fatal=True)
        date_str = xpath_text(
            item, './{http://developer.longtailvideo.com/trac/}date')
        # The feed date is parsed month-first, hence day_first=False.
        upload_date = unified_strdate(date_str, day_first=False)
        # duration is present but wrong

        formats = []

        # The main URL lives in the "url" attribute of <media:content>; both
        # the element and the attribute are treated as optional so that a
        # feed carrying only the backup URL still works.
        content = item.find('./{http://search.yahoo.com/mrss/}content')
        main_url = content.get('url') if content is not None else None
        if main_url:
            formats.append({
                'format_id': 'main',
                'url': main_url,
            })

        backup_url = xpath_text(
            item, './{http://developer.longtailvideo.com/trac/}backupContent')
        if backup_url:
            formats.append({
                'preference': 2,  # seems to be more reliable
                'format_id': 'backup',
                'url': backup_url,
            })

        if not formats:
            raise ExtractorError(
                'Could not find any media URL in the Cinchcast feed')
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'upload_date': upload_date,
            'formats': formats,
        }
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -467,8 +467,17 @@ class GenericIE(InfoExtractor):
            'expected_warnings': [
                'URL could be a direct video link, returning it as such.'
            ]
-        }
+        },
-
+        # Cinchcast embed
        {
            'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
            'info_dict': {
                'id': '7141703',
                'ext': 'mp3',
                'upload_date': '20141126',
                'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
            }
        },
    ]
    def report_following_redirect(self, new_url):
@@ -962,6 +971,13 @@ class GenericIE(InfoExtractor):
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'SBS')
        # Look for embedded Cinchcast player
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
            webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'Cinchcast')
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
            webpage)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -166,7 +166,7 @@ def xpath_text(node, xpath, name=None, fatal=False):
        xpath = xpath.encode('ascii')
    n = node.find(xpath)
-    if n is None:
+    if n is None or n.text is None:
        if fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
@@ -644,17 +644,19 @@ def parse_iso8601(date_str, delimiter='T'):
    return calendar.timegm(dt.timetuple())
-def unified_strdate(date_str):
+def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
@@ -669,7 +671,6 @@ def unified_strdate(date_str):
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%d/%m/%Y %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
@@ -681,6 +682,14 @@ def unified_strdate(date_str):
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m/%d/%Y %H:%M:%S',
        ])
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')