Add CSpanIE (closes #312)

2013-06-26 17:55:54 +02:00 · 2013-06-26 17:55:54 +02:00 · aa0c87391c
parent 2e32528012
commit aa0c87391c
3 changed files with 56 additions and 0 deletions
--- a/test/tests.json
+++ b/test/tests.json
@ -695,5 +695,15 @@
    "info_dict": {
        "title": "卡马乔国足开大脚长传冲吊集锦"
    }
  },
  {
    "name": "CSpan",
    "url": "http://www.c-spanvideo.org/program/HolderonV",
    "file": "315139.flv",
    "md5": "74a623266956f69e4df0068ab6c80fe4",
    "info_dict": {
        "title": "Attorney General Eric Holder on Voting Rights Act Decision"
    },
    "skip": "Requires rtmpdump"
  }
 ]
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@ -6,6 +6,7 @@ from .bliptv import BlipTVIE, BlipTVUserIE
 from .breakcom import BreakIE
 from .collegehumor import CollegeHumorIE
 from .comedycentral import ComedyCentralIE
 from .cspan import CSpanIE
 from .dailymotion import DailymotionIE
 from .depositfiles import DepositFilesIE
 from .eighttracks import EightTracksIE
@ -132,6 +133,7 @@ def gen_extractors():
        VevoIE(),
        JukeboxIE(),
        TudouIE(),
        CSpanIE(),
        GenericIE()
    ]
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@ -0,0 +1,44 @@
 import re
 from .common import InfoExtractor
 from ..utils import (
    compat_urllib_parse,
 )
 class CSpanIE(InfoExtractor):
    """Information extractor for program pages on www.c-spanvideo.org.

    The program page only embeds a numeric program id; the stream
    location and title come from a second request to the site's
    ``flashXml.php`` service.
    """

    # The dots in the host name are escaped so that they match a literal
    # '.' only; unescaped, each one would match any character and the
    # pattern would accept unrelated hosts.  Group 1 is the program name
    # taken from the URL path.
    _VALID_URL = r'http://www\.c-spanvideo\.org/program/(.*)'

    def _real_extract(self, url):
        """Build the info dictionary for a single program page.

        Steps performed:
          1. Download the program page and read the ``programid`` value
             out of it.
          2. Download the ``flashXml.php`` document for that id.
          3. Take the title, stream URL and play path from that document,
             and the description and thumbnail from the page's meta tags.

        Arguments:
          url -- program page URL; expected to match ``_VALID_URL``.

        Returns a dict with the keys ``id``, ``title``, ``ext``, ``url``,
        ``play_path``, ``description`` and ``thumbnail``.

        NOTE(review): the regex helpers inherited from InfoExtractor are
        presumably what reports a missing field -- confirm their failure
        behaviour against the base class.
        """
        mobj = re.match(self._VALID_URL, url)
        prog_name = mobj.group(1)

        # The program name is only used as the display id for this first
        # download; the real video id is embedded in the page.
        webpage = self._download_webpage(url, prog_name)
        video_id = self._search_regex(r'programid=(.*?)&', webpage, 'video id')

        query = compat_urllib_parse.urlencode({'programid': video_id,
                                               'dynamic':'1'})
        info_url = 'http://www.c-spanvideo.org/common/services/flashXml.php?' + query
        video_info = self._download_webpage(info_url, video_id, u'Downloading video info')

        self.report_extraction(video_id)

        title = self._html_search_regex(r'<string name="title">(.*?)</string>',
                                        video_info, 'title')
        # The description is accepted from either the og:description or
        # the plain description meta tag; DOTALL lets it span lines.
        description = self._html_search_regex(r'<meta (?:property="og:|name=")description" content="(.*?)"',
                                              webpage, 'description',
                                              flags=re.MULTILINE|re.DOTALL)
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.*?)"',
                                            webpage, 'thumbnail')

        # The service returns a URL template; fill in the protocol and
        # port placeholders to get a usable RTMP address.  A separate
        # local name is used so the ``url`` argument is not overwritten.
        video_url = self._search_regex(r'<string name="URL">(.*?)</string>',
                                       video_info, 'video url')
        video_url = video_url.replace('$(protocol)', 'rtmp').replace('$(port)', '443')
        play_path = self._search_regex(r'<string name="path">(.*?)</string>',
                                       video_info, 'rtmp play path')

        return {'id': video_id,
                'title': title,
                'ext': 'flv',
                'url': video_url,
                'play_path': play_path,
                'description': description,
                'thumbnail': thumbnail,
                }