[youtube] Separate methods for embeds extraction

2017-09-06 00:48:37 +07:00 · 2017-09-06 00:48:37 +07:00 · 66c9fa36c1
parent c5c9bf0c12
commit 66c9fa36c1
2 changed files with 41 additions and 29 deletions
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@ -2243,36 +2243,11 @@ class GenericIE(InfoExtractor):
        if vid_me_embed_url is not None:
            return self.url_result(vid_me_embed_url, 'Vidme')
-        # Look for embedded YouTube player
+        # Look for YouTube embeds
-        matches = re.findall(r'''(?x)
+        youtube_urls = YoutubeIE._extract_urls(webpage)
-            (?:
+        if youtube_urls:
                <iframe[^>]+?src=|
                data-video-url=|
                <embed[^>]+?src=|
                embedSWF\(?:\s*|
                <object[^>]+data=|
                new\s+SWFObject\(
            )
            (["\'])
                (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
                (?:embed|v|p)/.+?)
            \1''', webpage)
        if matches:
            return self.playlist_from_matches(
-                matches, video_id, video_title, lambda m: unescapeHTML(m[1]))
+                youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key())
        # Look for lazyYT YouTube embed
        matches = re.findall(
            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
        if matches:
            return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m))
        # Look for Wordpress "YouTube Video Importer" plugin
        matches = re.findall(r'''(?x)<div[^>]+
            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
        if matches:
            return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1])
        matches = DailymotionIE._extract_urls(webpage)
        if matches:
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@ -1374,6 +1374,43 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
            playback_url, video_id, 'Marking watched',
            'Unable to mark watched', fatal=False)
    @staticmethod
    def _extract_urls(webpage):
        """Collect every YouTube embed referenced in a page's HTML.

        Three sources are scanned and their results concatenated in this
        order (page order within each source):

        1. Player URLs on youtube.com / youtube-nocookie.com with an
           ``/embed/``, ``/v/`` or ``/p/`` path, found in iframe, embed or
           object markup, ``data-video-url`` attributes, or
           ``embedSWF(...)`` / ``new SWFObject(...)`` calls.
        2. ``data-youtube-id`` attribute values of lazyYT elements.
        3. ``data-video_id`` attribute values of Wordpress "YouTube Video
           Importer" player divs.

        Sources 2 and 3 yield the raw attribute value rather than a full
        URL.  Results of sources 1 and 2 are HTML-unescaped; source 3 is
        returned verbatim.  Duplicates are not removed.

        :param webpage: HTML source to scan.
        :return: list of matched strings; empty when nothing is found.
        """
        # Embedded YouTube player.
        # The embedSWF alternative used to read ``embedSWF\(?:\s*`` which
        # demands a literal colon and therefore never matched a real
        # ``embedSWF("...")`` call.  It now accepts ``embedSWF(`` followed by
        # optional whitespace, and still accepts the legacy colon form so
        # nothing that matched before stops matching.  Both additions are
        # non-capturing, so ``\1`` still refers to the opening quote.
        entries = [
            unescapeHTML(mobj.group('url'))
            for mobj in re.finditer(r'''(?x)
            (?:
                <iframe[^>]+?src=|
                data-video-url=|
                <embed[^>]+?src=|
                embedSWF(?:\(\s*|\(?:\s*)|
                <object[^>]+data=|
                new\s+SWFObject\(
            )
            (["\'])
                (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
                (?:embed|v|p)/.+?)
            \1''', webpage)]

        # lazyYT YouTube embed
        entries.extend(
            unescapeHTML(youtube_id)
            for youtube_id in re.findall(
                r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))

        # Wordpress "YouTube Video Importer" plugin
        matches = re.findall(r'''(?x)<div[^>]+
            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
        # findall yields (q1, q2, video_id) tuples; keep only the id.
        entries.extend(m[-1] for m in matches)

        return entries
    @staticmethod
    def _extract_url(webpage):
        """Return the first YouTube embed found in ``webpage``, or None.

        Convenience wrapper around ``YoutubeIE._extract_urls`` for callers
        that only care about a single result.
        """
        found = YoutubeIE._extract_urls(webpage)
        if not found:
            return None
        return found[0]
    @classmethod
    def extract_id(cls, url):
        mobj = re.match(cls._VALID_URL, url, re.VERBOSE)