[netzkino] Add new extractor (Fixes #4669)
This commit is contained in:
parent
b8da6b9fc6
commit
dd622d7c4e
|
@ -110,6 +110,20 @@ def expect_info_dict(self, got_dict, expected_dict):
|
||||||
else:
|
else:
|
||||||
if isinstance(expected, compat_str) and expected.startswith('md5:'):
|
if isinstance(expected, compat_str) and expected.startswith('md5:'):
|
||||||
got = 'md5:' + md5(got_dict.get(info_field))
|
got = 'md5:' + md5(got_dict.get(info_field))
|
||||||
|
elif isinstance(expected, compat_str) and expected.startswith('mincount:'):
|
||||||
|
got = got_dict.get(info_field)
|
||||||
|
self.assertTrue(
|
||||||
|
isinstance(got, list),
|
||||||
|
'Expected field %s to be a list, but it is of type %s' % (
|
||||||
|
info_field, type(got).__name__))
|
||||||
|
expected_num = int(expected.partition(':')[2])
|
||||||
|
assertGreaterEqual(
|
||||||
|
self, len(got), expected_num,
|
||||||
|
'Expected %d items in field %s, but only got %d' % (
|
||||||
|
expected_num, info_field, len(got)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
continue
|
||||||
else:
|
else:
|
||||||
got = got_dict.get(info_field)
|
got = got_dict.get(info_field)
|
||||||
self.assertEqual(expected, got,
|
self.assertEqual(expected, got,
|
||||||
|
|
|
@ -274,6 +274,7 @@ from .nbc import (
|
||||||
)
|
)
|
||||||
from .ndr import NDRIE
|
from .ndr import NDRIE
|
||||||
from .ndtv import NDTVIE
|
from .ndtv import NDTVIE
|
||||||
|
from .netzkino import NetzkinoIE
|
||||||
from .nerdcubed import NerdCubedFeedIE
|
from .nerdcubed import NerdCubedFeedIE
|
||||||
from .newgrounds import NewgroundsIE
|
from .newgrounds import NewgroundsIE
|
||||||
from .newstube import NewstubeIE
|
from .newstube import NewstubeIE
|
||||||
|
|
|
@ -147,6 +147,17 @@ class InfoExtractor(object):
|
||||||
like_count: Number of positive ratings of the video
|
like_count: Number of positive ratings of the video
|
||||||
dislike_count: Number of negative ratings of the video
|
dislike_count: Number of negative ratings of the video
|
||||||
comment_count: Number of comments on the video
|
comment_count: Number of comments on the video
|
||||||
|
comments: A list of comments, each with one or more of the following
|
||||||
|
properties (all but one of text or html optional):
|
||||||
|
* "author" - human-readable name of the comment author
|
||||||
|
* "author_id" - user ID of the comment author
|
||||||
|
* "id" - Comment ID
|
||||||
|
* "html" - Comment as HTML
|
||||||
|
* "text" - Plain text of the comment
|
||||||
|
* "timestamp" - UNIX timestamp of comment
|
||||||
|
* "parent" - ID of the comment this one is replying to.
|
||||||
|
Set to "root" to indicate that this is a
|
||||||
|
comment to the original video.
|
||||||
age_limit: Age restriction for the video, as an integer (years)
|
age_limit: Age restriction for the video, as an integer (years)
|
||||||
webpage_url: The url to the video webpage, if given to youtube-dl it
|
webpage_url: The url to the video webpage, if given to youtube-dl it
|
||||||
should allow to get the same result again. (It will be set
|
should allow to get the same result again. (It will be set
|
||||||
|
|
|
@ -0,0 +1,86 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from .common import InfoExtractor
|
||||||
|
from ..utils import (
    clean_html,
    ExtractorError,
    int_or_none,
    js_to_json,
    parse_iso8601,
)
|
||||||
|
|
||||||
|
|
||||||
|
class NetzkinoIE(InfoExtractor):
    """Extractor for films addressed as netzkino.de/#!/<category>/<slug>.

    Metadata comes from the per-category JSON API; the stream URL templates
    are scraped from the site's player JavaScript.
    """

    _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/(?P<category>[^/]+)/(?P<id>[^/]+)'

    _TEST = {
        'url': 'http://www.netzkino.de/#!/scifikino/rakete-zum-mond',
        'md5': '92a3f8b76f8d7220acce5377ea5d4873',
        'info_dict': {
            'id': 'rakete-zum-mond',
            'ext': 'mp4',
            'title': 'Rakete zum Mond (Endstation Mond, Destination Moon)',
            'comments': 'mincount:3',
            'description': 'md5:1eddeacc7e62d5a25a2d1a7290c64a28',
            'upload_date': '20120813',
            # Raw string so that "\." reaches the regex engine untouched.
            'thumbnail': r're:https?://.*\.jpg$',
            'timestamp': 1344858571,
            'age_limit': 12,
        },
    }

    def _real_extract(self, url):
        """Build the info dict for the film referenced by *url*.

        Downloads the category listing, picks the post whose slug equals the
        video id, then combines its streaming file name with the URL
        templates found in the player code to produce the formats.

        Raises ExtractorError if the category listing contains no post with
        the requested slug.
        """
        mobj = re.match(self._VALID_URL, url)
        category_id = mobj.group('category')
        video_id = mobj.group('id')

        api_url = 'http://api.netzkino.de.simplecache.net/capi-2.0a/categories/%s.json?d=www' % category_id
        api_info = self._download_json(api_url, video_id)

        # Supply a default so that a missing slug is reported as an
        # extraction error instead of leaking a bare StopIteration.
        info = next(
            (p for p in api_info['posts'] if p['slug'] == video_id), None)
        if info is None:
            raise ExtractorError(
                'Unable to find video %s in category %s' % (
                    video_id, category_id))
        custom_fields = info['custom_fields']

        production_js = self._download_webpage(
            'http://www.netzkino.de/beta/dist/production.min.js', video_id,
            note='Downloading player code')
        avo_js = self._search_regex(
            r'window\.avoCore\s*=.*?urlTemplate:\s*(\{.*?"\})',
            production_js, 'URL templates')
        templates = self._parse_json(
            avo_js, video_id, transform_source=js_to_json)

        # Path suffix appended to each URL template, keyed by template name.
        suffix = {
            'hds': '.mp4/manifest.f4m',
            'hls': '.mp4/master.m3u8',
            'pmd': '.mp4',
        }
        film_fn = custom_fields['Streaming'][0]
        # Templates without a known suffix are skipped rather than aborting
        # the whole extraction with a KeyError.
        formats = [{
            'format_id': key,
            'ext': 'mp4',
            'url': tpl.replace('{}', film_fn) + suffix[key],
        } for key, tpl in templates.items() if key in suffix]
        self._sort_formats(formats)

        comments = [{
            'timestamp': parse_iso8601(c.get('date'), delimiter=' '),
            'id': c['id'],
            'author': c['name'],
            'html': c['content'],
            # A parent of 0 (or no parent at all) marks a top-level comment.
            'parent': 'root' if c.get('parent', 0) == 0 else c['parent'],
        } for c in info.get('comments', [])]

        # 'FSK' may be absent or empty; fall back to a None age limit
        # instead of subscripting None.
        fsk_values = custom_fields.get('FSK') or [None]

        return {
            'id': video_id,
            'formats': formats,
            'comments': comments,
            'title': info['title'],
            'age_limit': int_or_none(fsk_values[0]),
            'timestamp': parse_iso8601(info.get('date'), delimiter=' '),
            'description': clean_html(info.get('content')),
            'thumbnail': info.get('thumbnail'),
            'playlist_title': api_info.get('title'),
            'playlist_id': category_id,
        }
|
|
@ -205,6 +205,10 @@ def get_element_by_attribute(attribute, value, html):
|
||||||
|
|
||||||
def clean_html(html):
|
def clean_html(html):
|
||||||
"""Clean an HTML snippet into a readable string"""
|
"""Clean an HTML snippet into a readable string"""
|
||||||
|
|
||||||
|
if html is None: # Convenience for sanitizing descriptions etc.
|
||||||
|
return html
|
||||||
|
|
||||||
# Newline vs <br />
|
# Newline vs <br />
|
||||||
html = html.replace('\n', ' ')
|
html = html.replace('\n', ' ')
|
||||||
html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
|
html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
|
||||||
|
|
Loading…
Reference in New Issue