[extractor/common] Add the encoding parameter
The QQMusic info extractor needs a forced encoding to work correctly.
This commit is contained in:
parent
a685ae511a
commit
c9a779695d
|
@ -324,7 +324,7 @@ class InfoExtractor(object):
|
||||||
self._downloader.report_warning(errmsg)
|
self._downloader.report_warning(errmsg)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
|
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
|
||||||
""" Returns a tuple (page content as string, URL handle) """
|
""" Returns a tuple (page content as string, URL handle) """
|
||||||
# Strip hashes from the URL (#1038)
|
# Strip hashes from the URL (#1038)
|
||||||
if isinstance(url_or_request, (compat_str, str)):
|
if isinstance(url_or_request, (compat_str, str)):
|
||||||
|
@ -334,14 +334,11 @@ class InfoExtractor(object):
|
||||||
if urlh is False:
|
if urlh is False:
|
||||||
assert not fatal
|
assert not fatal
|
||||||
return False
|
return False
|
||||||
content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
|
content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
|
||||||
return (content, urlh)
|
return (content, urlh)
|
||||||
|
|
||||||
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
|
@staticmethod
|
||||||
content_type = urlh.headers.get('Content-Type', '')
|
def _guess_encoding_from_content(content_type, webpage_bytes):
|
||||||
webpage_bytes = urlh.read()
|
|
||||||
if prefix is not None:
|
|
||||||
webpage_bytes = prefix + webpage_bytes
|
|
||||||
m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
|
m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
|
||||||
if m:
|
if m:
|
||||||
encoding = m.group(1)
|
encoding = m.group(1)
|
||||||
|
@ -354,6 +351,16 @@ class InfoExtractor(object):
|
||||||
encoding = 'utf-16'
|
encoding = 'utf-16'
|
||||||
else:
|
else:
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
|
|
||||||
|
return encoding
|
||||||
|
|
||||||
|
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
|
||||||
|
content_type = urlh.headers.get('Content-Type', '')
|
||||||
|
webpage_bytes = urlh.read()
|
||||||
|
if prefix is not None:
|
||||||
|
webpage_bytes = prefix + webpage_bytes
|
||||||
|
if not encoding:
|
||||||
|
encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
|
||||||
if self._downloader.params.get('dump_intermediate_pages', False):
|
if self._downloader.params.get('dump_intermediate_pages', False):
|
||||||
try:
|
try:
|
||||||
url = url_or_request.get_full_url()
|
url = url_or_request.get_full_url()
|
||||||
|
@ -410,13 +417,13 @@ class InfoExtractor(object):
|
||||||
|
|
||||||
return content
|
return content
|
||||||
|
|
||||||
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
|
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
|
||||||
""" Returns the data of the page as a string """
|
""" Returns the data of the page as a string """
|
||||||
success = False
|
success = False
|
||||||
try_count = 0
|
try_count = 0
|
||||||
while success is False:
|
while success is False:
|
||||||
try:
|
try:
|
||||||
res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
|
res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
|
||||||
success = True
|
success = True
|
||||||
except compat_http_client.IncompleteRead as e:
|
except compat_http_client.IncompleteRead as e:
|
||||||
try_count += 1
|
try_count += 1
|
||||||
|
@ -431,10 +438,10 @@ class InfoExtractor(object):
|
||||||
|
|
||||||
def _download_xml(self, url_or_request, video_id,
|
def _download_xml(self, url_or_request, video_id,
|
||||||
note='Downloading XML', errnote='Unable to download XML',
|
note='Downloading XML', errnote='Unable to download XML',
|
||||||
transform_source=None, fatal=True):
|
transform_source=None, fatal=True, encoding=None):
|
||||||
"""Return the xml as an xml.etree.ElementTree.Element"""
|
"""Return the xml as an xml.etree.ElementTree.Element"""
|
||||||
xml_string = self._download_webpage(
|
xml_string = self._download_webpage(
|
||||||
url_or_request, video_id, note, errnote, fatal=fatal)
|
url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
|
||||||
if xml_string is False:
|
if xml_string is False:
|
||||||
return xml_string
|
return xml_string
|
||||||
if transform_source:
|
if transform_source:
|
||||||
|
@ -445,9 +452,10 @@ class InfoExtractor(object):
|
||||||
note='Downloading JSON metadata',
|
note='Downloading JSON metadata',
|
||||||
errnote='Unable to download JSON metadata',
|
errnote='Unable to download JSON metadata',
|
||||||
transform_source=None,
|
transform_source=None,
|
||||||
fatal=True):
|
fatal=True, encoding=None):
|
||||||
json_string = self._download_webpage(
|
json_string = self._download_webpage(
|
||||||
url_or_request, video_id, note, errnote, fatal=fatal)
|
url_or_request, video_id, note, errnote, fatal=fatal,
|
||||||
|
encoding=encoding)
|
||||||
if (not fatal) and json_string is False:
|
if (not fatal) and json_string is False:
|
||||||
return None
|
return None
|
||||||
return self._parse_json(
|
return self._parse_json(
|
||||||
|
|
|
@ -24,7 +24,7 @@ class QQMusicIE(InfoExtractor):
|
||||||
'title': '可惜没如果',
|
'title': '可惜没如果',
|
||||||
'upload_date': '20141227',
|
'upload_date': '20141227',
|
||||||
'creator': '林俊杰',
|
'creator': '林俊杰',
|
||||||
'description': 'md5:242c97c2847e0495583b7b13764f7106',
|
'description': 'md5:4348ff1dd24036906baa7b6f973f8d30',
|
||||||
}
|
}
|
||||||
}]
|
}]
|
||||||
|
|
||||||
|
@ -41,7 +41,7 @@ class QQMusicIE(InfoExtractor):
|
||||||
detail_info_page = self._download_webpage(
|
detail_info_page = self._download_webpage(
|
||||||
'http://s.plcloud.music.qq.com/fcgi-bin/fcg_yqq_song_detail_info.fcg?songmid=%s&play=0' % mid,
|
'http://s.plcloud.music.qq.com/fcgi-bin/fcg_yqq_song_detail_info.fcg?songmid=%s&play=0' % mid,
|
||||||
mid, note='Download song detail info',
|
mid, note='Download song detail info',
|
||||||
errnote='Unable to get song detail info')
|
errnote='Unable to get song detail info', encoding='gbk')
|
||||||
|
|
||||||
song_name = self._html_search_regex(
|
song_name = self._html_search_regex(
|
||||||
r"songname:\s*'([^']+)'", detail_info_page, 'song name')
|
r"songname:\s*'([^']+)'", detail_info_page, 'song name')
|
||||||
|
|
Loading…
Reference in New Issue