# coding: utf-8
from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import netrc
import os
import random
import re
import socket
import ssl
import sys
import time
import math

from ..compat import (
    compat_cookiejar_Cookie,
    compat_cookies,
    compat_etree_Element,
    compat_etree_fromstring,
    compat_getpass,
    compat_integer_types,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
    compat_xml_parse_error,
)
from ..downloader.f4m import (
    get_base_url,
    remove_encrypted_media,
)
from ..utils import (
    NO_DEFAULT,
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    dict_get,
    error_to_compat_str,
    ExtractorError,
    extract_attributes,
    fix_xml_ampersands,
    float_or_none,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    js_to_json,
    JSON_LD_RE,
    mimetype2ext,
    orderedSet,
    parse_bitrate,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    parse_resolution,
    RegexNotFoundError,
    sanitized_Request,
    sanitize_filename,
    str_or_none,
    str_to_int,
    strip_or_none,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    urljoin,
    url_basename,
    url_or_none,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)

class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
94
    passed to the YoutubeDL. The YoutubeDL processes this
95
96
97
    information possibly downloading the video to the file system, among
    other possible outcomes.

zouhair's avatar
zouhair committed
98
    The type field determines the type of the result.
99
100
101
102
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:
103
104
105

    id:             Video identifier.
    title:          Video title, unescaped.
106

107
    Additionally, it must contain either a formats entry or a url one:
108

109
110
111
112
    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
113
114
115
116
117
                    * url        The mandatory URL representing the media:
                                   for plain file media - HTTP URL of this file,
                                   for RTMP - RTMP URL,
                                   for HLS - URL of the M3U8 media playlist,
                                   for HDS - URL of the F4M manifest,
118
119
120
121
122
                                   for DASH
                                     - HTTP URL to plain file media (in case of
                                       unfragmented media)
                                     - URL of the MPD manifest or base URL
                                       representing the media if MPD manifest
Sergey M․'s avatar
Sergey M․ committed
123
                                       is parsed from a string (in case of
124
                                       fragmented media)
125
                                   for MSS - URL of the ISM manifest.
126
127
                    * manifest_url
                                 The URL of the manifest file in case of
128
129
130
131
132
                                 fragmented media:
                                   for HLS - URL of the M3U8 master playlist,
                                   for HDS - URL of the F4M manifest,
                                   for DASH - URL of the MPD manifest,
                                   for MSS - URL of the ISM manifest.
133
                    * ext        Will be calculated from URL if missing
134
135
136
137
138
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height.
                                 and format_note fields if missing.
                    * format_id  A short description of the format
139
140
                                 ("mp4_h264_opus" or "19").
                                Technically optional, but strongly recommended.
141
142
143
144
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
145
                    * resolution Textual description of width and height
146
                    * tbr        Average bitrate of audio and video in KBit/s
147
148
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
149
                    * asr        Audio sampling rate in Hertz
150
                    * vbr        Average video bitrate in KBit/s
151
                    * fps        Frame rate
152
                    * vcodec     Name of the video codec in use
153
                    * container  Name of the container format
154
                    * filesize   The number of bytes, if known in advance
155
                    * filesize_approx  An estimate for the number of bytes
156
                    * player_url SWF Player URL (used for rtmpdump).
157
158
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
159
                                 "http", "https", "rtsp", "rtmp", "rtmpe",
160
                                 "m3u8", "m3u8_native" or "http_dash_segments".
161
162
163
164
165
166
167
168
169
170
171
172
173
                    * fragment_base_url
                                 Base URL for fragments. Each fragment's path
                                 value (if present) will be relative to
                                 this URL.
                    * fragments  A list of fragments of a fragmented media.
                                 Each fragment entry must contain either an url
                                 or a path. If an url is present it should be
                                 considered by a client. Otherwise both path and
                                 fragment_base_url must be present. Here is
                                 the list of all potential fields:
                                 * "url" - fragment's URL
                                 * "path" - fragment's path relative to
                                            fragment_base_url
174
175
                                 * "duration" (optional, int or float)
                                 * "filesize" (optional, int)
176
                    * preference Order number of this format. If this field is
177
                                 present and not None, the formats get sorted
178
                                 by this field, regardless of all other values.
179
180
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
181
182
                                 < -1000 to hide the format (if there is
                                    another one which is strictly better)
183
184
185
                    * language   Language code, e.g. "de" or "en-US".
                    * language_preference  Is this in the language mentioned in
                                 the URL?
186
187
188
                                 10 if it's what the URL is about,
                                 -1 for default (don't know),
                                 -10 otherwise, other values reserved for now.
189
190
191
192
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
Philipp Hagemeister's avatar
Philipp Hagemeister committed
193
194
195
196
                    * source_preference  Order number for this video source
                                  (quality takes higher priority)
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
197
198
                    * http_headers  A dictionary of additional HTTP headers
                                 to add to the request.
199
                    * stretched_ratio  If given and not 1, indicates that the
200
201
202
203
                                 video's pixels are not square.
                                 width : height ratio as float.
                    * no_resume  The server does not support resuming the
                                 (HTTP or RTMP) download. Boolean.
204
205
                    * downloader_options  A dictionary of downloader options as
                                 described in FileDownloader
206

Philipp Hagemeister's avatar
Philipp Hagemeister committed
207
    url:            Final video URL.
208
    ext:            Video filename extension.
209
210
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).
211

212
213
    The following fields are optional:

214
    alt_title:      A secondary title of the video.
Philipp Hagemeister's avatar
Philipp Hagemeister committed
215
216
217
218
    display_id      An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
219
    thumbnails:     A list of dictionaries, with the following entries:
Philipp Hagemeister's avatar
Philipp Hagemeister committed
220
                        * "id" (optional, string) - Thumbnail format ID
221
                        * "url"
Philipp Hagemeister's avatar
Philipp Hagemeister committed
222
                        * "preference" (optional, int) - quality of the image
223
224
                        * "width" (optional, int)
                        * "height" (optional, int)
225
                        * "resolution" (optional, string "{width}x{height}",
226
                                        deprecated)
227
                        * "filesize" (optional, int)
228
    thumbnail:      Full URL to a video thumbnail image.
229
    description:    Full video description.
230
    uploader:       Full name of the video uploader.
231
    license:        License name the video is licensed under.
232
    creator:        The creator of the video.
233
    release_date:   The date (YYYYMMDD) when the video was released.
234
    timestamp:      UNIX timestamp of the moment the video became available.
235
    upload_date:    Video upload date (YYYYMMDD).
236
                    If not explicitly set, calculated from timestamp.
237
    uploader_id:    Nickname or id of the video uploader.
238
    uploader_url:   Full URL to a personal webpage of the video uploader.
239
    channel:        Full name of the channel the video is uploaded on.
Sergey M․'s avatar
Sergey M․ committed
240
                    Note that channel fields may or may not repeat uploader
241
242
243
                    fields. This depends on a particular extractor.
    channel_id:     Id of the channel.
    channel_url:    Full URL to a channel webpage.
244
    location:       Physical location where the video was filmed.
245
    subtitles:      The available subtitles as a dictionary in the format
246
247
248
249
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
250
                        * "data": The subtitles file contents
251
                        * "url": A URL pointing to the subtitles file
252
                    "ext" will be calculated from URL if missing
253
254
    automatic_captions: Like 'subtitles', used by the YoutubeIE for
                    automatically generated captions
255
    duration:       Length of the video in seconds, as an integer or float.
256
    view_count:     How many users have watched the video on the platform.
257
258
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
259
    repost_count:   Number of reposts of the video
260
    average_rating: Average rating give by users, the scale used depends on the webpage
261
    comment_count:  Number of comments on the video
262
263
264
265
266
267
268
269
270
271
272
    comments:       A list of comments, each with one or more of the following
                    properties (all but one of text or html optional):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
273
    age_limit:      Age restriction for the video, as an integer (years)
274
    webpage_url:    The URL to the video webpage, if given to youtube-dl it
275
276
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
277
278
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
279
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
280
281
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
282
    start_time:     Time in seconds where the reproduction should start, as
283
                    specified in the URL.
284
    end_time:       Time in seconds where the reproduction should end, as
285
                    specified in the URL.
remitamine's avatar
remitamine committed
286
287
288
289
    chapters:       A list of dictionaries, with the following entries:
                        * "start_time" - The start time of the chapter in seconds
                        * "end_time" - The end time of the chapter in seconds
                        * "title" (optional, string)
290

291
292
293
294
    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
295
296
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.
297
298

    The following fields should only be used when the video is an episode of some
299
    series, programme or podcast:
300
301
302

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
303
304
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
305
306
307
    episode:        Title of the video episode. Unlike mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
308
309
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.
310

311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

329
    Unless mentioned otherwise, the fields should be Unicode strings.
330

331
332
    Unless mentioned otherwise, None is equivalent to absence of information.

333
334

    _type "playlist" indicates multiple videos.
335
336
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.
337

338
339
340
    Additionally, playlists can have "id", "title", "description", "uploader",
    "uploader_id", "uploader_url" attributes with the same semantics as videos
    (see above).
341
342
343
344
345
346
347
348
349
350
351


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for examples multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
352
353
354
355
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
356
357
358
359
360
361
362
363
364
365
366
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


367
368
369
370
    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

Sergey M․'s avatar
Sergey M․ committed
371
    _GEO_BYPASS attribute may be set to False in order to disable
372
373
    geo restriction bypass mechanisms for a particular extractor.
    Though it won't disable explicit geo restriction bypass based on
374
    country code provided with geo_bypass_country.
Sergey M․'s avatar
Sergey M․ committed
375
376
377
378

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    geo restriction bypass mechanism right away in order to bypass
379
    geo restriction, of course, if the mechanism is not disabled.
380

Sergey M․'s avatar
Sergey M․ committed
381
382
383
    _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
    IP blocks in CIDR notation for this extractor. One of these IP blocks
    will be used by geo restriction bypass mechanism similarly
384
    to _GEO_COUNTRIES.
385

386
387
388
389
390
391
    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _GEO_IP_BLOCKS = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
401
        self._x_forwarded_for_ip = None
402
403
404
405
406
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
407
408
409
410
411
412
413

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None
414

415
416
417
418
419
420
    @classmethod
    def _match_id(cls, url):
        """Extract and return the 'id' group of _VALID_URL matched against *url*."""
        # Cache the compiled pattern on this exact class (see suitable()).
        try:
            pattern = cls.__dict__['_VALID_URL_RE']
        except KeyError:
            pattern = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        match = pattern.match(url)
        assert match
        return compat_str(match.group('id'))

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
Sergey M․'s avatar
Sergey M․ committed
430
431
432
433
        self._initialize_geo_bypass({
            'countries': self._GEO_COUNTRIES,
            'ip_blocks': self._GEO_IP_BLOCKS,
        })
Sergey M․'s avatar
Sergey M․ committed
434
435
436
437
        if not self._ready:
            self._real_initialize()
            self._ready = True

Sergey M․'s avatar
Sergey M․ committed
438
    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)

        """
        # A fake IP already configured (for this session) takes precedence;
        # everything below only runs on the first successful setup.
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self._downloader.params.get('geo_bypass', True):
                return

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,
                }

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self._downloader.params.get('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            if not ip_block:
                ip_blocks = geo_bypass_context.get('ip_blocks')
                if self._GEO_BYPASS and ip_blocks:
                    ip_block = random.choice(ip_blocks)

            if ip_block:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s as X-Forwarded-For.'
                        % self._x_forwarded_for_ip)
                return

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self._downloader.params.get('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            if not country:
                countries = geo_bypass_context.get('countries')
                if self._GEO_BYPASS and countries:
                    country = random.choice(countries)

            if country:
                # NOTE(review): GeoUtils.random_ipv4 is presumably able to map
                # a country code to an IP range as well as a CIDR block —
                # confirm against utils.GeoUtils.
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_screen(
                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                        % (self._x_forwarded_for_ip, country.upper()))

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            # At most two attempts: the second one happens only when a
            # GeoRestrictedError was raised and a fake X-Forwarded-For IP
            # could be set up (see __maybe_fake_ip_and_retry).
            for _ in range(2):
                try:
                    self.initialize()
                    ie_result = self._real_extract(url)
                    if self._x_forwarded_for_ip:
                        # Record the fake IP in the result so it survives past
                        # this extractor instance.
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            # Already a well-formed extractor error - propagate unchanged.
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)

    def __maybe_fake_ip_and_retry(self, countries):
        """Try to enable X-Forwarded-For faking for one of *countries*.

        Returns True when a fake IP was set up (the caller should retry the
        extraction), False otherwise.
        """
        # Only fake an IP when: the user did not force a specific country,
        # this extractor allows geo bypass, bypassing is not globally
        # disabled, no fake IP is active yet, and candidate countries exist.
        if (not self._downloader.params.get('geo_bypass_country', None)
                and self._GEO_BYPASS
                and self._downloader.params.get('geo_bypass', True)
                and not self._x_forwarded_for_ip
                and countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

574
575
576
    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor."""
        # Class names follow the FooIE convention; drop the trailing "IE".
        name = cls.__name__
        return compat_str(name[:-2])

    @property
    def IE_NAME(self):
        """Human-readable extractor name: the class name minus the "IE" suffix."""
        class_name = type(self).__name__
        return compat_str(class_name[:-2])

    @staticmethod
    def __can_accept_status_code(err, expected_status):
        """Decide whether *err*'s HTTP status code matches *expected_status*.

        expected_status may be None (never accept), a single integer, a
        list/tuple of integers, or a predicate over the status code.
        """
        assert isinstance(err, compat_urllib_error.HTTPError)
        if expected_status is None:
            return False
        code = err.code
        if isinstance(expected_status, compat_integer_types):
            return code == expected_status
        if isinstance(expected_status, (list, tuple)):
            return code in expected_status
        if callable(expected_status):
            return expected_status(code) is True
        assert False

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        # Progress reporting: note=None uses the default message,
        # note=False suppresses output entirely.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        # Merge data/headers/query into the request, constructing a Request
        # object if we were given a plain URL.
        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        # ssl.CertificateError is not present on all Python builds, hence the
        # hasattr guard.
        exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
        if hasattr(ssl, 'CertificateError'):
            exceptions.append(ssl.CertificateError)
        try:
            return self._downloader.urlopen(url_or_request)
        except tuple(exceptions) as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.
                    err.fp._error = err
                    return err.fp

            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            # fatal errors raise; non-fatal ones warn and return False so
            # callers can continue extraction.
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False
654

655
656
657
658
659
660
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        handle = self._request_webpage(
            url_or_request, video_id, note, errnote, fatal, data=data,
            headers=headers, query=query, expected_status=expected_status)
        if handle is False:
            # _request_webpage only returns False in non-fatal mode
            assert not fatal
            return False
        page = self._webpage_read_content(
            handle, url_or_request, video_id, note, errnote, fatal,
            encoding=encoding)
        return (page, handle)

672
673
    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
674
675
676
677
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
678
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
679
680
681
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
682
683
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
684
685
            else:
                encoding = 'utf-8'
686
687
688

        return encoding

689
690
    def __check_blocked(self, content):
        """Raise an expected ExtractorError if content looks like a known
        censorship/filtering block page."""
        first_block = content[:512]
        # Websense corporate filtering: title anywhere, vendor name near the top.
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Indian government censorship block page.
        if '<title>The URL you requested has been blocked</title>' in first_block:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        # Russian government blocklist page (as served by the TTK ISP).
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                expected=True)

717
718
719
720
721
722
723
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """
        Read the body of an open URL handle and decode it to a string.

        prefix -- optional bytes prepended to the body before decoding
        encoding -- overrides the encoding guessed from headers/content
        """
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        # --dump-pages: print the raw page, base64-encoded, to the screen.
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # --write-pages: save the raw page to a .dump file on disk.
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            if len(basen) > 240:
                # Keep the base name within filesystem limits by replacing
                # the overlong tail with an md5 of the full name.
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        # Fall back to utf-8 when the guessed/provided encoding is unknown
        # to Python's codec registry.
        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        self.__check_blocked(content)

        return content
753

754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Requestobject
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractionError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows to accept failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
                - an integer type specifying an exact failed status code to
                  accept
                - a list or a tuple of integer types specifying a list of
                  failed status codes to accept
                - a callable accepting an actual failed status code and
                  returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """

        # Retry on truncated reads up to `tries` times, sleeping `timeout`
        # seconds between attempts; any other failure propagates immediately.
        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        # res is False only in non-fatal mode; otherwise drop the URL handle
        # and return just the page content.
        if res is False:
            return res
        else:
            content, _ = res
            return content
811

812
813
814
    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as an compat_etree_Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        # res is False only in non-fatal mode.
        if res is False:
            return res
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

833
834
835
836
837
838
    def _download_xml(
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the xml as an compat_etree_Element.

        See _download_webpage docstring for arguments specification.
        """
        result = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        # Propagate the non-fatal failure marker; otherwise drop the handle.
        if result is False:
            return result
        doc, _ = result
        return doc
849
850

    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
        """
        Parse xml_string into an XML element tree.

        transform_source is applied to the string first; parse failures
        raise ExtractorError when fatal, otherwise warn and return None.
        """
        source = transform_source(xml_string) if transform_source else xml_string
        try:
            return compat_etree_fromstring(source.encode('utf-8'))
        except compat_xml_parse_error as cause:
            message = '%s: Failed to parse XML ' % video_id
            if not fatal:
                self.report_warning(message + str(cause))
                return None
            raise ExtractorError(message, cause=cause)
861

862
863
864
    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        # res is False only in non-fatal mode.
        if res is False:
            return res
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            fatal=fatal), urlh

    def _download_json(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        result = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        # Propagate the non-fatal failure marker; otherwise drop the handle.
        if result is False:
            return result
        obj, _ = result
        return obj
Tithen-Firion's avatar
Tithen-Firion committed
899
900

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
901
902
        if transform_source:
            json_string = transform_source(json_string)
903
904
905
        try:
            return json.loads(json_string)
        except ValueError as ve:
906
907
908
909
910
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))
911

912
    def report_warning(self, msg, video_id=None):
        """Forward a warning to the downloader, tagged with the extractor
        name and, when given, the video id."""
        if video_id is None:
            prefix = ''
        else:
            prefix = '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, prefix, msg))
916

917
918
    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        # All extractor screen output is funnelled through the downloader.
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
920
921
922

    def report_extraction(self, id_or_name):
        """Report that information extraction has started for id_or_name."""
        self.to_screen('%s: Extracting information' % id_or_name)
924
925
926

    def report_download_webpage(self, video_id):
        """Report that the webpage for video_id is being downloaded."""
        self.to_screen('%s: Downloading webpage' % video_id)
928
929
930

    def report_age_confirmation(self):
        """Report an attempt to confirm the user's age."""
        self.to_screen('Confirming age')
932

933
934
    def report_login(self):
        """Report an attempt to log in."""
        self.to_screen('Logging in')
936

937
938
939
940
941
942
    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        """Raise an expected ExtractorError telling the user to supply credentials."""
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)

943
    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        """Raise a GeoRestrictedError, forwarding the optional countries list."""
        raise GeoRestrictedError(msg, countries=countries)
946

Jouke Waleson's avatar
Jouke Waleson committed
947
    # Methods for following #608
948
    @staticmethod
949
    def url_result(url, ie=None, video_id=None, video_title=None):
950
        """Returns a URL that points to a page that should be processed"""
Jouke Waleson's avatar
Jouke Waleson committed
951
        # TODO: ie should be the class used for getting the info
952
953
954
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
955
956
        if video_id is not None:
            video_info['id'] = video_id
957
958
        if video_title is not None:
            video_info['title'] = video_title
959
        return video_info
Jouke Waleson's avatar
Jouke Waleson committed
960

961
962
    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
        """Build a playlist result from matches, de-duplicating the generated
        URL entries while preserving their order."""
        def to_entry(match):
            # Apply the optional getter, fix protocol-relative URLs and wrap
            # the result as a url-type info dict.
            target = getter(match) if getter else match
            return self.url_result(self._proto_relative_url(target), ie)

        entries = orderedSet(to_entry(m) for m in matches)
        return self.playlist_result(
            entries, playlist_id=playlist_id, playlist_title=playlist_title)
967

968
    @staticmethod
969
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
970
971
972
973
974
975
976
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
977
978
        if playlist_description:
            video_info['description'] = playlist_description
979
980
        return video_info

981
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
982
983
984
985
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
986
        RegexNotFoundError, depending on fatal, specifying the field name.
987
988
989
990
991
992
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
Philipp Hagemeister's avatar
Philipp Hagemeister committed
993
994
                if mobj:
                    break
995

996
        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
997
            _name = '\033[0;34m%s\033[0m' % name
998
999
1000
        else:
            _name = name