Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Facebook info extractor #76

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
225 changes: 225 additions & 0 deletions youtube-dl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# Author: Paweł Paprota
# License: Public domain code
import cookielib
import codecs
import ctypes
import datetime
import email.utils
Expand Down Expand Up @@ -2339,6 +2340,228 @@ class DepositFilesIE(InfoExtractor):
except UnavailableVideoError, err:
self._downloader.trouble(u'ERROR: unable to download file')

class FacebookIE(InfoExtractor):
"""Information Extractor for Facebook"""

_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
_NETRC_MACHINE = 'facebook'
_available_formats = ['highqual', 'lowqual']
_video_extensions = {
'highqual': 'mp4',
'lowqual': 'mp4',
}

def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)

@staticmethod
def suitable(url):
return (re.match(FacebookIE._VALID_URL, url) is not None)

def _reporter(self, message):
"""Add header and report message."""
self._downloader.to_screen(u'[Facebook] %s' % message)

def report_login(self):
"""Report attempt to log in."""
self._reporter(u'Logging in')

def report_video_webpage_download(self, video_id):
"""Report attempt to download video webpage."""
self._reporter(u'%s: Downloading video webpage' % video_id)

def report_information_extraction(self, video_id):
"""Report attempt to extract video information."""
self._reporter(u'%s: Extracting video information' % video_id)

def _parse_page(self, video_webpage):
"""Extract video information from page"""
# General data
data = {'title': r'class="video_title datawrap">(.*?)</',
'description': r'<div class="datawrap">(.*?)</div>',
'owner': r'\("video_owner_name", "(.*?)"\)',
'upload_date': r'data-date="(.*?)"',
'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
}
video_info = {}
for piece in data.keys():
mobj = re.search(data[piece], video_webpage)
if mobj is not None:
video_info[piece] = urllib.unquote_plus(codecs.unicode_escape_decode(mobj.group(1))[0])

# Video urls
video_urls = {}
for format in self._available_formats:
mobj = re.search(r'\("%s_src\", "(.+?)"\)' % format, video_webpage)
if mobj is not None:
# URL is within a Javascript segment in an escaped Unicode format within the generally utf-8 page
# E.g: "https\u00253A\u00252F\u00252Ffbcdn-video-a.akamaihd.net..."
video_urls[format] = urllib.unquote_plus(codecs.unicode_escape_decode(mobj.group(1))[0])
video_info['video_urls'] = video_urls

return video_info

def _real_initialize(self):
if self._downloader is None:
return

useremail = None
password = None
downloader_params = self._downloader.params

# Attempt to use provided username and password or .netrc data
if downloader_params.get('username', None) is not None:
useremail = downloader_params['username']
password = downloader_params['password']
elif downloader_params.get('usenetrc', False):
try:
info = netrc.netrc().authenticators(self._NETRC_MACHINE)
if info is not None:
useremail = info[0]
password = info[2]
else:
raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
except (IOError, netrc.NetrcParseError), err:
self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
return

if useremail is None:
return

# Log in
login_form = { 'email': useremail,
'pass': password,
'login': 'Log+In'
}
request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
try:
self.report_login()
login_results = urllib2.urlopen(request).read()
if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
return
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
return

def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
return
video_id = mobj.group('ID')

# Get video webpage
self.report_video_webpage_download(video_id)
request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
try:
page = urllib2.urlopen(request)
video_webpage = page.read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
return

# Start extracting information
self.report_information_extraction(video_id)

# Extract information
video_info = self._parse_page(video_webpage)

# uploader
if 'owner' not in video_info:
self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
return
video_uploader = video_info['owner']

# title
if 'title' not in video_info:
self._downloader.trouble(u'ERROR: unable to extract video title')
return
video_title = video_info['title']
video_title = video_title.decode('utf-8')
video_title = sanitize_title(video_title)

# simplified title
simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
simple_title = simple_title.strip(ur'_')

# thumbnail image
if 'thumbnail' not in video_info:
self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
video_thumbnail = ''
else:
video_thumbnail = video_info['thumbnail']

# upload date
upload_date = u'NA'
if 'upload_date' in video_info:
upload_time = video_info['upload_date']
timetuple = email.utils.parsedate_tz(upload_time)
if timetuple is not None:
try:
upload_date = time.strftime('%Y%m%d', timetuple[0:9])
except:
pass

# description
video_description = 'No description available.'
if (self._downloader.params.get('forcedescription', False) and
'description' in video_info):
video_description = video_info['description']

url_map = video_info['video_urls']
if len(url_map.keys()) > 0:
# Decide which formats to download
req_format = self._downloader.params.get('format', None)
format_limit = self._downloader.params.get('format_limit', None)

if format_limit is not None and format_limit in self._available_formats:
format_list = self._available_formats[self._available_formats.index(format_limit):]
else:
format_list = self._available_formats
existing_formats = [x for x in format_list if x in url_map]
if len(existing_formats) == 0:
self._downloader.trouble(u'ERROR: no known formats available for video')
return
if req_format is None:
video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
elif req_format == '-1':
video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
else:
# Specific format
if req_format not in url_map:
self._downloader.trouble(u'ERROR: requested format not available')
return
video_url_list = [(req_format, url_map[req_format])] # Specific format

for format_param, video_real_url in video_url_list:

# At this point we have a new video
self._downloader.increment_downloads()

# Extension
video_extension = self._video_extensions.get(format_param, 'mp4')

# Find the video URL in fmt_url_map or conn paramters
try:
# Process video information
self._downloader.process_info({
'id': video_id.decode('utf-8'),
'url': video_real_url.decode('utf-8'),
'uploader': video_uploader.decode('utf-8'),
'upload_date': upload_date,
'title': video_title,
'stitle': simple_title,
'ext': video_extension.decode('utf-8'),
'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
'thumbnail': video_thumbnail.decode('utf-8'),
'description': video_description.decode('utf-8'),
'player_url': None,
})
except UnavailableVideoError, err:
self._downloader.trouble(u'\nERROR: unable to download video')

class PostProcessor(object):
"""Post Processor class.

Expand Down Expand Up @@ -2594,6 +2817,7 @@ if __name__ == '__main__':
yahoo_ie = YahooIE()
yahoo_search_ie = YahooSearchIE(yahoo_ie)
deposit_files_ie = DepositFilesIE()
facebook_ie = FacebookIE()
generic_ie = GenericIE()

# File downloader
Expand Down Expand Up @@ -2645,6 +2869,7 @@ if __name__ == '__main__':
fd.add_info_extractor(yahoo_ie)
fd.add_info_extractor(yahoo_search_ie)
fd.add_info_extractor(deposit_files_ie)
fd.add_info_extractor(facebook_ie)

# This must come last since it's the
# fallback if none of the others work
Expand Down