Skip to content

Commit

Permalink
use httpx or reuqests if available
Browse files Browse the repository at this point in the history
  • Loading branch information
URenko committed Mar 29, 2023
1 parent 1041c00 commit 8510232
Showing 1 changed file with 111 additions and 67 deletions.
178 changes: 111 additions & 67 deletions src/you_get/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,22 @@
from importlib import import_module
from urllib import request, parse, error

try:
import httpx
session = httpx.Client(transport=httpx.HTTPTransport(retries=3), follow_redirects=True, http2=True,
headers={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0'}) # some website can accept 'Python-urllib' or 'python-requests' but not 'httpx'
__urllib__ = False
except ImportError:
try:
import requests
from requests.adapters import HTTPAdapter
session = requests.Session()
session.mount('http://', HTTPAdapter(max_retries=3))
session.mount('https://', HTTPAdapter(max_retries=3))
__urllib__ = False
except ImportError:
__urllib__ = True

from .version import __version__
from .util import log, term
from .util.git import get_version
Expand Down Expand Up @@ -346,6 +362,12 @@ def undeflate(data):
# an http.client implementation of get_content()
# because urllib does not support "Connection: keep-alive"
def getHttps(host, url, headers, debuglevel=0):
if not __urllib__:
if not (url.startswith('http://') or url.startswith('https://')):
url = 'https://' + host + url
r = session.get(url, headers=headers)
return r.text, r.headers.get('set-cookie')

import http.client

conn = http.client.HTTPSConnection(host)
Expand Down Expand Up @@ -377,6 +399,9 @@ def get_decoded_html(url, faker=False):
def get_location(url, headers=None, get_method='HEAD'):
logging.debug('get_location: %s' % url)

if not __urllib__:
return str(session.request(get_method, url, headers=headers).url)

if headers:
req = request.Request(url, headers=headers)
else:
Expand Down Expand Up @@ -423,6 +448,11 @@ def get_content(url, headers={}, decoded=True):

logging.debug('get_content: %s' % url)

if not __urllib__:
if cookies: session.cookies = cookies # https://www.python-httpx.org/compatibility/#cookies
r = session.get(url, headers=headers)
return r.text if decoded else r.content

req = request.Request(url, headers=headers)
if cookies:
# NOTE: Do not use cookies.add_cookie_header(req)
Expand Down Expand Up @@ -476,6 +506,16 @@ def post_content(url, headers={}, post_data={}, decoded=True, **kwargs):
else:
logging.debug('post_content: %s\npost_data: %s' % (url, post_data))

if not __urllib__:
if cookies: session.cookies = cookies # https://www.python-httpx.org/compatibility/#cookies
r = session.post(url, headers=headers, data=kwargs.get('post_data_raw') or post_data) # https://www.python-httpx.org/compatibility/#request-content
return r.text if decoded else r.content

if kwargs.get('post_data_raw'):
post_data_enc = bytes(kwargs['post_data_raw'], 'utf-8')
else:
post_data_enc = bytes(parse.urlencode(post_data), 'utf-8')

req = request.Request(url, headers=headers)
if cookies:
# NOTE: Do not use cookies.add_cookie_header(req)
Expand All @@ -489,10 +529,6 @@ def post_content(url, headers={}, post_data={}, decoded=True, **kwargs):
cookie_strings.append(cookie.name + '=' + cookie.value)
cookie_headers = {'Cookie': '; '.join(cookie_strings)}
req.headers.update(cookie_headers)
if kwargs.get('post_data_raw'):
post_data_enc = bytes(kwargs['post_data_raw'], 'utf-8')
else:
post_data_enc = bytes(parse.urlencode(post_data), 'utf-8')
response = urlopen_with_retry(req, data=post_data_enc)
data = response.read()

Expand All @@ -517,14 +553,10 @@ def post_content(url, headers={}, post_data={}, decoded=True, **kwargs):


def url_size(url, faker=False, headers={}):
if faker:
response = urlopen_with_retry(
request.Request(url, headers=fake_headers)
)
elif headers:
response = urlopen_with_retry(request.Request(url, headers=headers))
if __urllib__:
response = urlopen_with_retry(request.Request(url, headers=fake_headers if faker else headers))
else:
response = urlopen_with_retry(url)
response = session.head(url, headers=fake_headers if faker else headers)

size = response.headers['content-length']
return int(size) if size is not None else float('inf')
Expand All @@ -534,13 +566,13 @@ def urls_size(urls, faker=False, headers={}):
return sum([url_size(url, faker=faker, headers=headers) for url in urls])


def get_head(url, headers=None, get_method='HEAD'):
def get_head(url, headers={}, get_method='HEAD'):
logging.debug('get_head: %s' % url)

if headers:
req = request.Request(url, headers=headers)
else:
req = request.Request(url)
if not __urllib__:
return session.request(get_method, url, headers=headers).headers

req = request.Request(url, headers=headers)
req.get_method = lambda: get_method
res = urlopen_with_retry(req)
return res.headers
Expand Down Expand Up @@ -607,6 +639,16 @@ def url_info(url, faker=False, headers={}):

return type, ext, size

def iter_content(response):
while True:
try:
buffer = response.read(1024 * 256)
except socket.timeout:
break
if buffer:
yield buffer
else:
break

def url_save(
url, filepath, bar, refer=None, is_part=False, faker=False,
Expand Down Expand Up @@ -703,66 +745,68 @@ def numreturn(a):
else:
headers = {}
'''
if received:
# chunk_start will always be 0 if not chunked
tmp_headers['Range'] = 'bytes=' + str(received - chunk_start) + '-'
if refer:
tmp_headers['Referer'] = refer

if timeout:
response = urlopen_with_retry(
request.Request(url, headers=tmp_headers), timeout=timeout
)
else:
response = urlopen_with_retry(
request.Request(url, headers=tmp_headers)
)
try:
range_start = int(
response.headers[
'content-range'
][6:].split('/')[0].split('-')[0]
)
end_length = int(
response.headers['content-range'][6:].split('/')[1]
)
range_length = end_length - range_start
except:
content_length = response.headers['content-length']
range_length = int(content_length) if content_length is not None \
else float('inf')

if is_chunked: # always append if chunked
open_mode = 'ab'
elif file_size != received + range_length: # is it ever necessary?
received = 0
if bar:
bar.received = 0
open_mode = 'wb'

with open(temp_filepath, open_mode) as output:
while True:
buffer = None
while True:
if received:
# chunk_start will always be 0 if not chunked
tmp_headers['Range'] = 'bytes=' + str(received - chunk_start) + '-'
if __urllib__:
if timeout:
_response = urlopen_with_retry(
request.Request(url, headers=tmp_headers), timeout=timeout
)
else:
_response = urlopen_with_retry(
request.Request(url, headers=tmp_headers)
)
elif callable(session.stream): # HTTPX
_response = session.stream('GET', url, headers=tmp_headers, timeout=timeout)
else: # requests
_response = session.get(url, headers=tmp_headers, timeout=timeout, stream=True)
with _response as response:
try:
buffer = response.read(1024 * 256)
except socket.timeout:
pass
if not buffer:
range_start = int(
response.headers[
'content-range'
][6:].split('/')[0].split('-')[0]
)
end_length = int(
response.headers['content-range'][6:].split('/')[1]
)
range_length = end_length - range_start
except:
content_length = response.headers['content-length']
range_length = int(content_length) if content_length is not None \
else float('inf')

if is_chunked: # always append if chunked
open_mode = 'ab'
elif file_size != received + range_length: # is it ever necessary?
received = 0
if bar:
bar.received = 0
open_mode = 'wb'

with open(temp_filepath, open_mode) as output:
if __urllib__:
iter = iter_content(response)
elif hasattr(response, 'iter_content'): # HTTPX
iter = response.iter_content(1024 * 256)
else: # requests
iter = response.iter_bytes(1024 * 256)
for buffer in iter:
output.write(buffer)
received += len(buffer)
received_chunk += len(buffer)
if bar:
bar.update_received(len(buffer))
if is_chunked and received_chunk == range_length:
break
elif not is_chunked and received == file_size: # Download finished
break
# Unexpected termination. Retry request
tmp_headers['Range'] = 'bytes=' + str(received - chunk_start) + '-'
response = urlopen_with_retry(
request.Request(url, headers=tmp_headers)
)
continue
output.write(buffer)
received += len(buffer)
received_chunk += len(buffer)
if bar:
bar.update_received(len(buffer))

assert received == os.path.getsize(temp_filepath), '%s == %s == %s' % (
received, os.path.getsize(temp_filepath), temp_filepath
Expand Down

0 comments on commit 8510232

Please sign in to comment.