[twitter] add option to extract TwitPic embeds (#579)

mikf · Jan 18, 2020 · 25d5ec4
1 parent 254f7c3
commit 25d5ec4
Show file tree

Hide file tree

Showing 3 changed files with 38 additions and 1 deletion.
diff --git a/docs/configuration.rst b/docs/configuration.rst
@@ -1067,6 +1067,15 @@ Description Extract images from retweets.
 =========== =====
 
 
+extractor.twitter.twitpic
+-------------------------
+=========== =====
+Type        ``bool``
+Default     ``false``
+Description Extract `TwitPic <https://twitpic.com/>`__ embeds.
+=========== =====
+
+
 extractor.twitter.videos
 ------------------------
 =========== =====

diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
@@ -144,6 +144,7 @@
         {
             "content": false,
             "retweets": true,
+            "twitpic": false,
             "videos": false
         },
         "vsco":

diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
@@ -30,6 +30,7 @@ def __init__(self, match):
         self._user_dict = None
         self.logged_in = False
         self.retweets = self.config("retweets", True)
+        self.twitpic = self.config("twitpic", False)
         self.content = self.config("content", False)
         self.videos = self.config("videos", False)
 
@@ -79,6 +80,26 @@ def items(self):
                     urls = [url + size for size in self.sizes]
                     yield Message.Urllist, urls, data
 
+            if self.twitpic and "//twitpic.com/" in tweet:
+                urls = [
+                    url for url in text.extract_iter(
+                        tweet, 'data-expanded-url="', '"')
+                    if "//twitpic.com/" in url
+                ]
+
+                if "num" not in data:
+                    if urls:
+                        yield Message.Directory, data
+                    data["num"] = 0
+
+                for data["num"], url in enumerate(urls, data["num"]+1):
+                    response = self.request(url, fatal=False)
+                    if response.status_code >= 400:
+                        continue
+                    url = text.extract(
+                        response.text, 'name="twitter:image" value="', '"')[0]
+                    yield Message.Url, url, text.nameext_from_url(url, data)
+
     def metadata(self):
         """Return general metadata"""
         return {}
@@ -230,7 +251,7 @@ def _tweets_from_api(self, url, max_position=None):
             for tweet in text.extract_iter(
                     data["items_html"], '<div class="tweet ', '\n</li>'):
                 yield tweet
-                
+
             if data.get("min_position") is None:
                 if data["has_more_items"] and "min_position" not in data:
                     pass
@@ -348,6 +369,12 @@ class TwitterTweetExtractor(TwitterExtractor):
             "count": 4,
             "keyword": "0c627af2b8cdccc7e0da8fd221155c4a4a3141a8",
         }),
+        # TwitPic embeds (#579)
+        ("https://twitter.com/i/web/status/112900228289540096", {
+            "options": (("twitpic", True),),
+            "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg",
+            "count": 3,
+        }),
     )
 
     def __init__(self, match):