[hentainexus] add gallery extractor (#256)

mikf · May 12, 2019 · ba8eb1f · ba8eb1f
1 parent bd9cb3d
commit ba8eb1f
Show file tree

Hide file tree

Showing 3 changed files with 62 additions and 0 deletions.
diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst
@@ -36,6 +36,7 @@ Hentai Foundry       https://www.hentai-foundry.com/     |hentaifoundry-C|
 Hentai2Read          https://hentai2read.com/            Chapters, Manga
 HentaiFox            https://hentaifox.com/              Galleries, Search Results
 HentaiHere           https://hentaihere.com/             Chapters, Manga
+Hentainexus          https://hentainexus.com/            Galleries
 Hitomi.la            https://hitomi.la/                  Galleries
 Hypnohub             https://hypnohub.net/               Pools, Popular Images, Posts, Tag-Searches
 Idol Complex         https://idol.sankakucomplex.com/    Pools, Posts, Tag-Searches                         Optional

diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
@@ -34,6 +34,7 @@
     "hentaifoundry",
     "hentaifox",
     "hentaihere",
+    "hentainexus",
     "hitomi",
     "hypnohub",
     "idolcomplex",

diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py
@@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hentainexus.com/"""
+
+from .common import GalleryExtractor
+from .. import text, util
+import json
+
+
+class HentainexusGalleryExtractor(GalleryExtractor):
+    """Extractor for image galleries on hentainexus.com
+
+    Accepts both '/view/<id>' and '/read/<id>' URLs.  Metadata is scraped
+    in metadata() and the image list is taken from the '/read/' page in
+    images().
+    """
+    category = "hentainexus"
+    root = "https://hentainexus.com"
+    # Case-insensitive; scheme and 'www.' are optional.
+    # Group 1 captures the numeric gallery ID.
+    pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com"
+               r"/(?:view|read)/(\d+)")
+    # NOTE(review): "url" and "keyword" look like checksums of expected
+    # results consumed by the project's test suite -- confirm there.
+    test = (
+        ("https://hentainexus.com/view/5688", {
+            "url": "57238d6e76a199298c9866bbcfaa111c0fa164b0",
+            "keyword": "5b254937a180b5c2cef303324cd5f7f6fec98d55",
+        }),
+        # The '/read/' form of the URL has no expected results attached.
+        ("https://hentainexus.com/read/5688"),
+    )
+
+    def __init__(self, match):
+        """Store the gallery ID and set up the '/view/' page URL
+
+        'match' is a regex match object whose group 1 holds the gallery ID
+        (presumably a match of 'pattern' -- the caller is not shown here).
+        """
+        self.gallery_id = match.group(1)
+        # Always build the '/view/' URL, even if a '/read/' URL was given.
+        url = "{}/view/{}".format(self.root, self.gallery_id)
+        GalleryExtractor.__init__(self, match, url)
+
+    def metadata(self, page):
+        """Return a dict of gallery metadata scraped from 'page'
+
+        'page' is the HTML text to search; presumably the content of the
+        '/view/' URL handed to GalleryExtractor.__init__ (TODO confirm).
+        """
+        rmve = text.remove_html
+        extr = text.extract_from(page)
+        # NOTE(review): 'extr' appears to walk through 'page' sequentially,
+        # so the entries below would have to follow the order in which the
+        # markers occur in the HTML -- confirm against text.extract_from
+        # before reordering anything.
+        data = {
+            "gallery_id" : text.parse_int(self.gallery_id),
+            "tags"       : extr('"og:description" content="', '"').split(", "),
+            "thumbnail"  : extr('"og:image" content="', '"'),
+            "title"      : extr('<h1 class="title">', '</h1>'),
+            "artist"     : rmve(extr('viewcolumn">Artist</td>'     , '</td>')),
+            "book"       : rmve(extr('viewcolumn">Book</td>'       , '</td>')),
+            "language"   : rmve(extr('viewcolumn">Language</td>'   , '</td>')),
+            "magazine"   : rmve(extr('viewcolumn">Magazine</td>'   , '</td>')),
+            "parody"     : rmve(extr('viewcolumn">Parody</td>'     , '</td>')),
+            "publisher"  : rmve(extr('viewcolumn">Publisher</td>'  , '</td>')),
+            "description": rmve(extr('viewcolumn">Description</td>', '</td>')),
+        }
+        # Add a "lang" entry computed from "language".
+        data["lang"] = util.language_to_code(data["language"])
+        return data
+
+    def images(self, page):
+        """Return a list of (image URL, None) tuples for this gallery
+
+        'page' is not used; the '/read/' page is requested instead.
+        """
+        url = "{}/read/{}".format(self.root, self.gallery_id)
+        extr = text.extract_from(self.request(url).text)
+        # Presumably the text between "initReader(" and the next "]",
+        # without the end marker; "]" is appended again so that the result
+        # can be parsed by json.loads() below (confirm against
+        # text.extract_from).
+        imgs = extr("initReader(", "]") + "]"
+        # Presumably the next double-quoted string after the list; it is
+        # prepended to every entry of the parsed list.
+        base = extr('"', '"')
+
+        return [(base + img, None) for img in json.loads(imgs)]