diff --git a/socid_extractor/schemes.py b/socid_extractor/schemes.py index c566e08..bca213c 100644 --- a/socid_extractor/schemes.py +++ b/socid_extractor/schemes.py @@ -549,10 +549,20 @@ }, 'Gitlab API': { 'flags': ['"web_url":"https://gitlab.com/'], - 'regex': r'^({[\S\s]+?})$', + 'regex': r'^([{[\S\s]+?}])$', 'extract_json': True, + 'url_mutations': [ + { + 'from': r'https://gitlab.com/(?P.+)/?', + 'to': 'https://gitlab.com/api/v4/users?username={username}', + } + ], 'fields': { 'uid': lambda x: x[0].get('id'), + 'fullname': lambda x: x[0].get('name'), + 'username': lambda x: x[0].get('username'), + 'state': lambda x: x[0].get('state'), + 'image': lambda x: x[0].get('avatar_url'), } }, 'My Mail.ru': { @@ -1551,4 +1561,159 @@ 'is_verified': lambda x: x['isVerified'], } }, + 'Wattpad API': { + 'flags': ['{"username":"'], + 'regex': r'^({"username":"(.+)})$', + 'extract_json': True, + 'url_mutations': [ + { + 'from': r'https?://(www.|a.)?wattpad.com/user/(?P[^/]+).*', + 'to': 'https://www.wattpad.com/api/v3/users/{username}', + } + ], + 'fields': { + 'username': lambda x: x.get('username'), + 'image': lambda x: x.get('avatar'), + 'image_bg': lambda x: x.get('backgroundUrl'), + 'fullname': lambda x: x.get('name'), + 'description': lambda x: x.get('description'), + 'status': lambda x: x.get('status'), + 'gender': lambda x: x.get('gender'), + 'locale': lambda x: x.get('locale'), + 'created_at': lambda x: x.get('createDate'), + 'updated_at': lambda x: x.get('modifyDate'), + 'location': lambda x: x.get('location'), + 'isPrivate': lambda x: x.get('isPrivate'), + 'verified': lambda x: x.get('verified'), + 'verified_email': lambda x: x.get('verified_email'), + 'ambassador': lambda x: x.get('ambassador'), + 'isMuted': lambda x: x.get('isMuted'), + 'allowCrawler': lambda x: x.get('allowCrawler'), + 'follower_count': lambda x: x.get('numFollowers'), + 'following_count': lambda x: x.get('numFollowing'), + 'facebook': lambda x: 'https://www.facebook.com/' + x.get('facebook') if 
x.get('facebook') else None, + 'twitter': lambda x: 'https://twitter.com/' + x.get('twitter') if x.get('twitter') else None, + 'website': lambda x: x.get('website'), + 'lulu': lambda x: x.get('lulu'), + 'smashwords': lambda x: x.get('smashwords'), + 'bubok': lambda x: x.get('bubok'), + } + }, + 'Kik': { + 'flags': ['{"firstName":"'], + 'regex': r'^({[\S\s]+?})$', + 'extract_json': True, + 'url_mutations': [ + { + 'from': r'https?://kik.me/(?P[^/]+).*', + 'to': 'https://ws2.kik.com/user/{username}', + } + ], + 'fields': { + 'fullname': lambda x: x.get('firstName') + ' ' + x.get('lastName'), + 'image': lambda x: x.get('displayPic'), + 'update_pic_at': lambda x: timestamp_to_datetime(x.get('displayPicLastModified')), + } + }, + 'Docker Hub API': { + 'flags': ['{"id": "'], + 'regex': r'^({[\S\s]+?})$', + 'extract_json': True, + 'url_mutations': [ + { + 'from': r'https?://hub.docker.com/u/(?P[^/]+).*', + 'to': 'https://hub.docker.com/v2/users/{username}/', + } + ], + 'fields': { + 'uid': lambda x: x.get('id'), + 'username': lambda x: x.get('username'), + 'full_name': lambda x: x.get('full_name'), + 'location': lambda x: x.get('location'), + 'company': lambda x: x.get('company'), + 'created_at': lambda x: x.get('data_joined'), + 'type': lambda x: x.get('type'), + 'image': lambda x: x.get('gravatar_url'), + } + }, + 'Mixcloud API': { + 'flags': ['"key": "'], + 'regex': r'^({[\S\s]+?})$', + 'extract_json': True, + 'url_mutations': [ + { + 'from': r'https?://(www.)?mixcloud.com/(?P[^/]+).*', + 'to': 'https://api.mixcloud.com/{username}/', + } + ], + 'fields': { + 'fullname': lambda x: x.get('fullname'), + 'username': lambda x: x.get('username'), + 'country': lambda x: x.get('country'), + 'city': lambda x: x.get('city'), + 'created_at': lambda x: x.get('created_time'), + 'updated_at': lambda x: x.get('updated_time'), + 'description': lambda x: x.get('blog'), + 'image': lambda x: x['pictures'].get('640wx640h'), + 'follower_count': lambda x: x.get('follower_count'), + 
'following_count': lambda x: x.get('following_count'), + 'cloudcast_count': lambda x: x.get('cloudcast_count'), + 'favorite_count': lambda x: x.get('favorite_count'), + 'listen_count': lambda x: x.get('listen_count'), + 'is_pro': lambda x: x.get('is_pro'), + 'is_premium': lambda x: x.get('is_premium'), + } + }, + 'binarysearch API': { + 'flags': [',"preferredSubmissionPrivacy":'], + 'regex': r'^({[\S\s]+?})$', + 'extract_json': True, + 'url_mutations': [ + { + 'from': r'https?://binarysearch.com/@/(?P[^/]+).*', + 'to': 'https://binarysearch.com/api/users/{username}/profile', + } + ], + 'fields': { + 'uid': lambda x: x['user'].get('id'), + 'username': lambda x: x['user'].get('username'), + 'image': lambda x: x['user'].get('profilePic'), + 'location': lambda x: x['user'].get('location'), + 'created_at': lambda x: timestamp_to_datetime(x['user'].get('createTime')), + 'updated_at': lambda x: timestamp_to_datetime(x['user'].get('updateTime')), + 'bio': lambda x: x['user'].get('bio'), + 'work': lambda x: x['user'].get('work'), + 'college': lambda x: x['user'].get('college'), + 'Role': lambda x: x['user'].get('preferredRole'), + 'github_url': lambda x: x['user'].get('githubHandle'), + 'twitter_url': lambda x: x['user'].get('twitterHandle'), + 'linkedin_url': lambda x: x['user'].get('linkedinHandle'), + 'links': lambda x: x['user'].get('personalWebsite'), + 'isAdmin': lambda x: x['user'].get('isAdmin'), + 'isVerified': lambda x: x['user'].get('isVerified'), + 'HistoryPublic': lambda x: x['user'].get('preferredHistoryPublic'), + 'RoomPublic': lambda x: x['user'].get('preferredRoomPublic'), + 'InviteOnly': lambda x: x['user'].get('preferredInviteOnly'), + } + }, + 'pr0gramm API': { + 'flags': [',"likesArePublic":'], + 'regex': r'^({[\S\s]+?})$', + 'extract_json': True, + 'url_mutations': [ + { + 'from': r'https?://pr0gramm.com/user/(?P[^/]+).*', + 'to': 'https://pr0gramm.com/api/profile/info?name={username}', + } + ], + 'fields': { + 'uid': lambda x: x['user'].get('id'), + 
def timestamp_to_datetime(t):
    """Format a numeric timestamp as a UTC date-time string.

    The interpretation of ``t`` is chosen from the number of characters in
    ``str(t)``:

    * fewer than 10 -- an offset in seconds that is subtracted from the
      current time;
    * exactly 13 -- epoch milliseconds; the last three characters are
      rendered as the fractional part of the seconds;
    * anything else -- epoch seconds.

    :param t: number or digit string; any falsy value (``None``, ``0``,
        ``''``) is treated as "no timestamp".
    :return: ``''`` for falsy input, otherwise a string shaped like
        ``'2020-09-10 23:43:23 UTC'`` (``'2020-09-10 23:43:23.401 UTC'``
        for millisecond input).
    """
    # Imported locally: the module header visible in this change only does
    # ``from datetime import datetime``, so the bare name ``timezone`` would
    # raise NameError on every non-empty call.
    from datetime import timezone

    if not t:
        return ''

    text = str(t)

    if len(text) == 13:
        # Millisecond precision: convert to seconds for the date part and
        # splice the original millisecond digits into the format string.
        moment = datetime.fromtimestamp(float(t) / 1000.0, tz=timezone.utc)
        return moment.strftime('%Y-%m-%d %H:%M:%S.{} %Z'.format(text[-3:]))

    if len(text) < 10:
        # Short values are an offset back from "now". float() lets digit
        # strings work here too, as they already do in the other branches.
        t = math.floor(datetime.today().timestamp()) - float(t)

    return datetime.fromtimestamp(int(t), tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S %Z')
"https://img.wattpad.com/useravatar/JeniferBalanzar.128.615375.jpg" + assert info.get("image_bg") == "https://img.wattpad.com/userbg/JeniferBalanzar.36464.jpg" + assert info.get("gender") == "Female" + assert info.get("locale") == "es_MX" + assert int(info.get("follower_count")) >= 266 + assert int(info.get("following_count")) >= 89 + assert info.get("created_at") == "2019-12-10T00:25:02Z" + assert info.get("updated_at") == "2020-09-08T08:24:38Z" + assert info.get("verified") == "False" + assert info.get("verified_email") == "False" + + +def test_kik(): + info = extract(parse('http://kik.me/mksyx')[0]) + + assert info.get("fullname") == "experience true satisfaction" + assert info.get("image") == "http://profilepics.cf.kik.com/QUwticPE8XU7qm7qrTXbWgCfSu4/orig.jpg" + + +def test_dockerub(): + info = extract(parse('https://hub.docker.com/u/adastra2ankudinov')[0]) + + assert info.get("uid") == "b4f92258ad95428ea88ba498a883b40a" + assert info.get("username") == "adastra2ankudinov" + assert info.get("type") == "User" + assert info.get("image") == "https://secure.gravatar.com/avatar/410bf05a8e85652a6b174d627dce4e3d.jpg?s=80&r=g&d=mm" + + +def test_mixcloud(): + info = extract(parse('https://www.mixcloud.com/savath69/')[0]) + + assert info.get("username") == "savath69" + assert info.get("country") == "France" + assert info.get("city") == "Lyon" + assert info.get("created_at") == "2017-08-06T11:41:02Z" + assert info.get("updated_at") == "2017-08-06T11:41:02Z" + assert info.get("image") == "https://thumbnailer.mixcloud.com/unsafe/640x640/profile/d/1/c/a/0f1c-60ec-4f2c-9b04-5a9536c96d51" + assert int(info.get("follower_count")) >= 25 + assert int(info.get("following_count")) >= 6 + assert int(info.get("cloudcast_count")) >= 5 + assert int(info.get("favorite_count")) >= 0 + assert int(info.get("listen_count")) >= 15 + assert info.get("is_pro") == "False" + assert info.get("is_premium") == "False" + + +def test_binarysearch(): + info = 
def test_binarysearch():
    """Check the fields extracted for the binarysearch profile ``LarryNY``."""
    # The original requested an iFunny profile URL here while asserting
    # binarysearch data. The URL below follows the profile form that the
    # 'binarysearch API' scheme rewrites (https://binarysearch.com/@/<name>).
    # `parse` and `extract` are defined outside the visible source.
    info = extract(parse('https://binarysearch.com/@/LarryNY')[0])

    assert int(info.get("uid")) >= 10435
    assert info.get("username") == "LarryNY"
    assert info.get("image") == "https://binarysearch.s3-us-west-2.amazonaws.com/LarryNY?hash=1599781403401"
    assert info.get("location") == "New York, NY, USA"
    assert info.get("bio") == "This is fun."
    assert info.get("links") == "https://www.youtube.com/c/Algorithmist/"
    assert info.get("isAdmin") == "False"
    assert info.get("isVerified") == "True"
    assert info.get("HistoryPublic") == "False"
    assert info.get("RoomPublic") == "True"
    assert info.get("InviteOnly") == "False"


def test_pr0gramm():
    """Check the fields extracted for the pr0gramm profile ``TheBorderCrash``."""
    info = extract(parse('https://pr0gramm.com/user/TheBorderCrash')[0])

    # Counters can only be bounded from below.
    assert int(info.get("uid")) >= 323469
    assert info.get("username") == "TheBorderCrash"
    assert int(info.get("uploadCount")) >= 5859
    assert int(info.get("commentCount")) >= 2497
    assert int(info.get("tagCount")) >= 22900
    assert info.get("likesArePublic") == "False"