Add sanitize_string function
toddhow committed Dec 30, 2023
1 parent d0c1ef8 commit c16f718
Showing 3 changed files with 24 additions and 3 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -1 +1,2 @@
-.vscode
+.vscode
+tests
21 changes: 20 additions & 1 deletion fanscrape.py
@@ -14,6 +14,7 @@
 import uuid
 from typing import Dict
 from datetime import datetime
+from bs4 import BeautifulSoup
 
 try:
     from stashapi import log
@@ -445,7 +446,7 @@ def process_row(row, username, network, scene_index=0, scene_count=0):
     res = {}
     res['date'] = date.strftime("%Y-%m-%d")
     res['title'] = format_title(row[1], username, res['date'], scene_index, scene_count)
-    res['details'] = row[1]
+    res['details'] = sanitize_string(row[1])
     res['code'] = str(row[0])
     if network == 'OnlyFans':
         res['urls'] = [f"https://onlyfans.com/{res['code']}/{username}"]
@@ -517,6 +518,24 @@ def sanitize_api_type(api_type):
     if api_type in bad_types:
         api_type = bad_types[api_type]
     return api_type
+
+def sanitize_string(string):
+    """
+    Parses and sanitizes strings to remove HTML tags
+    """
+    if string:
+        try:
+            import lxml as unused_lxml_
+
+            html_parser = "lxml"
+        except ImportError:
+            html_parser = "html.parser"
+
+        string = re.sub("<[^>]*>", "", string)
+        string = " ".join(string.split())
+        string = BeautifulSoup(string, html_parser).get_text()
+        return string
+    return string
 
 
 # MAIN #############################################################################################
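Note: the new sanitize_string helper strips HTML markup from the post body before process_row stores it in res['details']. Below is a minimal standalone sketch of the same pipeline (regex tag removal, whitespace collapse, then BeautifulSoup with an lxml fallback to decode entities); the sample input string is hypothetical and not taken from the repository.

    import re
    from bs4 import BeautifulSoup

    def sanitize_string(string):
        """Strip HTML tags and collapse whitespace from a post body."""
        if string:
            try:
                import lxml as unused_lxml_  # noqa: F401 -- only checks that lxml is installed
                html_parser = "lxml"
            except ImportError:
                html_parser = "html.parser"
            string = re.sub("<[^>]*>", "", string)                   # drop literal tags
            string = " ".join(string.split())                        # collapse whitespace/newlines
            string = BeautifulSoup(string, html_parser).get_text()   # decode entities such as &amp;
            return string
        return string

    # Hypothetical input, for illustration only:
    raw = "<p>New set!\n<br>Check the <b>link</b> &amp; enjoy</p>"
    print(sanitize_string(raw))  # New set! Check the link & enjoy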
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,2 +1,3 @@
 stashapp-tools
-sqlite3
+sqlite3
+bs4
