Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add "split" xpath in post-processing, newlines in replace support #579

Merged
merged 9 commits into from
Jun 18, 2020
68 changes: 64 additions & 4 deletions pkg/scraper/xpath.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ func (c xpathRegexConfig) apply(value string) string {
}

ret := re.ReplaceAllString(value, with)
if with == "\n" {
ret = replaceLines(ret)
}

logger.Debugf(`Replace: '%s' with '%s'`, regex, with)
logger.Debugf("Before: %s", value)
Expand All @@ -94,6 +97,8 @@ func (c xpathRegexConfigs) apply(value string) string {
// remove whitespace again
value = commonPostProcess(value)

// restore replaced lines
value = restoreLines(value)
return value
}

Expand Down Expand Up @@ -129,6 +134,15 @@ func (c xpathScraperAttrConfig) getParseDate() string {
return c.getString(parseDateKey)
}

// getSplit looks up the "split" entry of this attribute configuration via
// getString and returns it. splitString uses the result as its separator.
func (c xpathScraperAttrConfig) getSplit() string {
	separator := c.getString("split")
	return separator
}

// hasSplit reports whether getSplit yields a non-empty separator.
func (c xpathScraperAttrConfig) hasSplit() bool {
	return len(c.getSplit()) > 0
}

func (c xpathScraperAttrConfig) getReplace() xpathRegexConfigs {
const replaceKey = "replace"
val, _ := c[replaceKey]
Expand Down Expand Up @@ -198,6 +212,23 @@ func (c xpathScraperAttrConfig) parseDate(value string) string {
return parsedValue.Format(internalDateFormat)
}

// splitString breaks value into pieces using the separator returned by
// getSplit. When no separator is configured, value is returned untouched as
// the only element. Otherwise empty pieces are discarded, so the result is nil
// when value contains nothing but separators (or is empty).
func (c xpathScraperAttrConfig) splitString(value string) []string {
	sep := c.getSplit()
	if sep == "" {
		return []string{value}
	}

	var parts []string
	for _, piece := range strings.Split(value, sep) {
		if piece == "" {
			// skip the empty strings produced by adjacent, leading or
			// trailing separators
			continue
		}
		parts = append(parts, piece)
	}

	return parts
}

func (c xpathScraperAttrConfig) replaceRegex(value string) string {
replace := c.getReplace()
return replace.apply(value)
Expand Down Expand Up @@ -258,6 +289,24 @@ func commonPostProcess(value string) string {
return value
}

// replaceLines strips every carriage return ("\r") from value and then turns
// each newline ("\n") into a carriage return. The carriage returns act as
// placeholders so that the line breaks don't get removed by
// commonPostProcess; restoreLines converts them back to newlines.
func replaceLines(value string) string {
	// Carriage returns shouldn't exist in the string; remove them so that the
	// only ones left afterwards are the placeholders inserted below.
	value = strings.Replace(value, "\r", "", -1)

	// Literal replacement: no regular expression needs to be compiled on
	// every call for this.
	return strings.Replace(value, "\n", "\r", -1)
}

// restoreLines replaces all carriage returns ("\r") in value with newlines
// ("\n"), undoing the placeholder substitution performed by replaceLines.
func restoreLines(value string) string {
	// Literal replacement: no regular expression needs to be compiled on
	// every call for this.
	return strings.Replace(value, "\r", "\n", -1)
}

func runXPathQuery(doc *html.Node, xpath string, common commonXPathConfig) []*html.Node {
// apply common
if common != nil {
Expand Down Expand Up @@ -299,15 +348,26 @@ func (s xpathScraperConfig) process(doc *html.Node, common commonXPathConfig) xP
if attrConfig.hasConcat() {
result := attrConfig.concatenateResults(found)
result = attrConfig.postProcess(result)
const i = 0
ret = ret.setKey(i, k, result)
if attrConfig.hasSplit() {
for j, txt := range attrConfig.splitString(result) {
ret = ret.setKey(j, k, txt)
}
} else {
const i = 0
ret = ret.setKey(i, k, result)
}
bnkai marked this conversation as resolved.
Show resolved Hide resolved
} else {
for i, elem := range found {
text := NodeText(elem)
text = commonPostProcess(text)
text = attrConfig.postProcess(text)

ret = ret.setKey(i, k, text)
if attrConfig.hasSplit() {
for j, txt := range attrConfig.splitString(text) {
ret = ret.setKey(i+j, k, txt)
}
} else {
ret = ret.setKey(i, k, text)
}
}
}
}
Expand Down
38 changes: 38 additions & 0 deletions pkg/scraper/xpath_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,14 @@ func makeSceneXPathConfig() xpathScraper {
studioConfig["URL"] = `$studioElem/@href`
config["Studio"] = studioConfig

const sep = " "
moviesNameConfig := make(map[interface{}]interface{})
moviesNameConfig["selector"] = `//i[@class="isMe tooltipTrig"]/@data-title`
moviesNameConfig["split"] = sep
moviesConfig := make(map[interface{}]interface{})
moviesConfig["Name"] = moviesNameConfig
config["Movies"] = moviesConfig

scraper := xpathScraper{
Scene: config,
Common: common,
Expand Down Expand Up @@ -692,6 +700,27 @@ func verifyTags(t *testing.T, expectedTagNames []string, actualTags []*models.Sc
}
}

// verifyMovies compares the expected movie names against the names of the
// scraped movies position by position. When one list is shorter than the
// other, the missing side is treated as an empty name, so a length mismatch
// is reported as an error for every surplus entry.
func verifyMovies(t *testing.T, expectedMovieNames []string, actualMovies []*models.ScrapedSceneMovie) {
	t.Helper()

	// walk as far as the longer of the two lists
	total := len(expectedMovieNames)
	if len(actualMovies) > total {
		total = len(actualMovies)
	}

	for idx := 0; idx < total; idx++ {
		var want, got string
		if idx < len(expectedMovieNames) {
			want = expectedMovieNames[idx]
		}
		if idx < len(actualMovies) {
			got = actualMovies[idx].Name
		}

		if want != got {
			t.Errorf("Expected movie %s, got %s", want, got)
		}
	}
}

func verifyPerformers(t *testing.T, expectedNames []string, expectedURLs []string, actualPerformers []*models.ScrapedScenePerformer) {
t.Helper()

Expand Down Expand Up @@ -761,6 +790,15 @@ func TestApplySceneXPathConfig(t *testing.T) {
}
verifyTags(t, expectedTags, scene.Tags)

// verify movies
expectedMovies := []string{
"Video",
"of",
"verified",
"member",
}
verifyMovies(t, expectedMovies, scene.Movies)

expectedPerformerNames := []string{
"Alex D",
"Mia Malkova",
Expand Down
4 changes: 4 additions & 0 deletions ui/v2.5/src/index.scss
Original file line number Diff line number Diff line change
Expand Up @@ -483,3 +483,7 @@ div.dropdown-menu {
text-transform: uppercase;
}
}

// Collapses runs of spaces and tabs but keeps newline characters as line
// breaks, so text containing "\n" is rendered on separate lines.
.pre {
white-space: pre-line;
}