Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add "split" xpath in post-processing, newlines in replace support #579

Merged
merged 9 commits into from
Jun 18, 2020
70 changes: 66 additions & 4 deletions pkg/scraper/xpath.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ func (c xpathRegexConfig) apply(value string) string {
}

ret := re.ReplaceAllString(value, with)
// replace lines if needed to protect from commonPostprocess
if with == "\n" {
ret = replaceLines(ret)
}

logger.Debugf(`Replace: '%s' with '%s'`, regex, with)
logger.Debugf("Before: %s", value)
Expand All @@ -94,6 +98,9 @@ func (c xpathRegexConfigs) apply(value string) string {
// remove whitespace again
value = commonPostProcess(value)

// restore replaced lines

value = restoreLines(value)
return value
}

Expand Down Expand Up @@ -129,6 +136,15 @@ func (c xpathScraperAttrConfig) getParseDate() string {
return c.getString(parseDateKey)
}

// getSplit returns the string stored under the "split" key of this attribute
// configuration, as reported by getString.
func (c xpathScraperAttrConfig) getSplit() string {
	return c.getString("split")
}

// hasSplit reports whether a non-empty split separator is configured.
func (c xpathScraperAttrConfig) hasSplit() bool {
	return len(c.getSplit()) > 0
}

func (c xpathScraperAttrConfig) getReplace() xpathRegexConfigs {
const replaceKey = "replace"
val, _ := c[replaceKey]
Expand Down Expand Up @@ -198,6 +214,36 @@ func (c xpathScraperAttrConfig) parseDate(value string) string {
return parsedValue.Format(internalDateFormat)
}

// splitString breaks value apart on the configured split separator and
// returns the non-empty pieces in their original order.
//
// When no separator is configured, value is returned unchanged as the only
// element. When every piece is empty the result is a nil slice.
func (c xpathScraperAttrConfig) splitString(value string) []string {
	sep := c.getSplit()
	if sep == "" {
		return []string{value}
	}

	var parts []string
	pieces := strings.Split(value, sep)
	for idx := range pieces {
		// drop empty pieces produced by leading, trailing or repeated separators
		if pieces[idx] == "" {
			continue
		}
		parts = append(parts, pieces[idx])
	}

	return parts
}

// setKeyAndSplit stores value under key "k" in the results pointed to by
// "ret". Without a split separator the value is stored at index "i". With
// one, each piece returned by splitString is stored at consecutive indices
// starting at "i".
//
// NOTE(review): because pieces land on i, i+1, i+2, ..., calling this with
// consecutive values of "i" while a split is configured would write to
// overlapping indices - confirm callers expect that.
func (c xpathScraperAttrConfig) setKeyAndSplit(ret *xPathResults, value string, k string, i int) {
	if !c.hasSplit() {
		*ret = ret.setKey(i, k, value)
		return
	}

	for offset, part := range c.splitString(value) {
		*ret = ret.setKey(i+offset, k, part)
	}
}

func (c xpathScraperAttrConfig) replaceRegex(value string) string {
replace := c.getReplace()
return replace.apply(value)
Expand Down Expand Up @@ -258,6 +304,24 @@ func commonPostProcess(value string) string {
return value
}

// replaceLines swaps every newline ("\n") in value for the alert character
// ("\a") so that the newlines are not removed by commonPostProcess. Any alert
// characters already present in value are dropped first, so that only
// former newlines are turned back into newlines by restoreLines.
func replaceLines(value string) string {
	// Both patterns are fixed single characters, so plain string replacement
	// is used rather than compiling a regular expression on every call.
	value = strings.Replace(value, "\a", "", -1) // \a shouldn't exist in the string; remove it
	value = strings.Replace(value, "\n", "\a", -1)

	return value
}

// restoreLines turns every alert character ("\a") in value back into a
// newline ("\n"), undoing the substitution made by replaceLines.
func restoreLines(value string) string {
	// The pattern is a fixed single character, so plain string replacement is
	// used rather than compiling a regular expression on every call.
	return strings.Replace(value, "\a", "\n", -1)
}

func runXPathQuery(doc *html.Node, xpath string, common commonXPathConfig) []*html.Node {
// apply common
if common != nil {
Expand Down Expand Up @@ -299,15 +363,13 @@ func (s xpathScraperConfig) process(doc *html.Node, common commonXPathConfig) xP
if attrConfig.hasConcat() {
result := attrConfig.concatenateResults(found)
result = attrConfig.postProcess(result)
const i = 0
ret = ret.setKey(i, k, result)
attrConfig.setKeyAndSplit(&ret, result, k, 0)
} else {
for i, elem := range found {
text := NodeText(elem)
text = commonPostProcess(text)
text = attrConfig.postProcess(text)

ret = ret.setKey(i, k, text)
attrConfig.setKeyAndSplit(&ret, text, k, i)
}
}
}
Expand Down
38 changes: 38 additions & 0 deletions pkg/scraper/xpath_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,14 @@ func makeSceneXPathConfig() xpathScraper {
studioConfig["URL"] = `$studioElem/@href`
config["Studio"] = studioConfig

const sep = " "
moviesNameConfig := make(map[interface{}]interface{})
moviesNameConfig["selector"] = `//i[@class="isMe tooltipTrig"]/@data-title`
moviesNameConfig["split"] = sep
moviesConfig := make(map[interface{}]interface{})
moviesConfig["Name"] = moviesNameConfig
config["Movies"] = moviesConfig

scraper := xpathScraper{
Scene: config,
Common: common,
Expand Down Expand Up @@ -692,6 +700,27 @@ func verifyTags(t *testing.T, expectedTagNames []string, actualTags []*models.Sc
}
}

// verifyMovies compares the expected movie names against the names of the
// scraped movies position by position. A missing entry on either side is
// treated as an empty name, so a length mismatch is reported as an error
// for each unmatched position.
func verifyMovies(t *testing.T, expectedMovieNames []string, actualMovies []*models.ScrapedSceneMovie) {
	t.Helper()

	// walk as far as the longer of the two lists
	count := len(expectedMovieNames)
	if len(actualMovies) > count {
		count = len(actualMovies)
	}

	for idx := 0; idx < count; idx++ {
		var want, got string
		if idx < len(expectedMovieNames) {
			want = expectedMovieNames[idx]
		}
		if idx < len(actualMovies) {
			got = actualMovies[idx].Name
		}

		if want != got {
			t.Errorf("Expected movie %s, got %s", want, got)
		}
	}
}

func verifyPerformers(t *testing.T, expectedNames []string, expectedURLs []string, actualPerformers []*models.ScrapedScenePerformer) {
t.Helper()

Expand Down Expand Up @@ -761,6 +790,15 @@ func TestApplySceneXPathConfig(t *testing.T) {
}
verifyTags(t, expectedTags, scene.Tags)

// verify movies
expectedMovies := []string{
"Video",
"of",
"verified",
"member",
}
verifyMovies(t, expectedMovies, scene.Movies)

expectedPerformerNames := []string{
"Alex D",
"Mia Malkova",
Expand Down
1 change: 1 addition & 0 deletions ui/v2.5/src/components/Changelog/versions/v030.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ const markup = `
* Add support for parent/child studios.

### 🎨 Improvements
* Add split xpath post-processing action.
* Improved the layout of the scene page.
* Show rating as stars in scene page.
* Add reload scrapers button.
Expand Down
4 changes: 4 additions & 0 deletions ui/v2.5/src/index.scss
Original file line number Diff line number Diff line change
Expand Up @@ -488,3 +488,7 @@ div.dropdown-menu {
text-transform: uppercase;
}
}

// Utility class: pre-line collapses runs of spaces and tabs but keeps line
// breaks, so newline characters in the text are rendered as line breaks.
.pre {
white-space: pre-line;
}