Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Collection of Comp Sci Bibliographies fetcher #6664

Merged
merged 18 commits into from
Jul 8, 2020
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve

### Added

- We added a new fetcher to enable users to search "[Collection of Computer Science Bibliographies](https://liinwww.ira.uka.de/bibliography/index.html)". [#6638](https://github.com/JabRef/jabref/issues/6638)
- We added default values for delimiters in the Add Subgroup window. [#6624](https://github.com/JabRef/jabref/issues/6624)
- We improved responsiveness of general fields specification dialog window. [#6643](https://github.com/JabRef/jabref/issues/6604)
- We added support for importing RIS files and loading DOIs. [#6530](https://github.com/JabRef/jabref/issues/6530)
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/org/jabref/logic/importer/WebFetchers.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.jabref.logic.importer.fetcher.ArXiv;
import org.jabref.logic.importer.fetcher.AstrophysicsDataSystem;
import org.jabref.logic.importer.fetcher.CiteSeer;
import org.jabref.logic.importer.fetcher.CollectionOfComputerScienceBibliographiesFetcher;
import org.jabref.logic.importer.fetcher.CompositeSearchBasedFetcher;
import org.jabref.logic.importer.fetcher.CrossRef;
import org.jabref.logic.importer.fetcher.DBLPFetcher;
Expand Down Expand Up @@ -101,6 +102,7 @@ public static SortedSet<SearchBasedFetcher> getSearchBasedFetchers(ImportFormatP
set.add(new DOAJFetcher(importFormatPreferences));
set.add(new IEEE(importFormatPreferences));
set.add(new CompositeSearchBasedFetcher(set, 30));
set.add(new CollectionOfComputerScienceBibliographiesFetcher(importFormatPreferences));
return set;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package org.jabref.logic.importer.fetcher;

import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;

import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.Parser;
import org.jabref.logic.importer.SearchBasedParserFetcher;

import org.apache.http.client.utils.URIBuilder;

public class CollectionOfComputerScienceBibliographiesFetcher implements SearchBasedParserFetcher {

private static final String BASIC_SEARCH_URL = "http://liinwww.ira.uka.de/bibliography/rss?";
daniel-price marked this conversation as resolved.
Show resolved Hide resolved

private final CollectionOfComputerScienceBibliographiesParser parser;

public CollectionOfComputerScienceBibliographiesFetcher(ImportFormatPreferences importFormatPreferences) {
this.parser = new CollectionOfComputerScienceBibliographiesParser(importFormatPreferences);
}

@Override
public URL getURLForQuery(String query) throws URISyntaxException, MalformedURLException, FetcherException {
return new URIBuilder(BASIC_SEARCH_URL)
.addParameter("query", query)
.addParameter("sort", "score")
.build()
.toURL();
}

@Override
public Parser getParser() {
return parser;
}

@Override
public String getName() {
return "Collection of Computer Science Bibliographies";
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
package org.jabref.logic.importer.fetcher;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.jabref.logic.formatter.bibtexfields.HtmlToUnicodeFormatter;
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.ParseException;
import org.jabref.logic.importer.Parser;
import org.jabref.logic.importer.fileformat.BibtexParser;
import org.jabref.logic.net.URLDownload;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.util.DummyFileUpdateMonitor;

/**
 * Parser for search results of the "Collection of Computer Science Bibliographies".
 * <p>
 * Parsing is a two-step process: the input (the RSS search result) is scanned for the
 * {@code <link>} of every {@code <item>}; each link is then downloaded and the contents of all
 * {@code <pre class="bibtex">} elements on that page are collected and handed to a
 * {@link BibtexParser}. Parsing therefore performs one network request per linked result.
 */
public class CollectionOfComputerScienceBibliographiesParser implements Parser {

    /** Captures the content of the first {@code <link>} inside each {@code <item>} element. */
    static final Pattern REGEX_FOR_LINKS = Pattern.compile("<item>[\\s\\S]*?<link>([\\s\\S]*?)<\\/link>[\\s\\S]*?<\\/item>");
    /** Captures the content of each {@code <pre class="bibtex">} element. */
    static final Pattern REGEX_FOR_BIBTEX = Pattern.compile("<pre class=\"bibtex\">([\\s\\S]*?)<\\/pre>");

    final BibtexParser bibtexParser;
    final HtmlToUnicodeFormatter htmlToUnicodeFormatter;

    /**
     * @param importFormatPreferences preferences used to configure the underlying {@link BibtexParser}
     */
    public CollectionOfComputerScienceBibliographiesParser(ImportFormatPreferences importFormatPreferences) {
        this.bibtexParser = new BibtexParser(importFormatPreferences, new DummyFileUpdateMonitor());
        this.htmlToUnicodeFormatter = new HtmlToUnicodeFormatter();
    }

    /**
     * Extracts the result links from the given stream, downloads the BibTeX data behind each of
     * them and parses the concatenated BibTeX source.
     *
     * @param inputStream the search result; it is closed by this method (via the {@link Scanner} reading it)
     * @return the entries parsed from all linked pages
     * @throws ParseException if downloading one of the linked pages fails or the BibTeX data cannot be parsed
     */
    @Override
    public List<BibEntry> parseEntries(InputStream inputStream) throws ParseException {
        try {
            List<String> links = matchRegexFromInputStreamHtml(inputStream, REGEX_FOR_LINKS);
            String bibtexDataString = String.join("", parseBibtexStringsFromLinks(links));

            return bibtexParser.parseEntries(bibtexDataString);
        } catch (IOException e) {
            throw new ParseException(e);
        }
    }

    /**
     * Returns the first capture group of every match of {@code pattern} in the stream, passed
     * through the {@link HtmlToUnicodeFormatter} (so that e.g. {@code &amp;} inside links is decoded).
     * <p>
     * Closing the scanner also closes {@code inputStream}.
     * NOTE(review): the scanner decodes the stream with the platform default charset - confirm this
     * matches the encoding the service actually delivers.
     */
    private List<String> matchRegexFromInputStreamHtml(InputStream inputStream, Pattern pattern) {
        try (Scanner scanner = new Scanner(inputStream)) {
            return scanner.findAll(pattern)
                          .map(match -> htmlToUnicodeFormatter.format(match.group(1)))
                          .collect(Collectors.toList());
        }
    }

    /**
     * Downloads every link and collects the BibTeX snippets found on the pages, in link order.
     *
     * @throws IOException if one of the pages cannot be downloaded
     */
    private List<String> parseBibtexStringsFromLinks(List<String> links) throws IOException {
        // Typed with the diamond operator; the raw ArrayList used before caused an unchecked warning
        List<String> bibtexStringsFromAllLinks = new ArrayList<>();
        for (String link : links) {
            try (InputStream inputStream = new URLDownload(link).asInputStream()) {
                bibtexStringsFromAllLinks.addAll(matchRegexFromInputStreamHtml(inputStream, REGEX_FOR_BIBTEX));
            }
        }

        return bibtexStringsFromAllLinks;
    }
}

4 changes: 4 additions & 0 deletions src/test/java/org/jabref/logic/importer/WebFetchersTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import java.util.Set;
import java.util.stream.Collectors;

import org.jabref.logic.bibtex.FieldContentFormatterPreferences;
import org.jabref.logic.importer.fetcher.ACMPortalFetcher;
import org.jabref.logic.importer.fetcher.AbstractIsbnFetcher;
import org.jabref.logic.importer.fetcher.GrobidCitationFetcher;
Expand All @@ -20,6 +21,7 @@

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

class WebFetchersTest {

Expand All @@ -29,6 +31,8 @@ class WebFetchersTest {
@BeforeEach
void setUp() throws Exception {
    // The import preferences mock must hand out a (mocked) field content formatter
    // preferences object instead of Mockito's default null.
    FieldContentFormatterPreferences formatterPreferences = mock(FieldContentFormatterPreferences.class);
    importFormatPreferences = mock(ImportFormatPreferences.class);
    when(importFormatPreferences.getFieldContentFormatterPreferences()).thenReturn(formatterPreferences);
}

@Test
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package org.jabref.logic.importer.fetcher;

import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Collections;
import java.util.List;

import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;
import org.jabref.testutils.category.FetcherTest;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Answers;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

@FetcherTest
class CollectionOfComputerScienceBibliographiesFetcherTest {
daniel-price marked this conversation as resolved.
Show resolved Hide resolved
private CollectionOfComputerScienceBibliographiesFetcher fetcher;

@BeforeEach
public void setUp() {
ImportFormatPreferences importFormatPreferences = mock(ImportFormatPreferences.class, Answers.RETURNS_DEEP_STUBS);
when(importFormatPreferences.getKeywordSeparator()).thenReturn(',');
fetcher = new CollectionOfComputerScienceBibliographiesFetcher(importFormatPreferences);
}

@Test
public void getNameReturnsCorrectName() {
assertEquals("Collection of Computer Science Bibliographies", fetcher.getName());
}

@Test
public void getUrlForQueryReturnsCorrectUrl() throws MalformedURLException, URISyntaxException, FetcherException {
String query = "java jdk";
URL url = fetcher.getURLForQuery(query);
assertEquals("http://liinwww.ira.uka.de/bibliography/rss?query=java+jdk&sort=score", url.toString());
}

@Test
public void performSearchReturnsMatchingMultipleEntries() throws FetcherException {
List<BibEntry> searchResult = fetcher.performSearch("jabref");
BibEntry bibEntry = searchResult.get(0);
assertNotNull(bibEntry.getField(StandardField.ABSTRACT));
daniel-price marked this conversation as resolved.
Show resolved Hide resolved
assertNotNull(bibEntry.getField(StandardField.AUTHOR));
assertNotNull(bibEntry.getField(StandardField.URL));
assertNotNull(bibEntry.getField(StandardField.YEAR));
assertNotNull(bibEntry.getField(StandardField.TITLE));
assertNotNull(bibEntry.getField(StandardField.TYPE));
}

@Test
public void performSearchReturnsEmptyListForEmptySearch() throws FetcherException {
List<BibEntry> searchResult = fetcher.performSearch("");
assertEquals(Collections.emptyList(), searchResult);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package org.jabref.logic.importer.fetcher;

import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import org.jabref.logic.bibtex.BibEntryAssert;
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.model.entry.BibEntry;
import org.jabref.testutils.category.FetcherTest;

import org.junit.jupiter.api.Test;
import org.mockito.Answers;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

@FetcherTest
public class CollectionOfComputerScienceBibliographiesParserTest {
@Test
public void parseEntriesReturnsEmptyListIfXmlHasNoResults() throws Exception {
parseXmlAndCheckResults("collection_of_computer_science_bibliographies_empty_result.xml", Collections.emptyList());
}

@Test
public void parseEntriesReturnsOneBibEntryInListIfXmlHasSingleResult() throws Exception {
parseXmlAndCheckResults("collection_of_computer_science_bibliographies_single_result.xml", Collections.singletonList("collection_of_computer_science_bibliographies_single_result.bib"));
}

@Test
public void parseEntriesReturnsMultipleBibEntriesInListIfXmlHasMultipleResults() throws Exception {
parseXmlAndCheckResults("collection_of_computer_science_bibliographies_multiple_results.xml", Arrays.asList("collection_of_computer_science_bibliographies_multiple_results_first_result.bib", "collection_of_computer_science_bibliographies_multiple_results_second_result.bib"));
}

private void parseXmlAndCheckResults(String xmlName, List<String> resourceNames) throws Exception {
ImportFormatPreferences importFormatPreferences = mock(ImportFormatPreferences.class, Answers.RETURNS_DEEP_STUBS);
when(importFormatPreferences.getKeywordSeparator()).thenReturn(',');

InputStream is = CollectionOfComputerScienceBibliographiesParserTest.class.getResourceAsStream(xmlName);
CollectionOfComputerScienceBibliographiesParser parser = new CollectionOfComputerScienceBibliographiesParser(importFormatPreferences);
List<BibEntry> entries = parser.parseEntries(is);
assertNotNull(entries);
assertEquals(resourceNames.size(), entries.size());
for (int i = 0; i < resourceNames.size(); i++) {
BibEntryAssert.assertEquals(GvkParserTest.class, resourceNames.get(i), entries.get(i));
daniel-price marked this conversation as resolved.
Show resolved Hide resolved
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet title="XSL_formatting" type="text/xsl" href="http://liinwww.ira.uka.de/bibliography/rss.xsl"?><rss version="2.0">

<channel xmlns:dc="http://purl.org/dc/elements/1.1/">

<title>CCSB: "test string which returns no results"</title>
<link>http://liinwww.ira.uka.de/bibliography/#search</link>
<description>Search results in The Collection of Computer Science Bibliographies for query: "test string which returns no results"</description>
<language>en</language>
daniel-price marked this conversation as resolved.
Show resolved Hide resolved
<copyright>The data is available for noncommercial or private use only, harvesting is prohibited (the data may be obtained using other means and not this RSS feed).</copyright>
<webMaster>[email protected]</webMaster>
<lastBuildDate>Mon, 09 Mar 2020 03:14:28 +0100</lastBuildDate>
<ttl>5760</ttl>

</channel>
</rss>
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet title="XSL_formatting" type="text/xsl" href="http://liinwww.ira.uka.de/bibliography/rss.xsl"?><rss version="2.0">

<channel xmlns:dc="http://purl.org/dc/elements/1.1/">

<title>CCSB: +"effective java" +"joshua bloch" +"java series"</title>
<link>http://liinwww.ira.uka.de/bibliography/#search</link>
<description>Search results in The Collection of Computer Science Bibliographies for query: +"effective java" +"joshua bloch" +"java series"</description>
<language>en</language>
daniel-price marked this conversation as resolved.
Show resolved Hide resolved
<copyright>The data is available for noncommercial or private use only, harvesting is prohibited (the data may be obtained using other means and not this RSS feed).</copyright>
<webMaster>[email protected]</webMaster>
<lastBuildDate>Mon, 09 Mar 2020 03:14:28 +0100</lastBuildDate>
<ttl>5760</ttl>

<item>
<description>
<p>
Author: Joshua Bloch;
<br/>
Title: Effective Java: Programming Language Guide;
<br/>
Year: 2001;
<br/>
Abstract available;
<br/>
4 records for this title/author combination available.
</p>
</description>

<dc:title>Effective Java: Programming Language Guide</dc:title>

<dc:date>2001</dc:date>

<dc:creator>Joshua Bloch</dc:creator>

<title>[2001] Effective Java: Programming Language Guide (by: Joshua Bloch)</title>

</item>

<item>
<link>http://liinwww.ira.uka.de/searchbib/index?query=hpdtjrpbcpgllljdctmdkfnhqdcnrkkc&amp;results=bibtex&amp;mode=dup&amp;rss=1</link>

<dc:creator>Joshua Bloch</dc:creator>

<dc:title>Effective Java</dc:title>

<description>
<p>
Author: Joshua Bloch;
<br/>
Title: Effective Java;
<br/>
Year: 2001;
<br/>
URLs available (possible fulltext access);
<br/>
2 records for this title/author combination available.
</p>
</description>

<title>[2001] Effective Java (by: Joshua Bloch)</title>

</item>

</channel>
</rss>
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
@book{Bloch:2008:EJ,
acknowledgement = {Nelson H. F. Beebe, University of Utah, Department of Mathematics, 110 LCB, 155 S 1400 E RM 233, Salt Lake City, UT 84112-0090, USA, Tel: +1 801 581 5254, FAX: +1 801 581 4148, e-mail: \path|[email protected]|, \path|[email protected]|, \path|[email protected]| (Internet), URL: \path|http://www.math.utah.edu/~beebe/|},
address = {pub-AW:adr},
author = {Joshua Bloch},
bibdate = {Tue Jan 27 16:10:46 MST 2009},
bibsource = {http://www.math.utah.edu/pub/tex/bib/java2000.bib; z3950.loc.gov:7090/Voyager},
edition = {Second},
isbn = {0-321-35668-3 (paperback)},
isbn-13 = {978-0-321-35668-0 (paperback)},
lccn = {QA76.73.J38 B57 2008},
pages = {xxi + 346},
publisher = {Ad{\-d}i{\-s}on-Wes{\-l}ey},
remark = {Revised and updated for Java SE 6.},
series = {The Java series},
subject = {Java (Computer program language)},
tableofcontents = {Introduction \\ Creating and destroying objects \\ Methods common to all objects \\ Classes and interfaces \\ Generics \\ Enums and annotations \\ Methods \\ General programming \\ Exceptions \\ Concurrency \\ Serialization},
title = {Effective {Java}},
url = {http://www.loc.gov/catdir/toc/fy0805/2008926278.html},
year = {2008}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
@book{Bloch2001,
added-at = {Tue Sep 16 09:14:19 2003},
added-by = {msteiner},
annote = {Good book on various rules \& conventions (e.g., what not to do and what instead) to use Java effectively},
author = {Joshua Bloch},
publisher = {Addison-Wesley},
series = {The Java Series},
title = {Effective Java},
year = {2001}
}
Loading