-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Importer for the TDT dataset
- Loading branch information
0 parents
commit bedac56
Showing
23 changed files
with
1,913 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<classpath> | ||
<classpathentry kind="src" output="target/classes" path="src/main/java"> | ||
<attributes> | ||
<attribute name="optional" value="true"/> | ||
<attribute name="maven.pomderived" value="true"/> | ||
</attributes> | ||
</classpathentry> | ||
<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources"> | ||
<attributes> | ||
<attribute name="maven.pomderived" value="true"/> | ||
</attributes> | ||
</classpathentry> | ||
<classpathentry kind="src" output="target/test-classes" path="src/test/java"> | ||
<attributes> | ||
<attribute name="optional" value="true"/> | ||
<attribute name="maven.pomderived" value="true"/> | ||
</attributes> | ||
</classpathentry> | ||
<classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources"> | ||
<attributes> | ||
<attribute name="maven.pomderived" value="true"/> | ||
</attributes> | ||
</classpathentry> | ||
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8"> | ||
<attributes> | ||
<attribute name="maven.pomderived" value="true"/> | ||
</attributes> | ||
</classpathentry> | ||
<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER"> | ||
<attributes> | ||
<attribute name="maven.pomderived" value="true"/> | ||
</attributes> | ||
</classpathentry> | ||
<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/> | ||
<classpathentry kind="output" path="target/classes"/> | ||
</classpath> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
/target/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<projectDescription> | ||
<name>MatchTheNewsCore</name> | ||
<comment></comment> | ||
<projects> | ||
</projects> | ||
<buildSpec> | ||
<buildCommand> | ||
<name>org.eclipse.jdt.core.javabuilder</name> | ||
<arguments> | ||
</arguments> | ||
</buildCommand> | ||
<buildCommand> | ||
<name>org.eclipse.m2e.core.maven2Builder</name> | ||
<arguments> | ||
</arguments> | ||
</buildCommand> | ||
</buildSpec> | ||
<natures> | ||
<nature>org.eclipse.jdt.core.javanature</nature> | ||
<nature>org.eclipse.m2e.core.maven2Nature</nature> | ||
</natures> | ||
</projectDescription> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
eclipse.preferences.version=1 | ||
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 | ||
org.eclipse.jdt.core.compiler.compliance=1.8 | ||
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning | ||
org.eclipse.jdt.core.compiler.source=1.8 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
activeProfiles= | ||
eclipse.preferences.version=1 | ||
resolveWorkspaceProjects=true | ||
version=1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
<modelVersion>4.0.0</modelVersion> | ||
<groupId>gr.aueb</groupId> | ||
<artifactId>CommonDataImporters</artifactId> | ||
<version>0.0.1-SNAPSHOT</version> | ||
<name>CommonDataImporters</name> | ||
|
||
<build> | ||
<plugins> | ||
<plugin> | ||
<artifactId>maven-compiler-plugin</artifactId> | ||
<version>3.1</version> | ||
<configuration> | ||
<source>1.8</source> | ||
<target>1.8</target> | ||
</configuration> | ||
</plugin> | ||
</plugins> | ||
</build> | ||
|
||
<dependencies> | ||
<dependency> | ||
<groupId>org.apache.commons</groupId> | ||
<artifactId>commons-math3</artifactId> | ||
<version>3.3</version> | ||
</dependency> | ||
|
||
<dependency> | ||
<groupId>org.jsoup</groupId> | ||
<artifactId>jsoup</artifactId> | ||
<version>1.8.1</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>org.apache.lucene</groupId> | ||
<artifactId>lucene-core</artifactId> | ||
<version>4.10.3</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>org.apache.poi</groupId> | ||
<artifactId>poi</artifactId> | ||
<version>3.11</version> | ||
</dependency> | ||
|
||
<dependency> | ||
<groupId>org.apache.lucene</groupId> | ||
<artifactId>lucene-analyzers-common</artifactId> | ||
<version>4.10.3</version> | ||
</dependency> | ||
|
||
<dependency> | ||
<groupId>org.apache.lucene</groupId> | ||
<artifactId>lucene-queryparser</artifactId> | ||
<version>4.10.3</version> | ||
</dependency> | ||
|
||
|
||
<dependency> | ||
<groupId>gov.nist.math</groupId> | ||
<artifactId>jama</artifactId> | ||
<version>1.0.3</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>org.apache.commons</groupId> | ||
<artifactId>commons-lang3</artifactId> | ||
<version>3.3.2</version> | ||
</dependency> | ||
|
||
</dependencies> | ||
|
||
</project> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
TOPICS_BY_LANGUAGE_FILEPATH=/Datasets/TDT/tdt5_topic_annot/docs/TDT2004.topics_by_language | ||
TOPICS_DESCRIPTION_FILEPATH=/Datasets/TDT/data/tdt5_topic_annot/docs/TDT2004-topic_profiles.html | ||
LINK_DETECTION_RESULT_FOLDER=/TDT/Results/LinkDetection/ | ||
TOPICS_RELEVANCE_FILEPATH=/Datasets/TDT/tdt5_topic_annot/data/annotations/topic_relevance/TDT2004.topic_rel.v2.0 | ||
LINK_DETECTION_FILEPATH=/Datasets/TDT/tdt5_topic_annot/data/annotations/link_detection/lnk_SR=nwt_TE=mul,eng.key | ||
DOCUMENT_FOLDER_IMPORT_GOOGLE_NEWS=/GoogleNews/ | ||
DOCUMENT_FOLDER_IMPORT_GOOGLE_NEWS_TRAIN=/GoogleNews/Train_Test_Data/Train.txt | ||
DOCUMENT_FOLDER_IMPORT_GOOGLE_NEWS_TEST=/GoogleNews/Train_Test_Data/Test.txt | ||
DOCUMENT_FOLDER_IMPORT_TKN=/Datasets/TDT/tkn_sgm | ||
DOCUMENT_FOLDER_IMPORT_MTTKN=/Datasets/TDT/mttkn_sgm | ||
USE_PORTER_STEMMER=true | ||
ELIMINATE_STOPWORDS=true | ||
READ_ONLY_ENGLISH_DOCUMENTS=true |
105 changes: 105 additions & 0 deletions
105
src/main/java/gr/aueb/dbnet/importers/GoogleNewsImporter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
package gr.aueb.dbnet.importers; | ||
|
||
import java.io.File; | ||
import java.io.FileInputStream; | ||
import java.io.IOException; | ||
import java.io.PrintWriter; | ||
import java.nio.file.Files; | ||
import java.nio.file.Paths; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Map.Entry; | ||
import java.util.concurrent.ConcurrentHashMap; | ||
|
||
import org.apache.commons.math3.random.MersenneTwister; | ||
import org.apache.poi.hssf.usermodel.HSSFCell; | ||
import org.apache.poi.hssf.usermodel.HSSFRow; | ||
import org.apache.poi.hssf.usermodel.HSSFSheet; | ||
import org.apache.poi.hssf.usermodel.HSSFWorkbook; | ||
import org.apache.poi.poifs.filesystem.POIFSFileSystem; | ||
|
||
import gr.aueb.dbnet.tdt.structures.Document; | ||
import gr.aueb.dbnet.util.SystemProperties; | ||
|
||
public class GoogleNewsImporter extends Importer { | ||
|
||
private ConcurrentHashMap<String,Document> documents_google; | ||
private ConcurrentHashMap<String, Document> trainDocuments; | ||
private ConcurrentHashMap<String, Document> testDocuments; | ||
|
||
@Override | ||
public void importData() throws IOException{ | ||
|
||
documents_google = new ConcurrentHashMap<String, Document>(); | ||
String data_path = System.getProperty("user.home")+SystemProperties.DOCUMENT_FOLDER_IMPORT_GOOGLE_NEWS; | ||
File folder = new File(data_path); | ||
|
||
for (File fileEntry : folder.listFiles()) { | ||
if(fileEntry.isDirectory()) | ||
continue; | ||
System.out.println(fileEntry.getName()); | ||
try { | ||
POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(fileEntry)); | ||
HSSFWorkbook wb = new HSSFWorkbook(fs); | ||
HSSFSheet sheet = wb.getSheetAt(0); | ||
HSSFRow row; | ||
HSSFCell cell; | ||
|
||
int rows; // No of rows | ||
rows = sheet.getPhysicalNumberOfRows(); | ||
|
||
int cols = 0; // No of columns | ||
int tmp = 0; | ||
|
||
// This trick ensures that we get the data properly even if it doesn't start from first few rows | ||
for(int i = 0; i < 10 || i < rows; i++) { | ||
row = sheet.getRow(i); | ||
if(row != null) { | ||
tmp = sheet.getRow(i).getPhysicalNumberOfCells(); | ||
if(tmp > cols) cols = tmp; | ||
} | ||
} | ||
|
||
for(int r = 1; r < rows; r++) { | ||
row = sheet.getRow(r); | ||
if(row != null) { | ||
//TODO | ||
cell= row.getCell(0); | ||
String id =cell.toString(); | ||
cell=row.getCell(7); | ||
String text = cell.toString(); | ||
//cell=row.getCell(6); | ||
//String cluster=cell.toString(); | ||
String cluster=fileEntry.getName().split("\\.")[0].split("oct_")[1]; | ||
documents_google.put(id, new Document(id, text, cluster)); | ||
} | ||
|
||
} | ||
} catch(Exception ioe) { | ||
ioe.printStackTrace(); | ||
} | ||
} | ||
} | ||
|
||
|
||
@Override | ||
public Map<String, ? extends Document> getTrainDocuments() { | ||
return trainDocuments; | ||
} | ||
|
||
@Override | ||
public Map<String, ? extends Document> getTestDocuments() { | ||
return testDocuments; | ||
} | ||
|
||
|
||
@Override | ||
public Map<String, ? extends Document> getData() { | ||
// TODO Auto-generated method stub | ||
|
||
return documents_google; | ||
} | ||
|
||
|
||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
package gr.aueb.dbnet.importers; | ||
|
||
import gr.aueb.dbnet.tdt.structures.Document; | ||
|
||
import java.io.IOException; | ||
import java.util.Map; | ||
|
||
public abstract class Importer { | ||
|
||
public void importData() throws IOException { | ||
} | ||
|
||
public abstract Map<String, ? extends Document> getTrainDocuments() ; | ||
|
||
public abstract Map<String, ? extends Document> getTestDocuments() ; | ||
public abstract Map<String, ? extends Document> getData() ; | ||
|
||
} |
Oops, something went wrong.