Skip to content

Commit

Permalink
Init commit
Browse files Browse the repository at this point in the history
Importer for the TDT dataset
  • Loading branch information
Midas-M committed Nov 27, 2015
0 parents commit bedac56
Show file tree
Hide file tree
Showing 23 changed files with 1,913 additions and 0 deletions.
37 changes: 37 additions & 0 deletions .classpath
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" output="target/classes" path="src/main/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="src" output="target/test-classes" path="src/test/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
<classpathentry kind="output" path="target/classes"/>
</classpath>
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/target/
23 changes: 23 additions & 0 deletions .project
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>MatchTheNewsCore</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.m2e.core.maven2Builder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.jdt.core.javanature</nature>
<nature>org.eclipse.m2e.core.maven2Nature</nature>
</natures>
</projectDescription>
5 changes: 5 additions & 0 deletions .settings/org.eclipse.jdt.core.prefs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
org.eclipse.jdt.core.compiler.compliance=1.8
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.source=1.8
4 changes: 4 additions & 0 deletions .settings/org.eclipse.m2e.core.prefs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
activeProfiles=
eclipse.preferences.version=1
resolveWorkspaceProjects=true
version=1
70 changes: 70 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>gr.aueb</groupId>
<artifactId>CommonDataImporters</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>CommonDataImporters</name>

<build>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>

<dependencies>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
<version>3.3</version>
</dependency>

<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.1</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>4.10.3</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.11</version>
</dependency>

<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>4.10.3</version>
</dependency>

<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>4.10.3</version>
</dependency>


<dependency>
<groupId>gov.nist.math</groupId>
<artifactId>jama</artifactId>
<version>1.0.3</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.3.2</version>
</dependency>

</dependencies>

</project>
13 changes: 13 additions & 0 deletions properties.prop
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
rTOPICS_BY_LANGUAGE_FILEPATH=/Datasets/TDT/tdt5_topic_annot/docs/TDT2004.topics_by_language
TOPICS_DESCRIPTION_FILEPATH=/Datasets/TDT/data/tdt5_topic_annot/docs/TDT2004-topic_profiles.html
LINK_DETECTION_RESULT_FOLDER=/TDT/Results/LinkDetection/
TOPICS_RELEVANCE_FILEPATH=/Datasets/TDT/tdt5_topic_annot/data/annotations/topic_relevance/TDT2004.topic_rel.v2.0
LINK_DETECTION_FILEPATH=/Datasets/TDT/tdt5_topic_annot/data/annotations/link_detection/lnk_SR=nwt_TE=mul,eng.key
DOCUMENT_FOLDER_IMPORT_GOOGLE_NEWS=/GoogleNews/
DOCUMENT_FOLDER_IMPORT_GOOGLE_NEWS_TRAIN=/GoogleNews/Train_Test_Data/Train.txt
DOCUMENT_FOLDER_IMPORT_GOOGLE_NEWS_TEST=/GoogleNews/Train_Test_Data/Test.txt
DOCUMENT_FOLDER_IMPORT_TKN=/Datasets/TDT/tkn_sgm
DOCUMENT_FOLDER_IMPORT_MTTKN=/Datasets/TDT/mttkn_sgm
USE_PORTER_STEMMER=true
ELIMINATE_STOPWORDS=true
READ_ONLY_ENGLISH_DOCUMENTS=true
105 changes: 105 additions & 0 deletions src/main/java/gr/aueb/dbnet/importers/GoogleNewsImporter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
package gr.aueb.dbnet.importers;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.commons.math3.random.MersenneTwister;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

import gr.aueb.dbnet.tdt.structures.Document;
import gr.aueb.dbnet.util.SystemProperties;

/**
 * Importer that reads Google News documents from Excel (HSSF / .xls) workbooks.
 *
 * <p>Every regular file directly inside
 * {@code user.home + SystemProperties.DOCUMENT_FOLDER_IMPORT_GOOGLE_NEWS} is opened as a
 * workbook. The first sheet is scanned from the second row on (row index 0 is skipped,
 * presumably a header row), and one {@link Document} is created per row from the id in
 * column 0 and the text in column 7. The cluster label of every document in a workbook is
 * derived from the workbook's file name.
 */
public class GoogleNewsImporter extends Importer {

    /** Zero-based column holding the document id. */
    private static final int ID_COLUMN = 0;

    /** Zero-based column holding the document text. */
    private static final int TEXT_COLUMN = 7;

    /** All imported documents, keyed by document id; replaced on every {@link #importData()} call. */
    private ConcurrentHashMap<String, Document> documents_google;

    /** Never assigned in this class, so {@link #getTrainDocuments()} currently returns {@code null}. */
    private ConcurrentHashMap<String, Document> trainDocuments;

    /** Never assigned in this class, so {@link #getTestDocuments()} currently returns {@code null}. */
    private ConcurrentHashMap<String, Document> testDocuments;

    /**
     * Reads every workbook in the Google News import folder into a fresh document map.
     *
     * <p>A failure while reading one workbook is reported on standard error and does not stop
     * the import of the remaining files. Rows that are missing the id or text cell are skipped.
     *
     * @throws IOException if the import folder does not exist, is not a directory, or cannot be listed
     */
    @Override
    public void importData() throws IOException {
        documents_google = new ConcurrentHashMap<String, Document>();
        String data_path = System.getProperty("user.home") + SystemProperties.DOCUMENT_FOLDER_IMPORT_GOOGLE_NEWS;
        File folder = new File(data_path);

        // listFiles() returns null (rather than throwing) when the path is not a listable directory.
        File[] entries = folder.listFiles();
        if (entries == null) {
            throw new IOException("Cannot list Google News import folder: " + folder.getAbsolutePath());
        }

        for (File fileEntry : entries) {
            if (fileEntry.isDirectory()) {
                continue;
            }
            System.out.println(fileEntry.getName());

            // try-with-resources guarantees the file handle is released even when parsing fails.
            try (FileInputStream in = new FileInputStream(fileEntry)) {
                HSSFWorkbook wb = new HSSFWorkbook(new POIFSFileSystem(in));
                HSSFSheet sheet = wb.getSheetAt(0);

                // Cluster label: the part of the file name before the first '.', then the
                // segment following "oct_". A name without "oct_" raises an exception here,
                // which skips the whole file via the catch block below.
                String cluster = fileEntry.getName().split("\\.")[0].split("oct_")[1];

                // Iterate up to the last row index instead of the physical row count, so that
                // sheets with empty rows in between do not lose their trailing rows.
                int lastRow = sheet.getLastRowNum();
                for (int r = 1; r <= lastRow; r++) {
                    HSSFRow row = sheet.getRow(r);
                    if (row == null) {
                        continue;
                    }
                    HSSFCell idCell = row.getCell(ID_COLUMN);
                    HSSFCell textCell = row.getCell(TEXT_COLUMN);
                    if (idCell == null || textCell == null) {
                        // Incomplete row: skip it rather than abandoning the rest of the workbook.
                        continue;
                    }
                    String id = idCell.toString();
                    String text = textCell.toString();
                    documents_google.put(id, new Document(id, text, cluster));
                }
            } catch (Exception e) {
                // Best effort: a broken workbook must not prevent the other files from loading.
                e.printStackTrace();
            }
        }
    }

    /**
     * Returns the training documents.
     *
     * @return the training map; {@code null} in the current implementation because nothing
     *         in this class populates it
     */
    @Override
    public Map<String, ? extends Document> getTrainDocuments() {
        return trainDocuments;
    }

    /**
     * Returns the test documents.
     *
     * @return the test map; {@code null} in the current implementation because nothing
     *         in this class populates it
     */
    @Override
    public Map<String, ? extends Document> getTestDocuments() {
        return testDocuments;
    }

    /**
     * Returns all documents loaded by the most recent {@link #importData()} call.
     *
     * @return the documents keyed by id, or {@code null} if {@link #importData()} has not been called
     */
    @Override
    public Map<String, ? extends Document> getData() {
        return documents_google;
    }
}
18 changes: 18 additions & 0 deletions src/main/java/gr/aueb/dbnet/importers/Importer.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package gr.aueb.dbnet.importers;

import gr.aueb.dbnet.tdt.structures.Document;

import java.io.IOException;
import java.util.Map;

/**
 * Common base class for dataset importers.
 *
 * <p>Subclasses expose the loaded documents through three map accessors. The maps are keyed
 * by {@code String}; what the key means is decided by the subclass (in
 * {@code GoogleNewsImporter} it is the document id).
 */
public abstract class Importer {

    /**
     * Loads the dataset. The base implementation is intentionally a no-op; subclasses
     * override it to perform the actual import.
     *
     * @throws IOException never thrown here; declared so that overriding implementations
     *         may report I/O failures
     */
    public void importData() throws IOException {
        // Nothing to load in the base class.
    }

    /**
     * @return every document made available by this importer
     */
    public abstract Map<String, ? extends Document> getData();

    /**
     * @return the documents this importer designates as the training set
     */
    public abstract Map<String, ? extends Document> getTrainDocuments();

    /**
     * @return the documents this importer designates as the test set
     */
    public abstract Map<String, ? extends Document> getTestDocuments();
}
Loading

0 comments on commit bedac56

Please sign in to comment.