Skip to content

Commit

Permalink
added subclustering and new output template for this 2 level hierarchy
Browse files Browse the repository at this point in the history
  • Loading branch information
peichins committed Jul 28, 2017
1 parent 35be415 commit 6b7984e
Show file tree
Hide file tree
Showing 6 changed files with 471 additions and 5 deletions.
85 changes: 85 additions & 0 deletions AudioAnalysis/RCode/Phil/sampleselection/source/cluster.R
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,91 @@ ClusterEvents <- function (num.groups = 'auto',

}


ClusterForInspection2 <- function () {

vals <- GetEventsAndFeaturesTDCC() # todo: fix stuff so that it generalises to use any kind of features
event.features <- vals$event.features
events <- vals$events
params <- list()
dependencies <- list()
# events could be segment events or AED events
dependencies[[events$name]] <- events$version
dependencies[[event.features$name]] <- event.features$version

# 240 clusters, each clustered into 12 clusters
# hierachy of 2 levels
num.clusters <- c(240,12)

res.1 <- DoClusterKmeans(event.features$data, num.clusters = num.clusters[1])
groups.df <- CreateEventGroups.kmeans(events$data, res.1)




}


SubCluster <- function (tdccs = NULL) {


clustered.events <- datatrack::ReadDataobject('clustered.events')
if (is.null(tdccs)) {
tdccs <- datatrack::ReadDataobject('TDCCs')
}

features <- tdccs$data[tdccs$data$event.id %in% clustered.events$data$event.id,]

if (!all(features$event.id == clustered.events$event.id)) {
# possibly need to sort
stop('event id mismatch')
}

# remove event id column from features.
# Event id can be found in corresponding row of clustered.events df
features <- features[,-(which(colnames(features) == 'event.id'))]

# 240 clusters, each clustered into 12 clusters
# hierachy of 2 levels
num.sub.clusters <- 12
clustered.events$data$sub.group <- NA

groups <- unique(clustered.events$data$X240)

for (g in groups) {

subset <- clustered.events$data$X240 == g

if (sum(subset) <= num.sub.clusters) {
clustered.events$data$sub.group[subset] = 1:sum(subset)
} else {
group.features <- features[subset,]

# this would be more efficient if we calculate 1 distance matrix and subset it as well,
# rather than calculating the distance matrix again for each sub clustering
res.1 <- DoClusterKmeans(group.features, num.clusters = num.sub.clusters)
clustered.events$data$sub.group[subset] <- res.1[[1]]$cluster

}




}

params <- list(num.sub.clusters = num.sub.clusters)
dependencies <- list(TDCCs = tdccs$version, clustered.events = clustered.events$version)

datatrack::WriteDataobject(clustered.events$data, 'sub.clustered.events', params = params, dependencies = dependencies)

return(clustered.events$data)




}


# bug: sometimes returns events and features with different number of rows!
GetEventsAndFeaturesTDCC <- function () {
# gets the events and features data frames from saved output
Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
{"datatrack.directory":"~/Documents/sample_selection_output", "as.object":["clustering", "clustering.HA","clustering.kmeans", "ranked.samples", "species.in.each.min", "optimal.samples","silence.model"]}
Q
{"datatrack.directory":"~/Documents/sample_selection_output", "as.object":["clustering", "clustering.HA","clustering.kmeans", "ranked.samples", "species.in.each.min", "optimal.samples","silence.model"]}
191 changes: 191 additions & 0 deletions AudioAnalysis/RCode/Phil/sampleselection/source/inspection.R
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,197 @@ InspectClusters <- function (cluster.groups = NA, duration.per.group = 30, max.g

}

InspectClusters.segment.nested <- function (clusters = NULL, max.clusters = 5) {
# for some given clusters, shows some segment events belonging to each of those clusters
#
# Args:
# clusters: vector of ints; which clusters
# num.segments: int; number of segments to show for each cluster
# max.groups: if clusters is not provided it will be set to either the number of clusters or max.groups (whichever is smaller)
#
# Details:
# Randomply chooses segments from the group 1 by 1
# It then generates spectrogram of the event plus the padding, and appends them to form a spectrogram of length duration.per.group
# each clustergroup will have its own row
#

events <- datatrack::ReadDataobject('filtered.segment.events') #

clustered.events <- datatrack::ReadDataobject('sub.clustered.events') # contains only group and event id and min id
clustering <- datatrack::ReadDataobject('clustering.kmeans')


# double check that the event ids match correctly
if (!all(clustered.events$data$event.id == events$data$event.id)) {
stop('event id of events and clustered events data don\'t match')
}

# get the different clusterings for different number of clusters
clusterings <- colnames(clustered.events$data)
clusterings <- clusterings[!clusterings %in% c('event.id', 'min.id', 'sub.group')]

# combine the event columns to the clustered event data frame
clustered.events.data <- cbind(clustered.events$data, events$data[,-(which(colnames(events$data) == 'event.id'))])


# clustered.events.data <- AttachSpeciesToEvents(clustered.events.data)
# only use events that have a species id
# clustered.events.data <- clustered.events.data[!is.na(clustered.events.data$species.id),]


if (length(clusterings) > 1) {
which.k <- GetUserChoice(clusterings, 'which clustering (which size k) for inspection')
} else {
which.k <- 1
}

distance.matrix <- as.matrix(dist(clustering$data[[which.k]]$centers))

group.col <- clusterings[which.k]

# this should really just be 1:num.clusters, but to be safe do it like this
all.groups <- unique(clustered.events.data[,group.col])
all.groups <- all.groups[order(all.groups)]

if (is.numeric(clusters)) {
# make sure that the cluster groups given as a param are actually real groups
groups.that.exist <- clusters %in% all.groups
groups.that.dont.exist <- clusters[!groups.that.exist]
if (length(groups.that.dont.exist) > 0) {
Report(1, 'Clusters specified by user that don\'t exist. Ignoring:', groups.that.dont.exist)
}
clusters <- clusters[groups.that.exist]
} else {
# no clusters specified as function param, so
clusters <- all.groups
}

# make sure max is not exceeded
if(length(clusters) > max.clusters) {
removed <- clusters[(max.clusters + 1):length(clusters)]
clusters <- clusters[1:max.clusters]
Report(3, 'Number of clusters to render was greater than max.groups ... Ignoring the following clusters:', paste(removed, collapse = ','))
}

selected.events <- clustered.events.data[clustered.events.data[,group.col] %in% clusters,]

selected.events$segment.duration <- 1
selected.events$dist.from.first <- NA
selected.events$dist.from.prev <- NA


homogeneity <- function () {

#unfinished

# for cluster quality evaluation
# https://aclweb.org/anthology/D/D07/D07-1043.pdf
C <- unique(selected.events$species.id)
K <- unique(selected.events[,group.col])
contingency.table <- table(C,K)

homogenity.group <- function (group, C, K, contingency.table) {
N <- sum(contingency.table)
#vector of the number of times each class appears in this group
Ack <- contingency.table[,as.character(group)]
num.in.this.group <- sum(Ack)
H <- (Ack / N) * log(Ack / num.in.this.group)
}


}


include.event = rep(TRUE, nrow(selected.events))

num.per.subgroup <- 3

# for each cluster,
# calculate homogeneity
# limit the number of segments shown
for (group in clusters) {

group.subset <- selected.events[,group.col] == group

sub.groups = unique(selected.events$sub.group[group.subset])

for (sub.group in sub.groups) {



subgroup.subset <- selected.events$sub.group[group.subset] == sub.group
num.in.subgroup <- sum(subgroup.subset)

# select some random segments from the subgroup
if (num.in.subgroup > num.per.subgroup) {
# set seed to save time while developing (don't generate as many spectrograms)
set.seed(42)
num.to.remove <- num.in.subgroup-num.per.subgroup
include.event[group.subset][subgroup.subset][sample(num.in.subgroup,num.to.remove)] <- FALSE
Report(4, 'Selecting', paste0(num.per.subgroup,'/',num.in.subgroup), 'for group', paste0(group, '.', sub.group))
}

}


# add information about the cluster centroid distance for this group from the previous group and from the first group
if (group > 1) {
selected.events$dist.from.prev[group.subset] <- round(distance.matrix[group-1,group], 2)
} else {
selected.events$dist.from.prev[group.subset] <- 0
}
selected.events$dist.from.first[group.subset] <- round(distance.matrix[1,group], 2)

}

selected.events <- selected.events[include.event,]


temp.dir <- TempDirectory()

spectro.list <- SaveSpectroImgsForInspection(selected.events, temp.dir, noise.reduction = noise.reduction)




col.names <- colnames(selected.events)
col.names[col.names == group.col] <- 'group'
colnames(selected.events) <- col.names

seg.time <- SetTime(selected.events$min, selected.events$start.sec)
seg.sec.of.day <- selected.events$min * 60 + selected.events$start.sec


selected.events$spectro.img.path <- spectro.list

selected.events$img.title <- paste(selected.events$event.id,
selected.events$site,
selected.events$date,
seg.time,
selected.events$min,
paste0('species.id:', selected.events$species.id), sep = ' | ')


selected.events$link <- BawLink(site = selected.events$site,
date = selected.events$date,
start.sec = seg.sec.of.day,
end.sec = seg.sec.of.day + 1,
margin = 2)


# sort by cluster so that the distance from previous makes sense
# secondary sorting by eventID for the order of segments in each row
selected.events <- selected.events[order(selected.events$group, selected.events$event.id),]


html.file <- file.path(temp.dir,paste0('inspect.segments.', format(Sys.time(), format="%y%m%d_%H%M%S"), '.html'))

templator::HtmlInspector(template.path = file.path('templates','segment.event.subcluster.inspector.html'), output.path = html.file, selected.events, list(title = "inspect segments"))

print(paste("output saved to ", temp.dir))


}


InspectClusters.segment <- function (clusters = NULL, num.segments = 5, max.clusters = 5, segment.duration = 1, noise.reduction = 0.5) {
Expand Down
2 changes: 1 addition & 1 deletion AudioAnalysis/RCode/Phil/sampleselection/source/paths.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# the paths are checked one by one to find the one that exists

paths <- list(
audio = c("/Volumes/passport/phil/SERF/serf_audio",
audio = c("/Volumes/passport/serf_audio",
"/Volumes/files/qut_data/SERF/serf_audio",
"D:/phil/SERF/serf_audio",
"~/Desktop/SERF/serf_audio"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ ClassifySegments <- function (segments = NULL, silence.features = NULL, use.save
# TODO: add audio path here

if (is.null(silence.features) && is.list(segments) && use.saved.features) {
silence.features <- datatrack::ReadDataobject(name = 'silence.features', dependencies = list('segment.events' = segments$version), false.if.missing = TRUE)
silence.features <- datatrack::ReadDataobject(name = 'silence.features', dependencies = list(segment.events = segments$version), false.if.missing = TRUE)
}

if (is.list(silence.features)) {
Expand Down Expand Up @@ -458,7 +458,7 @@ AudioPath <- function (fn = NULL, input.directory = NULL) {
}
}

CalculateSilenceFeatures <- function (seconds, wavecol = c('wav.file','wave.path'), parallel = 5) {
CalculateSilenceFeatures <- function (seconds, wavecol = c('wav.file','wave.path'), parallel = 2) {

#debugging
#seconds <- seconds[1:4000,]
Expand Down
Loading

0 comments on commit 6b7984e

Please sign in to comment.