#!/usr/bin/Rscript
# Genomik und Bioinformatik I
# Projekt 1 (Identify languages by text analysis)
# written by Andreas Loibl (enrollment no. 1524148)

source(file="readText.R")

# identifyGermanNewspaper:
###########################
#   distinguishes texts from the German newspapers FAZ and Bild by analyzing
#   the relative frequency of punctuation marks in the text. This algorithm
#   is based on the assumption that a text with a high count of "?" and "!"
#   is likely to be a "Bild" text, because Bild quite often uses sensational
#   expressions.
#   
#   So this function is able to decide if the given text is taken from Bild
#   or from FAZ (or generic "Text" if the data is not distinguishable enough)

identifyGermanNewspaper <- function(inputText) {
	# Classify German text as "Bild" or "FAZ" from its punctuation profile.
	#
	# inputText: character vector holding the text; several elements (e.g.
	#            one per line) are counted together.
	# returns:   "Bild" when "?" and "!" make up more than 10% of the counted
	#            punctuation, "FAZ" when they make up less than 5%, and "Text"
	#            when the share lies in between or when at most 5 punctuation
	#            marks were found in total (a message is printed in both
	#            "Text" cases).

	# Regular expressions for "!", "?", ". " (period followed by a space)
	# and ",".
	patterns <- c("!", "\\?", "\\. ", ",")

	# gregexpr() yields one position vector per element of inputText and marks
	# "no match" with -1, so only positive positions are real matches. Summing
	# them handles multi-element, empty and NA input alike; looking only at
	# the first position would miscount or fail in those cases.
	count <- vapply(patterns, function(pattern) {
		positions <- unlist(gregexpr(pattern, inputText))
		sum(positions > 0, na.rm = TRUE)
	}, numeric(1))

	total <- sum(count)
	if (total <= 5) {
		# Too few punctuation marks for the ratio to be meaningful.
		print("No or not enough processable data found.")
		return("Text")
	}

	# Share of "sensational" punctuation ("?" and "!") among all counted marks.
	sensationalShare <- (count[["\\?"]] + count[["!"]]) / total
	if (sensationalShare > 0.1) return("Bild")
	if (sensationalShare < 0.05) return("FAZ")
	print("Data is not distinguishable enough.")
	"Text"
}

# identifyLanguageOfFile:
##########################
#   distinguishes the language of a file by using text analysis on its content.
#   
#   the algorithm counts the occurrences of several character-combinations
#   (like "oc", "og", "ei", "ar", "se") and estimates what language the text
#   is written in by comparing the relative frequencies of these character-
#   combinations with fixed threshold values.

identifyLanguageOfFile <- function(file) {
	# Guess the language of a file's text from character-pair frequencies.
	#
	# file:    passed unchanged to readText(), whose result is analyzed.
	# returns: "Norwegian", "Swedish", or "German ( <newspaper> )" where
	#          <newspaper> is the result of identifyGermanNewspaper(); FALSE
	#          (after printing a message) when no rule applies or none of the
	#          character pairs occurs.
	#
	# NOTE(review): readText() comes from readText.R and is not visible here;
	# presumably it returns the file content as a character vector - confirm.
	inputText <- readText(file)

	# Count occurrences of each pattern over all elements of inputText.
	# gregexpr() reports "no match" as -1, so only positive positions count;
	# this also yields 0 for empty or NA text instead of failing, and does not
	# depend on whether the first element happens to contain a match.
	countMatches <- function(patterns) {
		vapply(patterns, function(pattern) {
			positions <- unlist(gregexpr(pattern, inputText))
			sum(positions > 0, na.rm = TRUE)
		}, numeric(1))
	}

	# First pass: relative shares of "og", "oc" and "ei".
	count <- countMatches(c("og", "oc", "ei"))
	total <- sum(count)
	if (total > 0) {
		if (count[["og"]] / total > 0.40) return("Norwegian")
		if (count[["oc"]] / total > 0.50) return("Swedish")
		if (count[["ei"]] / total > 0.45) {
			return(paste("German", "(", identifyGermanNewspaper(inputText), ")"))
		}
	}

	# Second pass, reached when the first one was inconclusive: "ar" vs. "se".
	count <- countMatches(c("ar", "se"))
	total <- sum(count)
	if (total > 0) {
		if (count[["ar"]] / total > 0.65) return("Swedish")
		if (count[["se"]] / total > 0.65) return("Norwegian")
		print("Data is not distinguishable enough.")
		return(FALSE)
	}
	print("No processable data found.")
	FALSE
}

# Classify each file named on the command line and print the result.
for (file in commandArgs(trailingOnly = TRUE)) {
	print(identifyLanguageOfFile(file))
}