#!/usr/bin/Rscript
# Genomics and Bioinformatics I
# Project 1 (Identify languages by text analysis)
# written by Andreas Loibl (enrollment no. 1524148)

source(file="readText.R")

# identifyGermanNewspaper:
###########################
# distinguishes texts from the German newspapers FAZ and Bild by analyzing
# the relative frequency of punctuation marks in the text. This algorithm
# is based on the assumption that a text with a high count of "?" and "!" is
# likely to be a "Bild"-text, because Bild quite often uses sensational
# expressions.
#
# So this function is able to decide if the given text is taken from Bild
# or from FAZ (or generic "Text" if the data is not distinguishable enough)
#
# Arguments:
#   inputText  character vector holding the text; it may consist of one
#              string or of several elements (e.g. one per line)
#
# Returns:
#   "Bild" if "?" and "!" make up more than 10% of the counted marks,
#   "FAZ"  if they make up less than 5%,
#   "Text" otherwise, or if five or fewer marks were found at all.
#   In both "Text" cases a diagnostic line is printed first.
identifyGermanNewspaper <- function(inputText) {
  # Regular expressions of the punctuation marks that are counted.
  patterns <- c("!", "\\?", "\\. ", ",")

  # gregexpr() returns one position vector per element of inputText and marks
  # "no match" with -1, so only positive positions are real matches. Summing
  # them over all elements keeps the count right for multi-element input and
  # gives 0 (instead of an error in `if`) for empty or NA input.
  count <- vapply(
    patterns,
    function(pattern) {
      sum(unlist(gregexpr(pattern, inputText)) > 0, na.rm = TRUE)
    },
    integer(1)
  )
  total <- sum(count)

  # Too few punctuation marks make the ratio below meaningless.
  if (total <= 5) {
    print("No or not enough processable data found.")
    return("Text")
  }

  # Share of question and exclamation marks among all counted marks.
  exclamatoryShare <- (count[["\\?"]] + count[["!"]]) / total
  if (exclamatoryShare > 0.1) {
    return("Bild")
  }
  if (exclamatoryShare < 0.05) {
    return("FAZ")
  }

  print("Data is not distinguishable enough.")
  return("Text")
}

# identifyLanguageOfFile:
##########################
# distinguishes the language of a file by using text analysis on its content.
#
# the algorithm counts the occurrences of several character-combinations
# (like "oc", "og", "ei", "ar", "se") and estimates what language the text
# is written in by comparing the relative frequencies of these character-
# combinations with threshold values.
#
# Usage: identifyLanguageOfFile(file)
#   file     path of the text file; it is passed unchanged to readText()
#            (defined in readText.R, presumably returning a character vector)
#   returns  "Norwegian", "Swedish", or "German ( <newspaper> )" where
#            <newspaper> is the result of identifyGermanNewspaper();
#            FALSE (after printing a diagnostic line) if no decision is
#            possible
identifyLanguageOfFile <- function(file) {
  inputText <- readText(file)

  # Counts, for each regular expression in `patterns`, its matches in
  # inputText. gregexpr() returns one position vector per element of the text
  # and marks "no match" with -1, so only positive positions are counted; this
  # stays correct for multi-element text and gives 0 for empty or NA text.
  countMatches <- function(patterns) {
    vapply(
      patterns,
      function(pattern) {
        sum(unlist(gregexpr(pattern, inputText)) > 0, na.rm = TRUE)
      },
      integer(1)
    )
  }

  # First stage: each of these combinations hints at one language.
  count <- countMatches(c("og", "oc", "ei"))
  total <- sum(count)
  if (total > 0) {
    if (count[["og"]] / total > 0.40) {
      return("Norwegian")
    }
    if (count[["oc"]] / total > 0.50) {
      return("Swedish")
    }
    if (count[["ei"]] / total > 0.45) {
      return(paste("German", "(", identifyGermanNewspaper(inputText), ")"))
    }
  }

  # Second stage: reached when the first one found nothing or stayed below
  # every threshold; it only separates Swedish from Norwegian.
  count <- countMatches(c("ar", "se"))
  total <- sum(count)
  if (total > 0) {
    if (count[["ar"]] / total > 0.65) {
      return("Swedish")
    }
    if (count[["se"]] / total > 0.65) {
      return("Norwegian")
    }
    print("Data is not distinguishable enough.")
    return(FALSE)
  }

  print("No processable data found.")
  return(FALSE)
}

# Command-line entry point: classify every file given as an argument.
for (file in commandArgs(TRUE)) {
  print(identifyLanguageOfFile(file))
}