#' @title Admixture analysis #' @param tietue a named record list #' @details If the record == -1, the mixture results file is loaded. Otherwise, #' will the required variables be retrieved from the record fields? #' `tietue`should contain the following elements: PARTITION, COUNTS, SUMCOUNTS, #' alleleCodes, adjprior, popnames, rowsFromInd, data, npops, noalle #' @param tietue tietue #' @param PARTITION PARTITION #' @param COUNTS COUNTS #' @param SUMCOUNTS SUMCOUNTS #' @importFrom methods is #' @export admix1 <- function(tietue, PARTITION = matrix(NA, 0, 0), COUNTS = matrix(NA, 0, 0), SUMCOUNTS = NA) { if (!is.list(tietue)) { message('Load mixture result file. These are the files in this directory:') print(list.files()) pathname_filename <- file.choose() if (!file.exists(pathname_filename)) { stop( "File ", pathname_filename, " does not exist. Check spelling and location." ) } else { cat('---------------------------------------------------\n'); message('Reading mixture result from: ', pathname_filename, '...') } Sys.sleep(0.0001) #ASK: what for? # ASK: what is this supposed to do? What do graphic obj have to do here? # h0 = findobj('Tag','filename1_text'); # set(h0,'String',filename); clear h0; struct_array <- load(pathname_filename) if (isfield(struct_array, 'c')) { #Matlab versio c <- struct_array$c if (!isfield(c, 'PARTITION') | !isfield(c,'rowsFromInd')) { stop('Incorrect file format') } } else if (isfield(struct_array, 'PARTITION')) { #Mideva versio c <- struct_array if (!isfield(c,'rowsFromInd')) stop('Incorrect file format') } else { stop('Incorrect file format') } if (isfield(c, 'gene_lengths') & strcmp(c$mixtureType, 'linear_mix') | strcmp(c$mixtureType, 'codon_mix')) { # if the mixture is from a linkage model # Redirect the call to the linkage admixture function. # call function noindex to remove the index column c$data <- noIndex(c$data, c$noalle) # linkage_admix(c) # ASK: translate this function to R or drop? # return stop("linkage_admix not implemented") } PARTITION <- c$PARTITION COUNTS <- c$COUNTS SUMCOUNTS <- c$SUMCOUNTS alleleCodes <- c$alleleCodes adjprior <- c$adjprior popnames <- c$popnames rowsFromInd <- c$rowsFromInd data <- c$data npops <- c$npops noalle <- c$noalle } else { PARTITION <- tietue$PARTITION COUNTS <- tietue$COUNTS SUMCOUNTS <- tietue$SUMCOUNTS alleleCodes <- tietue$alleleCodes adjprior <- tietue$adjprior popnames <- tietue$popnames rowsFromInd <- tietue$rowsFromInd data <- as.double(tietue$data) npops <- tietue$npops noalle <- tietue$noalle } answers <- inputdlg( prompt = paste( "Input the minimum size of a population that will", "be taken into account when admixture is estimated." ), definput = 5 ) alaRaja <- as.numeric(answers) npops <- poistaLiianPienet(npops, rowsFromInd, alaRaja) nloci <- size(COUNTS, 2) ninds <- size(data, 1) / rowsFromInd answers <- inputdlg('Input number of iterations', 50) if (isempty(answers)) return() iterationCount <- as.numeric(answers[1, 1]) # maybe [[]]? answers <- inputdlg( prompt = 'Input number of reference individuals from each population', definput = 50 ) if (isempty(answers)) { nrefIndsInPop <- 50 } else { nrefIndsInPop <- as.numeric(answers[1, 1]) } answers <- inputdlg( prompt = 'Input number of iterations for reference individuals', definput = 10 ) if (isempty(answers)) return() iterationCountRef <- as.numeric(answers[1, 1]) # First calculate log-likelihood ratio for all individuals: likelihood <- zeros(ninds, 1) allfreqs <- computeAllFreqs2(noalle) for (ind in 1:ninds) { omaFreqs <- computePersonalAllFreqs(ind, data, allfreqs, rowsFromInd) osuusTaulu <- zeros(1, npops) if (PARTITION[ind] == 0) { # Yksil?on outlier } else if (PARTITION[ind] != 0) { if (PARTITION[ind] > 0) { osuusTaulu[PARTITION[ind]] <- 1 } else { # Yksilöt, joita ei ole sijoitettu mihinkään koriin. arvot <- zeros(1, npops) for (q in 1:npops) { osuusTaulu <- zeros(1, npops) osuusTaulu[q] <- 1 arvot[q] <- computeIndLogml(omaFreqs, osuusTaulu) } iso_arvo <- max(arvot) isoimman_indeksi <- match(max(arvot), arvot) osuusTaulu <- zeros(1, npops) osuusTaulu[isoimman_indeksi] <- 1 PARTITION[ind] <- isoimman_indeksi } logml <- computeIndLogml(omaFreqs, osuusTaulu) logmlAlku <- logml for (osuus in c(0.5, 0.25, 0.05, 0.01)) { etsiResult <- etsiParas(osuus, osuusTaulu, omaFreqs, logml) osuusTaulu <- etsiResult[1] logml <- etsiResult[2] } logmlLoppu <- logml likelihood[ind] <- logmlLoppu - logmlAlku } } # Analyze further only individuals who have log-likelihood ratio larger than 3: to_investigate <- t(find(likelihood > 3)) cat('Possibly admixed individuals:\n') for (i in 1:length(to_investigate)) { cat(as.character(to_investigate[i])) } cat(' ') cat('Populations for possibly admixed individuals:\n') admix_populaatiot <- unique(PARTITION[to_investigate]) for (i in 1:length(admix_populaatiot)) { cat(as.character(admix_populaatiot[i])) } # THUS, there are two types of individuals, who will not be analyzed with # simulated allele frequencies: those who belonged to a mini-population # which was removed, and those who have log-likelihood ratio less than 3. # The value in the PARTITION for the first kind of individuals is 0. The # second kind of individuals can be identified, because they do not # belong to "to_investigate" array. When the results are presented, the # first kind of individuals are omitted completely, while the second kind # of individuals are completely put to the population, where they ended up # in the mixture analysis. These second type of individuals will have a # unit p-value. # Simulate allele frequencies a given number of times and save the average # result to "proportionsIt" array. proportionsIt <- zeros(ninds, npops) for (iterationNum in 1:iterationCount) { cat('Iter:', as.character(iterationNum)) allfreqs <- simulateAllFreqs(noalle) # Allele frequencies on this iteration. for (ind in to_investigate) { #disp(num2str(ind)); omaFreqs <- computePersonalAllFreqs( ind, data, allfreqs, rowsFromInd ) osuusTaulu <- zeros(1, npops) if (PARTITION[ind] == 0) { # Yksil?on outlier } else if (PARTITION[ind] != 0) { if (PARTITION[ind] > 0) { osuusTaulu[PARTITION[ind]] <- 1 } else { # Yksilöt, joita ei ole sijoitettu mihinkään koriin. arvot <- zeros(1, npops) for (q in 1:npops) { osuusTaulu <- zeros(1, npops) osuusTaulu[q] <- 1 arvot[q] <- computeIndLogml(omaFreqs, osuusTaulu) } iso_arvo <- max(arvot) isoimman_indeksi <- match(max(arvot), arvot) osuusTaulu <- zeros(1, npops) osuusTaulu[isoimman_indeksi] <- 1 PARTITION[ind] <- isoimman_indeksi } logml <- computeIndLogml(omaFreqs, osuusTaulu) for (osuus in c(0.5, 0.25, 0.05, 0.01)) { etsiResult <- etsiParas(osuus, osuusTaulu, omaFreqs, logml) osuusTaulu <- etsiResult[1] logml <- etsiResult[2] } } proportionsIt[ind, ] <- proportionsIt[ind, ] * (iterationNum - 1) + osuusTaulu proportionsIt[ind, ] <- proportionsIt[ind, ] / iterationNum } } #disp(['Creating ' num2str(nrefIndsInPop) ' reference individuals from ']); #disp('each population.'); #allfreqs = simulateAllFreqs(noalle); # Simuloidaan alleelifrekvenssisetti allfreqs <- computeAllFreqs2(noalle); # Koitetaan tällaista. # Initialize the data structures, which are required in taking the missing # data into account: n_missing_levels <- zeros(npops, 1) # number of different levels of "missingness" in each pop (max 3). missing_levels <- zeros(npops, 3) # the mean values for different levels. missing_level_partition <- zeros(ninds, 1) # level of each individual (one of the levels of its population). for (i in 1:npops) { inds <- find(PARTITION == i) # Proportions of non-missing data for the individuals: non_missing_data <- zeros(length(inds), 1) for (j in 1:length(inds)) { ind <- inds[j] non_missing_data[j] <- length( find(data[(ind - 1) * rowsFromInd + 1:ind * rowsFromInd, ] > 0) ) / (rowsFromInd * nloci) } if (all(non_missing_data > 0.9)) { n_missing_levels[i] <- 1 missing_levels[i, 1] <- mean(non_missing_data) missing_level_partition[inds] <- 1 } else { # TODO: fix syntax # [ordered, ordering] = sort(non_missing_data); ordered <- ordering <- sort(non_missing_data) #part = learn_simple_partition(ordered, 0.05); part <- learn_partition_modified(ordered) aux <- sortrows(cbind(part, ordering), 2) part = aux[, 1] missing_level_partition[inds]<- part n_levels <- length(unique(part)) n_missing_levels[i] <- n_levels for (j in 1:n_levels) { missing_levels[i, j] <- mean(non_missing_data[find(part == j)]) } } } # Create and analyse reference individuals for populations # with potentially admixed individuals: refTaulu <- zeros(npops, 100, 3) for (pop in t(admix_populaatiot)) { for (level in 1:n_missing_levels[pop]) { potential_inds_in_this_pop_and_level <- find( PARTITION == pop & missing_level_partition == level & likelihood > 3 ) # Potential admix individuals here. if (!isempty(potential_inds_in_this_pop_and_level)) { #refData = simulateIndividuals(nrefIndsInPop,rowsFromInd,allfreqs); refData <- simulateIndividuals( nrefIndsInPop, rowsFromInd, allfreqs, pop, missing_levels[pop, level] ) cat( 'Analysing the reference individuals from pop', pop, '(level', level, ').' ) refProportions <- zeros(nrefIndsInPop, npops) for (iter in 1:iterationCountRef) { #disp(['Iter: ' num2str(iter)]); allfreqs <- simulateAllFreqs(noalle) for (ind in 1:nrefIndsInPop) { omaFreqs <- computePersonalAllFreqs( ind, refData, allfreqs, rowsFromInd ) osuusTaulu <- zeros(1, npops) osuusTaulu[pop] <- 1 logml <- computeIndLogml(omaFreqs, osuusTaulu) for (osuus in c(0.5, 0.25, 0.05, 0.01)) { etsiResult <- etsiParas( osuus, osuusTaulu, omaFreqs, logml ) osuusTaulu <- etsiResult[1] logml <- etsiResult[2] } refProportions[ind, ] <- refProportions[ind, ] * (iter - 1) + osuusTaulu refProportions[ind, ] <- refProportions[ind, ] / iter } } for (ind in 1:nrefIndsInPop) { omanOsuus <- refProportions[ind, pop] if (round(omanOsuus * 100) == 0) { omanOsuus <- 0.01 } if (abs(omanOsuus) < 1e-5) { omanOsuus <- 0.01 } refTaulu[pop, round(omanOsuus*100), level] <- refTaulu[pop, round(omanOsuus*100),level] + 1 } } } } # Rounding of the results: proportionsIt <- proportionsIt * 100 proportionsIt <- round(proportionsIt) proportionsIt <- proportionsIt / 100 for (ind in 1:ninds) { if (!any(to_investigate == ind)) { if (PARTITION[ind] > 0) { proportionsIt[ind, PARTITION[ind]] <- 1 } } else { # In case of a rounding error, the sum is made equal to unity by # fixing the largest value. if ((PARTITION[ind] > 0) & (sum(proportionsIt[ind, ]) != 1)) { isoin <- max(proportionsIt[ind, ]) indeksi <- match(isoin, max(proportionsIt[ind, ])) erotus <- sum(proportionsIt[ind, ]) - 1 proportionsIt[ind, indeksi] <- isoin - erotus } } } # Calculate p-value for each individual: uskottavuus <- zeros(ninds, 1) for (ind in 1:ninds) { pop <- PARTITION[ind] if (pop == 0) { # Individual is outlier uskottavuus[ind] <- 1 } else if (isempty(find(to_investigate == ind))) { # Individual had log-likelihood ratio<3 uskottavuus[ind] <- 1 } else { omanOsuus <- proportionsIt[ind, pop] if (abs(omanOsuus) < 1e-5) { omanOsuus <- 0.01 } if (round(omanOsuus*100)==0) { omanOsuus <- 0.01 } level <- missing_level_partition[ind] refPienempia <- sum(refTaulu[pop, 1:round(100*omanOsuus), level]) uskottavuus[ind] <- refPienempia / nrefIndsInPop } } # ASK: Remove? are these plotting functions? tulostaAdmixtureTiedot(proportionsIt, uskottavuus, alaRaja, iterationCount) viewPartition(proportionsIt, popnames) talle = inputdlg('Do you want to save the admixture results? [Y/n]', 'y') if (talle %in% c('y', 'Y', 'yes', 'Yes')) { #waitALittle; filename <- inputdlg( 'Save results as (file name):', 'admixture_results.rda' ) if (filename == 0) { # Cancel was pressed return() } else { # copy 'baps4_output.baps' into the text file with the same name. if (file.exists('baps4_output.baps')) { file.copy('baps4_output.baps', paste0(filename, '.txt')) file.remove('baps4_output.baps') } } if (!is(tietue, "list")) { c$proportionsIt <- proportionsIt c$pvalue <- uskottavuus # Added by Jing c$mixtureType <- 'admix' # Jing c$admixnpops <- npops; save(c, file=filename) } else { tietue$proportionsIt <- proportionsIt tietue$pvalue <- uskottavuus; # Added by Jing tietue$mixtureType <- 'admix' tietue$admixnpops <- npops save(tietue, file=filename) } } }