From b5d99903d212a93aa5473448b92cc36bb3c19311 Mon Sep 17 00:00:00 2001 From: Waldir Leoncio Date: Mon, 16 Dec 2019 16:47:21 +0100 Subject: [PATCH] Added source Matlab code for reference --- .Rbuildignore | 3 +- ...coordinates for the spatial clustering.txt | 3 + ...wise clustering of diploid individuals.txt | 10 + ...wise clustering of haploid individuals.txt | 5 + ...e clustering of tetraploid individuals.txt | 20 + ...se clustering of haploid individuals.fasta | 20 + .../group names.txt | 3 + .../group partition for FASTA-format.txt | 5 + ...or the spatial clustering (not unique).txt | 5 + ...coordinates for the spatial clustering.txt | 5 + ... for clustering of diploid individuals.txt | 10 + ... for clustering of haploid individuals.txt | 5 + ...r clustering of tetraploid individuals.txt | 20 + ...or clustering of haploid individuals.fasta | 20 + .../individual indices in population.txt | 5 + .../population names.txt | 3 + matlab/add_BAPS_to_path.m | 41 + matlab/admixture/admix1.m | 846 ++++++ matlab/admixture/admix2.m | 622 ++++ matlab/admixture/calcGeneLengths.m | 6 + matlab/admixture/learn_simple_partition.m | 70 + matlab/admixture/linkage_admix.m | 1034 +++++++ matlab/compileBaps6.m | 6 + matlab/general/baps4cbf.m | 993 +++++++ matlab/general/baps6.m | 377 +++ matlab/general/distance.m | 46 + matlab/general/fixKWarning.m | 14 + matlab/general/goToFixedK.m | 28 + matlab/general/goToPartitionCompare.m | 138 + matlab/general/iconn.m | 18 + matlab/general/imageCbf.m | 67 + matlab/general/image_figure.m | 38 + matlab/general/kstest2.m | 178 ++ matlab/general/logml2String.m | 67 + matlab/general/noIndex.m | 11 + matlab/general/ownNum2Str.m | 35 + matlab/general/setWindowOnTop.m | 124 + matlab/general/seticon.m | 59 + matlab/general/waitALittle.m | 3 + matlab/general/wgetname.m | 21 + matlab/general/winontop.mexw32 | Bin 0 -> 6144 bytes matlab/general/zscore.m | 51 + matlab/graph/@phyTree/Display.m | 26 + matlab/graph/@phyTree/GET.m | 89 + matlab/graph/@phyTree/Plot.m | 629 ++++ matlab/graph/@phyTree/Prune.m | 197 ++ matlab/graph/@phyTree/SET.m | 21 + matlab/graph/@phyTree/Select.m | 313 ++ matlab/graph/@phyTree/Subtree.m | 91 + matlab/graph/@phyTree/View.m | 1618 ++++++++++ matlab/graph/@phyTree/Weights.m | 54 + matlab/graph/@phyTree/getByName.m | 103 + matlab/graph/@phyTree/getCanonical.m | 65 + matlab/graph/@phyTree/getMatrix.m | 62 + matlab/graph/@phyTree/getNewickStr.m | 133 + matlab/graph/@phyTree/openVar.m | 12 + matlab/graph/@phyTree/pDist.m | 153 + matlab/graph/@phyTree/phyTree.m | 235 ++ matlab/graph/@phyTree/private/Opttf.m | 31 + matlab/graph/@phyTree/private/prettyOrder.m | 39 + matlab/graph/@phyTree/reRoot.m | 206 ++ matlab/graph/allfreqsnew2.m | 90 + matlab/graph/chooseDistance.m | 242 ++ matlab/graph/double2labels.m | 15 + matlab/graph/giveColors.m | 14 + matlab/graph/graph2dot.m | 139 + matlab/graph/graphvis2.m | 108 + matlab/graph/ksdensity_myown.m | 623 ++++ matlab/graph/linkage.m | 117 + matlab/graph/mutationPlot.m | 330 ++ matlab/graph/phyTreeRead.m | 197 ++ matlab/graph/plotPhytree.m | 687 +++++ matlab/graph/plotVarmuus.m | 102 + matlab/graph/plotflow.m | 282 ++ matlab/graph/plotmodel.m | 342 +++ matlab/graph/population_names_figure.m | 100 + matlab/graph/seqNeighJoin.m | 250 ++ matlab/graph/seqlinkage.m | 123 + matlab/graph/showColors.m | 16 + matlab/graph/showColors2.m | 16 + matlab/graph/statgetargs.m | 83 + matlab/graph/viewDendrogram.m | 56 + matlab/graph/viewMixPartition.m | 88 + matlab/graph/viewPartition.m | 97 + matlab/graph/viewPartition2.m | 103 + matlab/graph/viewPartition3.m | 204 ++ matlab/graph/viewPartition4.m | 108 + matlab/graph/viewPhylogeny.m | 96 + matlab/graph/viewPopMixPartition.m | 92 + matlab/graph/viewUnrooted.m | 61 + matlab/graph/view_admixture.m | 254 ++ matlab/graph/view_energy.m | 488 +++ matlab/graph/view_geneflow.m | 183 ++ matlab/graph/view_loglikelihood.m | 476 +++ matlab/graph/vorPlot.m | 88 + matlab/graph/waitALittle.m | 3 + matlab/graph/winontop.m | 8 + matlab/independent/greedyMix.m | 1788 +++++++++++ matlab/independent/greedyPopMix.m | 1685 +++++++++++ matlab/independent/indMix.m | 1234 ++++++++ matlab/independent/indMix_fixK.m | 1337 +++++++++ matlab/independent/myxlsread.m | 49 + matlab/independent/myxlswrite.m | 19 + matlab/independent/preprocessXLS.m | 177 ++ matlab/independent/processxls.m | 81 + matlab/independent/semiReadScript.m | 128 + matlab/independent/semi_linkageMix.m | 1470 +++++++++ .../independent/semi_linkageMixture_speed.m | 651 ++++ matlab/independent/tmpscript.txt | 5 + matlab/independent/trainedMix.m | 2293 ++++++++++++++ matlab/linkage/CSEFlagDialog.m | 404 +++ matlab/linkage/CSEFlagDialog1.m | 404 +++ matlab/linkage/allfreqsnew.m | 63 + matlab/linkage/allfreqsnew2.m | 89 + matlab/linkage/allfreqsnew3.m | 63 + matlab/linkage/allfreqsnew4.m | 90 + matlab/linkage/askSeq.m | 26 + matlab/linkage/choosebox.m | 353 +++ matlab/linkage/dendrogram_alpha.m | 379 +++ matlab/linkage/encodealn.m | 31 + matlab/linkage/i_encode_n.m | 61 + matlab/linkage/linkageMix.m | 1277 ++++++++ matlab/linkage/linkageMix_fixK.m | 1359 +++++++++ matlab/linkage/linkageMixture_speed.m | 2130 +++++++++++++ matlab/linkage/makecomplete.m | 70 + matlab/linkage/myCell2mat.m | 120 + matlab/linkage/processprofile.m | 35 + matlab/linkage/processxls.m | 77 + matlab/linkage/readbaps.m | 188 ++ matlab/linkage/readfasta.m | 171 ++ matlab/linkage/selectDataType.m | 37 + matlab/linkage/selectGene.m | 25 + matlab/linkage/selectSeqType.m | 47 + matlab/linkage/seqcode.m | 20 + matlab/linkage/silentReadBaps.m | 193 ++ matlab/linkage/sumCell.m | 10 + matlab/linkage/transform2.m | 240 ++ matlab/linkage/transform4.m | 235 ++ matlab/linkage/transform5.m | 265 ++ matlab/linkage/xls2structure.m | 19 + matlab/parallel/admix_parallel.m | 776 +++++ matlab/parallel/compare.m | 81 + matlab/parallel/compare_admix.m | 142 + matlab/parallel/dispLine.m | 3 + matlab/parallel/greedyPopMix_parallel.m | 1622 ++++++++++ matlab/parallel/independent_parallel.m | 1674 +++++++++++ matlab/parallel/initPopNames.m | 35 + matlab/parallel/linkageMixture_speed.m | 2138 +++++++++++++ matlab/parallel/linkage_admix_parallel.m | 1030 +++++++ matlab/parallel/linkage_parallel.m | 1633 ++++++++++ matlab/parallel/parallel.m | 297 ++ matlab/parallel/proportion2str.m | 18 + matlab/parallel/readScript.m | 66 + matlab/parallel/sc.txt | 6 + matlab/parallel/sc2.txt | 8 + matlab/parallel/sort_partition.m | 17 + matlab/parallel/spatialPopMixture_parallel.m | 2275 ++++++++++++++ matlab/parallel/spatial_parallel.m | 2175 ++++++++++++++ matlab/parallel/tulostaAdmixtureTiedot.m | 63 + matlab/parallel/uipickfiles.m | 803 +++++ matlab/spatial/addPoints.m | 86 + matlab/spatial/findCliques.m | 509 ++++ matlab/spatial/handleIndiFastaCase.m | 307 ++ matlab/spatial/handlePopFastaCase.m | 302 ++ matlab/spatial/initSpatialMixture.m | 718 +++++ matlab/spatial/initSpatialMultiMixture.m | 18 + matlab/spatial/private/calcLogmlChanges.m | 82 + matlab/spatial/private/clearGlobalVars.m | 18 + matlab/spatial/private/cluster_own.m | 52 + matlab/spatial/private/computeCounts.m | 27 + .../spatial/private/computeDiffInCliqCounts.m | 19 + matlab/spatial/private/computeDiffInCounts.m | 7 + matlab/spatial/private/computeTotalLogml.m | 42 + matlab/spatial/private/initialCounts2.m | 17 + .../spatial/private/model_search_parallel.m | 366 +++ .../spatial/private/model_search_pregroup.m | 385 +++ matlab/spatial/private/preprocAln.m | 97 + matlab/spatial/private/rmEmptyPopulation.m | 41 + .../spatial/private/updateGlobalVariables.m | 61 + matlab/spatial/private/updateLogmlTable.m | 37 + .../private/update_difference_tables.m | 92 + .../spatial/private/update_join_difference.m | 83 + matlab/spatial/spatialMix.m | 1512 ++++++++++ matlab/spatial/spatialMix_fixK.m | 1621 ++++++++++ matlab/spatial/spatialMixture.m | 2609 ++++++++++++++++ matlab/spatial/spatialPopMixture.m | 2645 +++++++++++++++++ 186 files changed, 61405 insertions(+), 1 deletion(-) create mode 100644 matlab/ExampleData/spatial clustering of groups/Example coordinates for the spatial clustering.txt create mode 100644 matlab/ExampleData/spatial clustering of groups/Example data in BAPS format for group-wise clustering of diploid individuals.txt create mode 100644 matlab/ExampleData/spatial clustering of groups/Example data in BAPS format for group-wise clustering of haploid individuals.txt create mode 100644 matlab/ExampleData/spatial clustering of groups/Example data in BAPS format for group-wise clustering of tetraploid individuals.txt create mode 100644 matlab/ExampleData/spatial clustering of groups/Example data in FASTA format for group-wise clustering of haploid individuals.fasta create mode 100644 matlab/ExampleData/spatial clustering of groups/group names.txt create mode 100644 matlab/ExampleData/spatial clustering of groups/group partition for FASTA-format.txt create mode 100644 matlab/ExampleData/spatial clustering of individuals/Example coordinates for the spatial clustering (not unique).txt create mode 100644 matlab/ExampleData/spatial clustering of individuals/Example coordinates for the spatial clustering.txt create mode 100644 matlab/ExampleData/spatial clustering of individuals/Example data in BAPS format for clustering of diploid individuals.txt create mode 100644 matlab/ExampleData/spatial clustering of individuals/Example data in BAPS format for clustering of haploid individuals.txt create mode 100644 matlab/ExampleData/spatial clustering of individuals/Example data in BAPS format for clustering of tetraploid individuals.txt create mode 100644 matlab/ExampleData/spatial clustering of individuals/Example data in FASTA format for clustering of haploid individuals.fasta create mode 100644 matlab/ExampleData/spatial clustering of individuals/individual indices in population.txt create mode 100644 matlab/ExampleData/spatial clustering of individuals/population names.txt create mode 100644 matlab/add_BAPS_to_path.m create mode 100644 matlab/admixture/admix1.m create mode 100644 matlab/admixture/admix2.m create mode 100644 matlab/admixture/calcGeneLengths.m create mode 100644 matlab/admixture/learn_simple_partition.m create mode 100644 matlab/admixture/linkage_admix.m create mode 100644 matlab/compileBaps6.m create mode 100644 matlab/general/baps4cbf.m create mode 100644 matlab/general/baps6.m create mode 100644 matlab/general/distance.m create mode 100644 matlab/general/fixKWarning.m create mode 100644 matlab/general/goToFixedK.m create mode 100644 matlab/general/goToPartitionCompare.m create mode 100644 matlab/general/iconn.m create mode 100644 matlab/general/imageCbf.m create mode 100644 matlab/general/image_figure.m create mode 100644 matlab/general/kstest2.m create mode 100644 matlab/general/logml2String.m create mode 100644 matlab/general/noIndex.m create mode 100644 matlab/general/ownNum2Str.m create mode 100644 matlab/general/setWindowOnTop.m create mode 100644 matlab/general/seticon.m create mode 100644 matlab/general/waitALittle.m create mode 100644 matlab/general/wgetname.m create mode 100644 matlab/general/winontop.mexw32 create mode 100644 matlab/general/zscore.m create mode 100644 matlab/graph/@phyTree/Display.m create mode 100644 matlab/graph/@phyTree/GET.m create mode 100644 matlab/graph/@phyTree/Plot.m create mode 100644 matlab/graph/@phyTree/Prune.m create mode 100644 matlab/graph/@phyTree/SET.m create mode 100644 matlab/graph/@phyTree/Select.m create mode 100644 matlab/graph/@phyTree/Subtree.m create mode 100644 matlab/graph/@phyTree/View.m create mode 100644 matlab/graph/@phyTree/Weights.m create mode 100644 matlab/graph/@phyTree/getByName.m create mode 100644 matlab/graph/@phyTree/getCanonical.m create mode 100644 matlab/graph/@phyTree/getMatrix.m create mode 100644 matlab/graph/@phyTree/getNewickStr.m create mode 100644 matlab/graph/@phyTree/openVar.m create mode 100644 matlab/graph/@phyTree/pDist.m create mode 100644 matlab/graph/@phyTree/phyTree.m create mode 100644 matlab/graph/@phyTree/private/Opttf.m create mode 100644 matlab/graph/@phyTree/private/prettyOrder.m create mode 100644 matlab/graph/@phyTree/reRoot.m create mode 100644 matlab/graph/allfreqsnew2.m create mode 100644 matlab/graph/chooseDistance.m create mode 100644 matlab/graph/double2labels.m create mode 100644 matlab/graph/giveColors.m create mode 100644 matlab/graph/graph2dot.m create mode 100644 matlab/graph/graphvis2.m create mode 100644 matlab/graph/ksdensity_myown.m create mode 100644 matlab/graph/linkage.m create mode 100644 matlab/graph/mutationPlot.m create mode 100644 matlab/graph/phyTreeRead.m create mode 100644 matlab/graph/plotPhytree.m create mode 100644 matlab/graph/plotVarmuus.m create mode 100644 matlab/graph/plotflow.m create mode 100644 matlab/graph/plotmodel.m create mode 100644 matlab/graph/population_names_figure.m create mode 100644 matlab/graph/seqNeighJoin.m create mode 100644 matlab/graph/seqlinkage.m create mode 100644 matlab/graph/showColors.m create mode 100644 matlab/graph/showColors2.m create mode 100644 matlab/graph/statgetargs.m create mode 100644 matlab/graph/viewDendrogram.m create mode 100644 matlab/graph/viewMixPartition.m create mode 100644 matlab/graph/viewPartition.m create mode 100644 matlab/graph/viewPartition2.m create mode 100644 matlab/graph/viewPartition3.m create mode 100644 matlab/graph/viewPartition4.m create mode 100644 matlab/graph/viewPhylogeny.m create mode 100644 matlab/graph/viewPopMixPartition.m create mode 100644 matlab/graph/viewUnrooted.m create mode 100644 matlab/graph/view_admixture.m create mode 100644 matlab/graph/view_energy.m create mode 100644 matlab/graph/view_geneflow.m create mode 100644 matlab/graph/view_loglikelihood.m create mode 100644 matlab/graph/vorPlot.m create mode 100644 matlab/graph/waitALittle.m create mode 100644 matlab/graph/winontop.m create mode 100644 matlab/independent/greedyMix.m create mode 100644 matlab/independent/greedyPopMix.m create mode 100644 matlab/independent/indMix.m create mode 100644 matlab/independent/indMix_fixK.m create mode 100644 matlab/independent/myxlsread.m create mode 100644 matlab/independent/myxlswrite.m create mode 100644 matlab/independent/preprocessXLS.m create mode 100644 matlab/independent/processxls.m create mode 100644 matlab/independent/semiReadScript.m create mode 100644 matlab/independent/semi_linkageMix.m create mode 100644 matlab/independent/semi_linkageMixture_speed.m create mode 100644 matlab/independent/tmpscript.txt create mode 100644 matlab/independent/trainedMix.m create mode 100644 matlab/linkage/CSEFlagDialog.m create mode 100644 matlab/linkage/CSEFlagDialog1.m create mode 100644 matlab/linkage/allfreqsnew.m create mode 100644 matlab/linkage/allfreqsnew2.m create mode 100644 matlab/linkage/allfreqsnew3.m create mode 100644 matlab/linkage/allfreqsnew4.m create mode 100644 matlab/linkage/askSeq.m create mode 100644 matlab/linkage/choosebox.m create mode 100644 matlab/linkage/dendrogram_alpha.m create mode 100644 matlab/linkage/encodealn.m create mode 100644 matlab/linkage/i_encode_n.m create mode 100644 matlab/linkage/linkageMix.m create mode 100644 matlab/linkage/linkageMix_fixK.m create mode 100644 matlab/linkage/linkageMixture_speed.m create mode 100644 matlab/linkage/makecomplete.m create mode 100644 matlab/linkage/myCell2mat.m create mode 100644 matlab/linkage/processprofile.m create mode 100644 matlab/linkage/processxls.m create mode 100644 matlab/linkage/readbaps.m create mode 100644 matlab/linkage/readfasta.m create mode 100644 matlab/linkage/selectDataType.m create mode 100644 matlab/linkage/selectGene.m create mode 100644 matlab/linkage/selectSeqType.m create mode 100644 matlab/linkage/seqcode.m create mode 100644 matlab/linkage/silentReadBaps.m create mode 100644 matlab/linkage/sumCell.m create mode 100644 matlab/linkage/transform2.m create mode 100644 matlab/linkage/transform4.m create mode 100644 matlab/linkage/transform5.m create mode 100644 matlab/linkage/xls2structure.m create mode 100644 matlab/parallel/admix_parallel.m create mode 100644 matlab/parallel/compare.m create mode 100644 matlab/parallel/compare_admix.m create mode 100644 matlab/parallel/dispLine.m create mode 100644 matlab/parallel/greedyPopMix_parallel.m create mode 100644 matlab/parallel/independent_parallel.m create mode 100644 matlab/parallel/initPopNames.m create mode 100644 matlab/parallel/linkageMixture_speed.m create mode 100644 matlab/parallel/linkage_admix_parallel.m create mode 100644 matlab/parallel/linkage_parallel.m create mode 100644 matlab/parallel/parallel.m create mode 100644 matlab/parallel/proportion2str.m create mode 100644 matlab/parallel/readScript.m create mode 100644 matlab/parallel/sc.txt create mode 100644 matlab/parallel/sc2.txt create mode 100644 matlab/parallel/sort_partition.m create mode 100644 matlab/parallel/spatialPopMixture_parallel.m create mode 100644 matlab/parallel/spatial_parallel.m create mode 100644 matlab/parallel/tulostaAdmixtureTiedot.m create mode 100644 matlab/parallel/uipickfiles.m create mode 100644 matlab/spatial/addPoints.m create mode 100644 matlab/spatial/findCliques.m create mode 100644 matlab/spatial/handleIndiFastaCase.m create mode 100644 matlab/spatial/handlePopFastaCase.m create mode 100644 matlab/spatial/initSpatialMixture.m create mode 100644 matlab/spatial/initSpatialMultiMixture.m create mode 100644 matlab/spatial/private/calcLogmlChanges.m create mode 100644 matlab/spatial/private/clearGlobalVars.m create mode 100644 matlab/spatial/private/cluster_own.m create mode 100644 matlab/spatial/private/computeCounts.m create mode 100644 matlab/spatial/private/computeDiffInCliqCounts.m create mode 100644 matlab/spatial/private/computeDiffInCounts.m create mode 100644 matlab/spatial/private/computeTotalLogml.m create mode 100644 matlab/spatial/private/initialCounts2.m create mode 100644 matlab/spatial/private/model_search_parallel.m create mode 100644 matlab/spatial/private/model_search_pregroup.m create mode 100644 matlab/spatial/private/preprocAln.m create mode 100644 matlab/spatial/private/rmEmptyPopulation.m create mode 100644 matlab/spatial/private/updateGlobalVariables.m create mode 100644 matlab/spatial/private/updateLogmlTable.m create mode 100644 matlab/spatial/private/update_difference_tables.m create mode 100644 matlab/spatial/private/update_join_difference.m create mode 100644 matlab/spatial/spatialMix.m create mode 100644 matlab/spatial/spatialMix_fixK.m create mode 100644 matlab/spatial/spatialMixture.m create mode 100644 matlab/spatial/spatialPopMixture.m diff --git a/.Rbuildignore b/.Rbuildignore index bcae897..2650fe0 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,2 +1,3 @@ LICENSE -TODO.md \ No newline at end of file +TODO.md +matlab \ No newline at end of file diff --git a/matlab/ExampleData/spatial clustering of groups/Example coordinates for the spatial clustering.txt b/matlab/ExampleData/spatial clustering of groups/Example coordinates for the spatial clustering.txt new file mode 100644 index 0000000..154e33b --- /dev/null +++ b/matlab/ExampleData/spatial clustering of groups/Example coordinates for the spatial clustering.txt @@ -0,0 +1,3 @@ + 3.1013610e+006 6.7104850e+006 + 3.1016310e+006 6.7101990e+006 + 3.1015790e+006 6.7101790e+006 diff --git a/matlab/ExampleData/spatial clustering of groups/Example data in BAPS format for group-wise clustering of diploid individuals.txt b/matlab/ExampleData/spatial clustering of groups/Example data in BAPS format for group-wise clustering of diploid individuals.txt new file mode 100644 index 0000000..57cff17 --- /dev/null +++ b/matlab/ExampleData/spatial clustering of groups/Example data in BAPS format for group-wise clustering of diploid individuals.txt @@ -0,0 +1,10 @@ +-9 102 56 80 100 90 118 90 88 104 1 +-9 102 54 82 102 92 116 90 86 104 1 +88 104 56 84 102 -9 120 90 88 100 1 +86 102 56 80 102 -9 116 90 86 100 1 +88 102 54 80 102 90 116 92 -9 100 2 +88 102 56 80 100 90 118 90 -9 104 2 +80 102 54 82 102 92 116 90 86 104 2 +88 104 56 84 102 92 120 90 88 100 2 +86 102 56 80 -9 90 116 90 86 100 3 +88 102 54 80 -9 90 116 92 86 100 3 diff --git a/matlab/ExampleData/spatial clustering of groups/Example data in BAPS format for group-wise clustering of haploid individuals.txt b/matlab/ExampleData/spatial clustering of groups/Example data in BAPS format for group-wise clustering of haploid individuals.txt new file mode 100644 index 0000000..afd5492 --- /dev/null +++ b/matlab/ExampleData/spatial clustering of groups/Example data in BAPS format for group-wise clustering of haploid individuals.txt @@ -0,0 +1,5 @@ +88 102 56 80 100 90 118 -9 88 104 1 +80 102 54 82 102 92 116 90 86 104 1 +88 104 56 84 102 -9 120 90 88 100 2 +86 102 56 80 102 90 116 90 86 100 2 +88 102 -9 80 102 90 116 92 86 100 3 diff --git a/matlab/ExampleData/spatial clustering of groups/Example data in BAPS format for group-wise clustering of tetraploid individuals.txt b/matlab/ExampleData/spatial clustering of groups/Example data in BAPS format for group-wise clustering of tetraploid individuals.txt new file mode 100644 index 0000000..3450819 --- /dev/null +++ b/matlab/ExampleData/spatial clustering of groups/Example data in BAPS format for group-wise clustering of tetraploid individuals.txt @@ -0,0 +1,20 @@ +88 102 56 80 100 90 118 90 88 104 1 +80 102 54 82 102 92 116 90 86 104 1 +88 102 56 80 100 90 118 90 88 104 1 +80 102 54 82 102 92 116 90 86 104 1 +88 -9 56 84 102 92 120 90 88 100 1 +86 -9 56 80 102 90 116 90 86 102 1 +88 -9 56 84 102 92 120 90 88 100 1 +86 -9 56 80 102 90 116 90 86 100 1 +88 102 54 82 102 90 116 92 86 102 2 +88 102 56 80 100 90 118 90 88 104 2 +88 102 54 80 102 90 116 92 86 102 2 +88 102 56 80 100 90 118 90 88 104 2 +80 102 54 82 102 92 116 -9 86 104 2 +88 104 56 84 102 92 120 -9 88 100 2 +80 102 54 82 102 92 116 -9 86 104 2 +88 104 56 84 102 92 120 -9 88 100 2 +86 102 56 80 102 90 116 90 86 100 3 +88 104 54 80 102 90 116 92 86 100 3 +86 102 56 80 104 90 116 90 86 100 3 +88 102 54 80 102 90 116 92 86 100 3 diff --git a/matlab/ExampleData/spatial clustering of groups/Example data in FASTA format for group-wise clustering of haploid individuals.fasta b/matlab/ExampleData/spatial clustering of groups/Example data in FASTA format for group-wise clustering of haploid individuals.fasta new file mode 100644 index 0000000..9501482 --- /dev/null +++ b/matlab/ExampleData/spatial clustering of groups/Example data in FASTA format for group-wise clustering of haploid individuals.fasta @@ -0,0 +1,20 @@ +>1 +AACGAAACGATCGCGTCACCGGAACGTTGTCCGTCTCGAATAGCACTGTGGGAACGTGTTTTACATTCGT +TAGTAACATGGTCAGCTGCTCATCCGTATT + +>2 +ATCAGCAAACGAGAAGTTGCAGAGGTCTTTGGTTTGAGCATTGCCCCCATACAATCGACTTCTGGCCTGG +AATGCACCACAAACATACCCCACAGGCTCG + +>3 +GCTTTTACTAAGGCCTATCGGATTCAACGTCACTAAGACTCGGCACTAACAGGCCGTTGTAAGCCGCTCT +GTCTGAGTATGGATGGTGGAGGCGGAGCCG + +>4 +ACCTGGACCTCTGTATTAACGGCTGTGATTCTGAGGGGGGTATCGCAGCGCACTTTCTAGCTATATCACG +CAAGGATAAAGTTCACCCATCACGTTGACC + +>5 +ACAATACGTCATCCACACCGCGCCTATGGAAGAATTTGCCCTTTCGGCGACAGCCCATGCTGTCAAGGAG +GTAACATAGCTACCAGGTCCCATTCCAGGA + diff --git a/matlab/ExampleData/spatial clustering of groups/group names.txt b/matlab/ExampleData/spatial clustering of groups/group names.txt new file mode 100644 index 0000000..0895c27 --- /dev/null +++ b/matlab/ExampleData/spatial clustering of groups/group names.txt @@ -0,0 +1,3 @@ +group_A +groupB +C \ No newline at end of file diff --git a/matlab/ExampleData/spatial clustering of groups/group partition for FASTA-format.txt b/matlab/ExampleData/spatial clustering of groups/group partition for FASTA-format.txt new file mode 100644 index 0000000..ddbf58e --- /dev/null +++ b/matlab/ExampleData/spatial clustering of groups/group partition for FASTA-format.txt @@ -0,0 +1,5 @@ +1 +1 +2 +2 +3 \ No newline at end of file diff --git a/matlab/ExampleData/spatial clustering of individuals/Example coordinates for the spatial clustering (not unique).txt b/matlab/ExampleData/spatial clustering of individuals/Example coordinates for the spatial clustering (not unique).txt new file mode 100644 index 0000000..8dbd3cd --- /dev/null +++ b/matlab/ExampleData/spatial clustering of individuals/Example coordinates for the spatial clustering (not unique).txt @@ -0,0 +1,5 @@ + 3.1013610e+006 6.7104850e+006 + 3.1013610e+006 6.7104850e+006 + 3.1015790e+006 6.7101790e+006 + 3.1015910e+006 6.7100650e+006 + 3.1017660e+006 6.7104190e+006 \ No newline at end of file diff --git a/matlab/ExampleData/spatial clustering of individuals/Example coordinates for the spatial clustering.txt b/matlab/ExampleData/spatial clustering of individuals/Example coordinates for the spatial clustering.txt new file mode 100644 index 0000000..11a0739 --- /dev/null +++ b/matlab/ExampleData/spatial clustering of individuals/Example coordinates for the spatial clustering.txt @@ -0,0 +1,5 @@ + 3.1013610e+006 6.7104850e+006 + 3.1011410e+006 6.7108850e+006 + 3.1015790e+006 6.7101790e+006 + 3.1015910e+006 6.7100650e+006 + 3.1017660e+006 6.7104190e+006 \ No newline at end of file diff --git a/matlab/ExampleData/spatial clustering of individuals/Example data in BAPS format for clustering of diploid individuals.txt b/matlab/ExampleData/spatial clustering of individuals/Example data in BAPS format for clustering of diploid individuals.txt new file mode 100644 index 0000000..c4b064e --- /dev/null +++ b/matlab/ExampleData/spatial clustering of individuals/Example data in BAPS format for clustering of diploid individuals.txt @@ -0,0 +1,10 @@ +-9 102 56 80 100 90 118 90 88 104 1 +-9 102 54 82 102 92 116 90 86 104 1 +88 104 56 84 102 -9 120 90 88 100 2 +86 102 56 80 102 -9 116 90 86 100 2 +88 102 54 80 102 90 116 92 -9 100 3 +88 102 56 80 100 90 118 90 -9 104 3 +80 102 54 82 102 92 116 90 86 104 4 +88 104 56 84 102 92 120 90 88 100 4 +86 102 56 80 -9 90 116 90 86 100 5 +88 102 54 80 -9 90 116 92 86 100 5 diff --git a/matlab/ExampleData/spatial clustering of individuals/Example data in BAPS format for clustering of haploid individuals.txt b/matlab/ExampleData/spatial clustering of individuals/Example data in BAPS format for clustering of haploid individuals.txt new file mode 100644 index 0000000..e0ed559 --- /dev/null +++ b/matlab/ExampleData/spatial clustering of individuals/Example data in BAPS format for clustering of haploid individuals.txt @@ -0,0 +1,5 @@ +88 102 56 80 100 90 118 -9 88 104 1 +80 102 54 82 102 92 116 90 86 104 2 +88 104 56 84 102 -9 120 90 88 100 3 +86 102 56 80 102 90 116 90 86 100 4 +88 102 -9 80 102 90 116 92 86 100 5 diff --git a/matlab/ExampleData/spatial clustering of individuals/Example data in BAPS format for clustering of tetraploid individuals.txt b/matlab/ExampleData/spatial clustering of individuals/Example data in BAPS format for clustering of tetraploid individuals.txt new file mode 100644 index 0000000..00f03f1 --- /dev/null +++ b/matlab/ExampleData/spatial clustering of individuals/Example data in BAPS format for clustering of tetraploid individuals.txt @@ -0,0 +1,20 @@ +88 102 56 80 100 90 118 90 88 104 1 +80 102 54 82 102 92 116 90 86 104 1 +88 102 56 80 100 90 118 90 88 104 1 +80 102 54 82 102 92 116 90 86 104 1 +88 -9 56 84 102 92 120 90 88 100 2 +86 -9 56 80 102 90 116 90 86 102 2 +88 -9 56 84 102 92 120 90 88 100 2 +86 -9 56 80 102 90 116 90 86 100 2 +88 102 54 82 102 90 116 92 86 102 3 +88 102 56 80 100 90 118 90 88 104 3 +88 102 54 80 102 90 116 92 86 102 3 +88 102 56 80 100 90 118 90 88 104 3 +80 102 54 82 102 92 116 -9 86 104 4 +88 104 56 84 102 92 120 -9 88 100 4 +80 102 54 82 102 92 116 -9 86 104 4 +88 104 56 84 102 92 120 -9 88 100 4 +86 102 56 80 102 90 116 90 86 100 5 +88 104 54 80 102 90 116 92 86 100 5 +86 102 56 80 104 90 116 90 86 100 5 +88 102 54 80 102 90 116 92 86 100 5 diff --git a/matlab/ExampleData/spatial clustering of individuals/Example data in FASTA format for clustering of haploid individuals.fasta b/matlab/ExampleData/spatial clustering of individuals/Example data in FASTA format for clustering of haploid individuals.fasta new file mode 100644 index 0000000..9501482 --- /dev/null +++ b/matlab/ExampleData/spatial clustering of individuals/Example data in FASTA format for clustering of haploid individuals.fasta @@ -0,0 +1,20 @@ +>1 +AACGAAACGATCGCGTCACCGGAACGTTGTCCGTCTCGAATAGCACTGTGGGAACGTGTTTTACATTCGT +TAGTAACATGGTCAGCTGCTCATCCGTATT + +>2 +ATCAGCAAACGAGAAGTTGCAGAGGTCTTTGGTTTGAGCATTGCCCCCATACAATCGACTTCTGGCCTGG +AATGCACCACAAACATACCCCACAGGCTCG + +>3 +GCTTTTACTAAGGCCTATCGGATTCAACGTCACTAAGACTCGGCACTAACAGGCCGTTGTAAGCCGCTCT +GTCTGAGTATGGATGGTGGAGGCGGAGCCG + +>4 +ACCTGGACCTCTGTATTAACGGCTGTGATTCTGAGGGGGGTATCGCAGCGCACTTTCTAGCTATATCACG +CAAGGATAAAGTTCACCCATCACGTTGACC + +>5 +ACAATACGTCATCCACACCGCGCCTATGGAAGAATTTGCCCTTTCGGCGACAGCCCATGCTGTCAAGGAG +GTAACATAGCTACCAGGTCCCATTCCAGGA + diff --git a/matlab/ExampleData/spatial clustering of individuals/individual indices in population.txt b/matlab/ExampleData/spatial clustering of individuals/individual indices in population.txt new file mode 100644 index 0000000..ddbf58e --- /dev/null +++ b/matlab/ExampleData/spatial clustering of individuals/individual indices in population.txt @@ -0,0 +1,5 @@ +1 +1 +2 +2 +3 \ No newline at end of file diff --git a/matlab/ExampleData/spatial clustering of individuals/population names.txt b/matlab/ExampleData/spatial clustering of individuals/population names.txt new file mode 100644 index 0000000..c9b6134 --- /dev/null +++ b/matlab/ExampleData/spatial clustering of individuals/population names.txt @@ -0,0 +1,3 @@ +pop_A +popB +C diff --git a/matlab/add_BAPS_to_path.m b/matlab/add_BAPS_to_path.m new file mode 100644 index 0000000..908eee9 --- /dev/null +++ b/matlab/add_BAPS_to_path.m @@ -0,0 +1,41 @@ +baps_path = cd; +addpath(genpath(baps_path)); + +% % addpath(genpath('/home/ai2/murphyk/matlab/BNT')) +% % fails to add directories which only contain directories but no regular files +% % e.g., BNT/inference +% % This bug has been fixed in matlab 6.5 +% +% global BNT_HOME +% % BNT_HOME = 'C:\kpmurphy\matlab\BNT'; +% % BNT_HOME = '/home/ai2/murphyk/matlab/BNT'; +% BNT_HOME = 'D:\Matlab\BNT'; +% +% +% files = {'CPDs', 'general', 'misc', 'graph', 'graph/C', 'Graphics', 'stats1', 'stats2', ... +% 'netlab2', 'HMM', 'Kalman', 'Entropic', 'Entropic/Brand', ... +% 'inference', 'inference/static', 'inference/dynamic', 'inference/online', ... +% 'learning', 'potentials', 'potentials/Tables', ... +% 'examples/dynamic', 'examples/dynamic/HHMM', 'examples/dynamic/HHMM/Square', ... +% 'examples/dynamic/HHMM/Map', ... +% 'examples/dynamic/HHMM/Motif', 'examples/dynamic/SLAM', 'examples/limids', ... +% 'examples/static', 'examples/static/Misc', 'examples/static/Models', ... +% 'examples/static/Belprop', ... +% 'examples/static/Zoubin', 'examples/static/HME', 'examples/static/SCG', ... +% 'examples/static/dtree', ... +% 'examples/static/StructLearn', 'examples/static/fgraph'}; +% +% +% %eval(sprintf('addpath %s', BNT_HOME)); +% eval(sprintf('addpath ''%s'' ', BNT_HOME)); +% % to cope with filenames with spaces, we must use quotes +% % Can use fullfile to get either / for unix or \ for windows +% % use 'isunix' to determine the operating system +% +% for i=1:length(files) +% f = files{i}; +% eval(sprintf('addpath ''%s''/%s', BNT_HOME, f)); +% end +% +% +% diff --git a/matlab/admixture/admix1.m b/matlab/admixture/admix1.m new file mode 100644 index 0000000..beda82a --- /dev/null +++ b/matlab/admixture/admix1.m @@ -0,0 +1,846 @@ +function admix1(tietue) +% Jos tietue == -1, ladataan mixture result file. +% Muussa tapauksessa saadaan tarvittavat muuttujat +% tietueen kentist? + +% set for debugging, must be disabled before publishing. +% rand('state',0) + +global COUNTS; global PARTITION; global SUMCOUNTS; +clearGlobalVars; + +if (~isstruct(tietue)) + [filename, pathname] = uigetfile('*.mat', 'Load mixture result file'); + if (filename==0 & pathname==0), return; + else + disp('---------------------------------------------------'); + disp(['Reading mixture result from: ',[pathname filename],'...']); + end + pause(0.0001); + h0 = findobj('Tag','filename1_text'); + set(h0,'String',filename); clear h0; + + struct_array = load([pathname filename]); + if isfield(struct_array,'c') %Matlab versio + c = struct_array.c; + if ~isfield(c,'PARTITION') | ~isfield(c,'rowsFromInd') + disp('Incorrect file format'); + return + end + elseif isfield(struct_array,'PARTITION') %Mideva versio + c = struct_array; + if ~isfield(c,'rowsFromInd') + disp('Incorrect file format'); + return + end + else + disp('Incorrect file format'); + return; + end + + if isfield(c, 'gene_lengths') && ... + (strcmp(c.mixtureType,'linear_mix') | ... + strcmp(c.mixtureType,'codon_mix')) % if the mixture is from a linkage model + % Redirect the call to the linkage admixture function. + c.data = noIndex(c.data,c.noalle); % call function noindex to remove the index column + linkage_admix(c); + return + end + + PARTITION = c.PARTITION; COUNTS = c.COUNTS; SUMCOUNTS = c.SUMCOUNTS; + alleleCodes = c.alleleCodes; adjprior = c.adjprior; popnames = c.popnames; + rowsFromInd = c.rowsFromInd; data = c.data; npops = c.npops; noalle = c.noalle; +else + PARTITION = tietue.PARTITION; + COUNTS = tietue.COUNTS; + SUMCOUNTS = tietue.SUMCOUNTS; + alleleCodes = tietue.alleleCodes; + adjprior = tietue.adjprior; + popnames = tietue.popnames; + rowsFromInd = tietue.rowsFromInd; + data = double(tietue.data); + npops = tietue.npops; + noalle = tietue.noalle; +end + +answers = inputdlg({['Input the minimum size of a population that will'... + ' be taken into account when admixture is estimated.']},... + 'Input minimum population size',[1],... + {'5'}); +if isempty(answers) return; end +alaRaja = str2num(answers{1,1}); +[npops] = poistaLiianPienet(npops, rowsFromInd, alaRaja); + +nloci = size(COUNTS,2); +ninds = size(data,1)/rowsFromInd; + +answers = inputdlg({['Input number of iterations']},'Input',[1],{'50'}); +if isempty(answers) return; end +iterationCount = str2num(answers{1,1}); + +answers = inputdlg({['Input number of reference individuals from each population']},'Input',[1],{'50'}); +if isempty(answers) nrefIndsInPop = 50; +else nrefIndsInPop = str2num(answers{1,1}); +end + +answers = inputdlg({['Input number of iterations for reference individuals']},'Input',[1],{'10'}); +if isempty(answers) return; end +iterationCountRef = str2num(answers{1,1}); + + +% First calculate log-likelihood ratio for all individuals: +likelihood = zeros(ninds,1); +allfreqs = computeAllFreqs2(noalle); +for ind = 1:ninds + omaFreqs = computePersonalAllFreqs(ind, data, allfreqs, rowsFromInd); + osuusTaulu = zeros(1,npops); + if PARTITION(ind)==0 + % Yksil?on outlier + elseif PARTITION(ind)~=0 + if PARTITION(ind)>0 + osuusTaulu(PARTITION(ind)) = 1; + else + % Yksilöt, joita ei ole sijoitettu mihinkään koriin. + arvot = zeros(1,npops); + for q=1:npops + osuusTaulu = zeros(1,npops); + osuusTaulu(q) = 1; + arvot(q) = computeIndLogml(omaFreqs, osuusTaulu); + end + [iso_arvo, isoimman_indeksi] = max(arvot); + osuusTaulu = zeros(1,npops); + osuusTaulu(isoimman_indeksi) = 1; + PARTITION(ind)=isoimman_indeksi; + end + logml = computeIndLogml(omaFreqs, osuusTaulu); + logmlAlku = logml; + for osuus = [0.5 0.25 0.05 0.01] + [osuusTaulu, logml] = etsiParas(osuus, osuusTaulu, omaFreqs, logml); + end + logmlLoppu = logml; + likelihood(ind) = logmlLoppu-logmlAlku; + end +end + +% Analyze further only individuals who have log-likelihood ratio larger than 3: +to_investigate = (find(likelihood>3))'; +disp('Possibly admixed individuals: '); +for i = 1:length(to_investigate) + disp(num2str(to_investigate(i))); +end +disp(' '); +disp('Populations for possibly admixed individuals: '); +admix_populaatiot = unique(PARTITION(to_investigate)); +for i = 1:length(admix_populaatiot) + disp(num2str(admix_populaatiot(i))); +end + +% THUS, there are two types of individuals, who will not be analyzed with +% simulated allele frequencies: those who belonged to a mini-population +% which was removed, and those who have log-likelihood ratio less than 3. +% The value in the PARTITION for the first kind of individuals is 0. The +% second kind of individuals can be identified, because they do not +% belong to "to_investigate" array. When the results are presented, the +% first kind of individuals are omitted completely, while the second kind +% of individuals are completely put to the population, where they ended up +% in the mixture analysis. These second type of individuals will have a +% unit p-value. + + +% Simulate allele frequencies a given number of times and save the average +% result to "proportionsIt" array. + +proportionsIt = zeros(ninds,npops); +for iterationNum = 1:iterationCount + disp(['Iter: ' num2str(iterationNum)]); + allfreqs = simulateAllFreqs(noalle); % Allele frequencies on this iteration. + + for ind=to_investigate + %disp(num2str(ind)); + omaFreqs = computePersonalAllFreqs(ind, data, allfreqs, rowsFromInd); + osuusTaulu = zeros(1,npops); + if PARTITION(ind)==0 + % Yksil?on outlier + elseif PARTITION(ind)~=0 + if PARTITION(ind)>0 + osuusTaulu(PARTITION(ind)) = 1; + else + % Yksilöt, joita ei ole sijoitettu mihinkään koriin. + arvot = zeros(1,npops); + for q=1:npops + osuusTaulu = zeros(1,npops); + osuusTaulu(q) = 1; + arvot(q) = computeIndLogml(omaFreqs, osuusTaulu); + end + [iso_arvo, isoimman_indeksi] = max(arvot); + osuusTaulu = zeros(1,npops); + osuusTaulu(isoimman_indeksi) = 1; + PARTITION(ind)=isoimman_indeksi; + end + logml = computeIndLogml(omaFreqs, osuusTaulu); + + for osuus = [0.5 0.25 0.05 0.01] + [osuusTaulu, logml] = etsiParas(osuus, osuusTaulu, omaFreqs, logml); + end + end + proportionsIt(ind,:) = proportionsIt(ind,:).*(iterationNum-1) + osuusTaulu; + proportionsIt(ind,:) = proportionsIt(ind,:)./iterationNum; + end +end + +%disp(['Creating ' num2str(nrefIndsInPop) ' reference individuals from ']); +%disp('each population.'); + +%allfreqs = simulateAllFreqs(noalle); % Simuloidaan alleelifrekvenssisetti +allfreqs = computeAllFreqs2(noalle); % Koitetaan tällaista. + + +% Initialize the data structures, which are required in taking the missing +% data into account: +n_missing_levels = zeros(npops,1); % number of different levels of "missingness" in each pop (max 3). +missing_levels = zeros(npops,3); % the mean values for different levels. +missing_level_partition = zeros(ninds,1); % level of each individual (one of the levels of its population). +for i=1:npops + inds = find(PARTITION==i); + % Proportions of non-missing data for the individuals: + non_missing_data = zeros(length(inds),1); + for j = 1:length(inds) + ind = inds(j); + non_missing_data(j) = length(find(data((ind-1)*rowsFromInd+1:ind*rowsFromInd,:)>0)) ./ (rowsFromInd*nloci); + end + if all(non_missing_data>0.9) + n_missing_levels(i) = 1; + missing_levels(i,1) = mean(non_missing_data); + missing_level_partition(inds) = 1; + else + [ordered, ordering] = sort(non_missing_data); + %part = learn_simple_partition(ordered, 0.05); + part = learn_partition_modified(ordered); + aux = sortrows([part ordering],2); + part = aux(:,1); + missing_level_partition(inds) = part; + n_levels = length(unique(part)); + n_missing_levels(i) = n_levels; + for j=1:n_levels + missing_levels(i,j) = mean(non_missing_data(find(part==j))); + end + end +end + +% Create and analyse reference individuals for populations +% with potentially admixed individuals: +refTaulu = zeros(npops,100,3); +for pop = admix_populaatiot' + + for level = 1:n_missing_levels(pop) + + potential_inds_in_this_pop_and_level = ... + find(PARTITION==pop & missing_level_partition==level &... + likelihood>3); % Potential admix individuals here. + + if ~isempty(potential_inds_in_this_pop_and_level) + + %refData = simulateIndividuals(nrefIndsInPop,rowsFromInd,allfreqs); + refData = simulateIndividuals(nrefIndsInPop, rowsFromInd, allfreqs, ... + pop, missing_levels(pop,level)); + + disp(['Analysing the reference individuals from pop ' num2str(pop) ' (level ' num2str(level) ').']); + refProportions = zeros(nrefIndsInPop,npops); + for iter = 1:iterationCountRef + %disp(['Iter: ' num2str(iter)]); + allfreqs = simulateAllFreqs(noalle); + + for ind = 1:nrefIndsInPop + omaFreqs = computePersonalAllFreqs(ind, refData, allfreqs, rowsFromInd); + osuusTaulu = zeros(1,npops); + osuusTaulu(pop)=1; + logml = computeIndLogml(omaFreqs, osuusTaulu); + for osuus = [0.5 0.25 0.05 0.01] + [osuusTaulu, logml] = etsiParas(osuus, osuusTaulu, omaFreqs, logml); + end + refProportions(ind,:) = refProportions(ind,:).*(iter-1) + osuusTaulu; + refProportions(ind,:) = refProportions(ind,:)./iter; + end + end + for ind = 1:nrefIndsInPop + omanOsuus = refProportions(ind,pop); + if round(omanOsuus*100)==0 + omanOsuus = 0.01; + end + if abs(omanOsuus)<1e-5 + omanOsuus = 0.01; + end + refTaulu(pop, round(omanOsuus*100),level) = refTaulu(pop, round(omanOsuus*100),level)+1; + end + end + end +end + +% Rounding of the results: +proportionsIt = proportionsIt.*100; proportionsIt = round(proportionsIt); +proportionsIt = proportionsIt./100; +for ind = 1:ninds + if ~any(to_investigate==ind) + if PARTITION(ind)>0 + proportionsIt(ind,PARTITION(ind))=1; + end + else + % In case of a rounding error, the sum is made equal to unity by + % fixing the largest value. + if (PARTITION(ind)>0) & (sum(proportionsIt(ind,:)) ~= 1) + [isoin,indeksi] = max(proportionsIt(ind,:)); + erotus = sum(proportionsIt(ind,:))-1; + proportionsIt(ind,indeksi) = isoin-erotus; + end + end +end + +% Calculate p-value for each individual: +uskottavuus = zeros(ninds,1); +for ind = 1:ninds + pop = PARTITION(ind); + if pop==0 % Individual is outlier + uskottavuus(ind)=1; + elseif isempty(find(to_investigate==ind)) + % Individual had log-likelihood ratio<3 + uskottavuus(ind)=1; + else + omanOsuus = proportionsIt(ind,pop); + if abs(omanOsuus)<1e-5 + omanOsuus = 0.01; + end + if round(omanOsuus*100)==0 + omanOsuus = 0.01; + end + level = missing_level_partition(ind); + refPienempia = sum(refTaulu(pop, 1:round(100*omanOsuus), level)); + uskottavuus(ind) = refPienempia / nrefIndsInPop; + end +end + +tulostaAdmixtureTiedot(proportionsIt, uskottavuus, alaRaja, iterationCount); + +viewPartition(proportionsIt, popnames); + +talle = questdlg(['Do you want to save the admixture results?'], ... + 'Save results?','Yes','No','Yes'); +if isequal(talle,'Yes') + %waitALittle; + [filename, pathname] = uiputfile('*.mat','Save results as'); + + + if (filename == 0) & (pathname == 0) + % Cancel was pressed + return + else % copy 'baps4_output.baps' into the text file with the same name. + if exist('baps4_output.baps','file') + copyfile('baps4_output.baps',[pathname filename '.txt']) + delete('baps4_output.baps') + end + end + + + if (~isstruct(tietue)) + c.proportionsIt = proportionsIt; + c.pvalue = uskottavuus; % Added by Jing + c.mixtureType = 'admix'; % Jing + c.admixnpops = npops; +% save([pathname filename], 'c'); + save([pathname filename], 'c', '-v7.3'); % added by Lu Cheng, 08.06.2012 + else + tietue.proportionsIt = proportionsIt; + tietue.pvalue = uskottavuus; % Added by Jing + tietue.mixtureType = 'admix'; + tietue.admixnpops = npops; +% save([pathname filename], 'tietue'); + save([pathname filename], 'tietue', '-v7.3'); % added by Lu Cheng, 08.06.2012 + end +end + + +%---------------------------------------------------------------------------- + + +function [npops] = poistaLiianPienet(npops, rowsFromInd, alaraja) +% Muokkaa tulokset muotoon, jossa outlier yksilöt on +% poistettu. Tarkalleen ottaen poistaa ne populaatiot, +% joissa on vähemmän kuin 'alaraja':n verran yksilöit? + +global PARTITION; +global COUNTS; +global SUMCOUNTS; + +popSize=zeros(1,npops); +for i=1:npops + popSize(i)=length(find(PARTITION==i)); +end +miniPops = find(popSize0))); +for n=1:length(korit) + kori = korit(n); + yksilot = find(PARTITION==kori); + PARTITION(yksilot) = n; +end +COUNTS(:,:,miniPops) = []; +SUMCOUNTS(miniPops,:) = []; + +npops = npops-length(miniPops); + +%------------------------------------------------------------------------ + +function clearGlobalVars + +global COUNTS; COUNTS = []; +global SUMCOUNTS; SUMCOUNTS = []; +global PARTITION; PARTITION = []; +global POP_LOGML; POP_LOGML = []; + +%-------------------------------------------------------- + + +function allFreqs = computeAllFreqs2(noalle) +% Lisää a priori jokaista alleelia +% joka populaation joka lokukseen j 1/noalle(j) verran. + +global COUNTS; +global SUMCOUNTS; + +max_noalle = size(COUNTS,1); +nloci = size(COUNTS,2); +npops = size(COUNTS,3); + +sumCounts = SUMCOUNTS+ones(size(SUMCOUNTS)); +sumCounts = reshape(sumCounts', [1, nloci, npops]); +sumCounts = repmat(sumCounts, [max_noalle, 1 1]); + +prioriAlleelit = zeros(max_noalle,nloci); +for j=1:nloci + prioriAlleelit(1:noalle(j),j) = 1/noalle(j); +end +prioriAlleelit = repmat(prioriAlleelit, [1,1,npops]); +counts = COUNTS + prioriAlleelit; +allFreqs = counts./sumCounts; + + +function allfreqs = simulateAllFreqs(noalle) +% Lisää jokaista alleelia joka populaation joka lokukseen j 1/noalle(j) +% verran. Näin saatuja counts:eja vastaavista Dirichlet-jakaumista +% simuloidaan arvot populaatioiden alleelifrekvensseille. + +global COUNTS; + +max_noalle = size(COUNTS,1); +nloci = size(COUNTS,2); +npops = size(COUNTS,3); + +prioriAlleelit = zeros(max_noalle,nloci); +for j=1:nloci + prioriAlleelit(1:noalle(j),j) = 1/noalle(j); +end +prioriAlleelit = repmat(prioriAlleelit, [1,1,npops]); +counts = COUNTS + prioriAlleelit; +allfreqs = zeros(size(counts)); + +for i=1:npops + for j=1:nloci + simuloidut = randdir(counts(1:noalle(j),j,i) , noalle(j)); + allfreqs(1:noalle(j),j,i) = simuloidut; + end +end + +%-------------------------------------------------------------------------- + + +function refData = simulateIndividuals(n,rowsFromInd,allfreqs,pop, missing_level) +% simulate n individuals from population pop, such that approximately +% proportion "missing_level" of the alleles are present. + +nloci = size(allfreqs,2); + +refData = zeros(n*rowsFromInd,nloci); +counter = 1; % which row will be generated next. + +for ind = 1:n + for loc = 1:nloci + for k=0:rowsFromInd-1 + if randarvo); +all = min(isommat); + + +%-------------------------------------------------------------------------- + + +function omaFreqs = computePersonalAllFreqs(ind, data, allFreqs, rowsFromInd) +% Laskee npops*(rowsFromInd*nloci) taulukon, jonka kutakin saraketta +% vastaa yksilön ind alleeli. Eri rivit ovat alleelin alkuperäfrekvenssit +% eri populaatioissa. Jos yksilölt?puuttuu jokin alleeli, niin vastaavaan +% kohtaa tulee sarake ykkösi? + +global COUNTS; +nloci = size(COUNTS,2); +npops = size(COUNTS,3); + +rows = data(computeRows(rowsFromInd, ind, 1),:); + +omaFreqs = zeros(npops, (rowsFromInd*nloci)); +pointer = 1; +for loc=1:size(rows,2) + for all=1:size(rows,1) + if rows(all,loc)>=0 + try, + omaFreqs(:,pointer) = ... + reshape(allFreqs(rows(all,loc),loc,:), [npops,1]); + catch + a=0; + end + else + omaFreqs(:,pointer) = ones(npops,1); + end + pointer = pointer+1; + end +end + + +%--------------------------------------------------------------------------- + + +function loggis = computeIndLogml(omaFreqs, osuusTaulu) +% Palauttaa yksilön logml:n, kun oletetaan yksilön alkuperät +% määritellyiksi kuten osuusTaulu:ssa. + +apu = repmat(osuusTaulu', [1 size(omaFreqs,2)]); +apu = apu .* omaFreqs; +apu = sum(apu); + +apu = log(apu); + +loggis = sum(apu); + + +%-------------------------------------------------------------------------- + + +function osuusTaulu = suoritaMuutos(osuusTaulu, osuus, indeksi) +% Päivittää osuusTaulun muutoksen jälkeen. + +global COUNTS; +npops = size(COUNTS,3); + +i1 = rem(indeksi,npops); +if i1==0, i1 = npops; end; +i2 = ceil(indeksi / npops); + +osuusTaulu(i1) = osuusTaulu(i1)-osuus; +osuusTaulu(i2) = osuusTaulu(i2)+osuus; + + +%------------------------------------------------------------------------- + + +function [osuusTaulu, logml] = etsiParas(osuus, osuusTaulu, omaFreqs, logml) + +ready = 0; +while ready ~= 1 + muutokset = laskeMuutokset4(osuus, osuusTaulu, omaFreqs, logml); + [maxMuutos, indeksi] = max(muutokset(1:end)); + if maxMuutos>0 + osuusTaulu = suoritaMuutos(osuusTaulu, osuus, indeksi); + logml = logml + maxMuutos; + else + ready = 1; + end +end + + + +%--------------------------------------------------------------------------- + + +function muutokset = laskeMuutokset4(osuus, osuusTaulu, omaFreqs, logml) +% Palauttaa npops*npops taulun, jonka alkio (i,j) kertoo, mik?on +% muutos logml:ss? mikäli populaatiosta i siirretään osuuden verran +% todennäköisyysmassaa populaatioon j. Mikäli populaatiossa i ei ole +% mitään siirrettävää, on vastaavassa kohdassa rivi nollia. + +global COUNTS; +npops = size(COUNTS,3); + +notEmpty = find(osuusTaulu>0.005); +muutokset = zeros(npops); +empties = ~notEmpty; + +for i1=notEmpty + + osuusTaulu(i1) = osuusTaulu(i1)-osuus; + + for i2 = [1:i1-1 i1+1:npops] + osuusTaulu(i2) = osuusTaulu(i2)+osuus; + loggis = computeIndLogml(omaFreqs, osuusTaulu); + muutokset(i1,i2) = loggis-logml; + osuusTaulu(i2) = osuusTaulu(i2)-osuus; + end + + osuusTaulu(i1) = osuusTaulu(i1)+osuus; +end + + +%--------------------------------------------------------------- + + +function dispLine; +disp('---------------------------------------------------'); + + +%-------------------------------------------------------------------------- + + +function tulostaAdmixtureTiedot(proportions, uskottavuus, alaRaja, niter) +h0 = findobj('Tag','filename1_text'); +inputf = get(h0,'String'); +h0 = findobj('Tag','filename2_text'); +outf = get(h0,'String'); clear h0; + +if length(outf)>0 + fid = fopen(outf,'a'); +else + fid = -1; + diary('baps4_output.baps'); % save in text anyway. +end + +ninds = length(uskottavuus); +npops = size(proportions,2); +disp(' '); +dispLine; +disp('RESULTS OF ADMIXTURE ANALYSIS BASED'); +disp('ON MIXTURE CLUSTERING OF INDIVIDUALS'); +disp(['Data file: ' inputf]); +disp(['Number of individuals: ' num2str(ninds)]); +disp(['Results based on ' num2str(niter) ' simulations from posterior allele frequencies.']); +disp(' '); +if fid ~= -1 + fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['--------------------------------------------']); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['RESULTS OF ADMIXTURE ANALYSIS BASED']); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['ON MIXTURE CLUSTERING OF INDIVIDUALS']); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['Data file: ' inputf]); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['Number of individuals: ' num2str(ninds)]); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['Results based on ' num2str(niter) ' simulations from posterior allele frequencies.']); fprintf(fid, '\n'); + fprintf(fid, '\n'); +end + +ekaRivi = blanks(6); +for pop = 1:npops + ekaRivi = [ekaRivi blanks(3-floor(log10(pop))) num2str(pop) blanks(2)]; +end +ekaRivi = [ekaRivi blanks(1) 'p']; % Added on 29.08.06 +disp(ekaRivi); +for ind = 1:ninds + rivi = [num2str(ind) ':' blanks(4-floor(log10(ind)))]; + if any(proportions(ind,:)>0) + for pop = 1:npops-1 + rivi = [rivi proportion2str(proportions(ind,pop)) blanks(2)]; + end + rivi = [rivi proportion2str(proportions(ind,npops)) ': ']; + rivi = [rivi ownNum2Str(uskottavuus(ind))]; + end + disp(rivi); + if fid ~= -1 + fprintf(fid,'%s \n',[rivi]); fprintf(fid,'\n'); + end +end +if fid ~= -1 + fclose(fid); +else + diary off +end + +%------------------------------------------------------ + +function str = proportion2str(prob) +%prob belongs to [0.00, 0.01, ... ,1]. +%str is a 4-mark presentation of proportion. + +if abs(prob)<1e-3 + str = '0.00'; +elseif abs(prob-1) < 1e-3; + str = '1.00'; +else + prob = round(100*prob); + if prob<10 + str = ['0.0' num2str(prob)]; + else + str = ['0.' num2str(prob)]; + end; +end; + +%------------------------------------------------------- + +function g=randga(a,b) +flag = 0; +if a>1 +c1 = a-1; c2 = (a-(1/(6*a)))/c1; c3 = 2/c1; c4 = c3+2; c5 = 1/sqrt(a); +U1=-1; +while flag == 0, +if a<=2.5, +U1=rand;U2=rand; +else +while ~(U1>0 & U1<1), +U1=rand;U2=rand; +U1 = U2 + c5*(1-1.86*U1); +end %while +end %if +W = c2*U2/U1; +if c3*U1+W+(1/W)<=c4, +flag = 1; +g = c1*W/b; +elseif c3*log(U1)-log(W)+W<1, +flag = 1; +g = c1*W/b; +else +U1=-1; +end %if +end %while flag +elseif a==1 +g=sum(-(1/b)*log(rand(a,1))); +else +while flag == 0, +U = rand(2,1); +if U(1)>exp(1)/(a+exp(1)), +g = -log(((a+exp(1))*(1-U(1)))/(a*exp(1))); +if U(2)<=g^(a-1), +flag = 1; +end %if +else +g = ((a+exp(1))*U(1)/((exp(1))^(1/a))); +if U(2)<=exp(-g), +flag = 1; +end %if +end %if +end %while flag +g=g/b; +end %if; + + +%------------------------------------------------- + +function svar=randdir(counts,nc) +% Käyttöesim randdir([10;30;60],3) + +svar=zeros(nc,1); +for i=1:nc + svar(i,1)=randga(counts(i,1),1); +end +svar=svar/sum(svar); + +%------------------------------------------------------------------------------------- + + +function rows = computeRows(rowsFromInd, inds, ninds) +% Individuals inds have been given. The function returns a vector, +% containing the indices of the rows, which contain data from the +% individuals. + +rows = inds(:, ones(1,rowsFromInd)); +rows = rows*rowsFromInd; +miinus = repmat(rowsFromInd-1 : -1 : 0, [ninds 1]); +rows = rows - miinus; +rows = reshape(rows', [1,rowsFromInd*ninds]); + +%-------------------------------------------------------------------------- +%----- + +function str = ownNum2Str(number) + +absolute = abs(number); + +if absolute < 1000 + str = num2str(number); +elseif absolute < 10000000 + first_three = rem(number,1000); + next_four = (number - first_three) /1000; + first_three = abs(first_three); + if first_three<10 + first_three = ['00' num2str(first_three)]; + elseif first_three<100 + first_three = ['0' num2str(first_three)]; + else + first_three = num2str(first_three); + end; + str = [num2str(next_four) first_three]; +elseif absolute < 100000000 + first_four = rem(number,10000); + next_four = (number - first_four) /10000; + first_four = abs(first_four); + if first_four<10 + first_four = ['000' num2str(first_four)]; + elseif first_four<100 + first_four = ['00' num2str(first_four)]; + elseif first_four<1000 + first_four = ['0' num2str(first_four)]; + else + first_four = num2str(first_four); + end; + str = [num2str(next_four) first_four]; +else + str = num2str(number); +end; + +%------------------------------------------------ + + +function part = learn_partition_modified(ordered) +% This function is called only if some individual has less than 90 per cent +% non-missing data. The function uses fuzzy clustering for the "non-missingness" +% values, finding maximum three clusters. If two of the found clusters are such +% that all the values are >0.9, then those two are further combined. + +part = learn_simple_partition(ordered,0.05); +nclust = length(unique(part)); +if nclust==3 + mini_1 = min(ordered(find(part==1))); + mini_2 = min(ordered(find(part==2))); + mini_3 = min(ordered(find(part==3))); + + if mini_1>0.9 & mini_2>0.9 + part(find(part==2)) = 1; + part(find(part==3)) = 2; + + elseif mini_1>0.9 & mini_3>0.9 + part(find(part==3)) = 1; + + elseif mini_2>0.9 & mini_3>0.9 + % This is the one happening in practice, since the values are + % ordered, leading to mini_1 <= mini_2 <= mini_3 + part(find(part==3)) = 2; + end +end \ No newline at end of file diff --git a/matlab/admixture/admix2.m b/matlab/admixture/admix2.m new file mode 100644 index 0000000..f682572 --- /dev/null +++ b/matlab/admixture/admix2.m @@ -0,0 +1,622 @@ +function admix2 + +global PARTITION; global COUNTS; +global SUMCOUNTS; +clearGlobalVars; + +input_type = questdlg('Specify the format of your data: ',... + 'Specify Data Format', ... + 'BAPS-format', 'GenePop-format', 'BAPS-format'); + +switch input_type + +case 'BAPS-format' + waitALittle; + [filename, pathname] = uigetfile('*.txt', 'Load data in BAPS-format'); + if filename==0 + return; + end + + data = load([pathname filename]); + ninds = testaaOnkoKunnollinenBapsData(data); %TESTAUS + if (ninds==0) + disp('Incorrect Data-file.'); + return; + end + h0 = findobj('Tag','filename1_text'); + set(h0,'String',filename); clear h0; + waitALittle; + [filename, pathname] = uigetfile('*.txt', 'Load Partition'); + if filename==0 + return; + end + PARTITION = load([pathname filename]); + if ~(size(PARTITION,2)==1) | ~(size(PARTITION,1)==ninds) + disp('Incorrect Partition-file.'); + return; + end + + input_pops = questdlg(['When using data which are in BAPS-format, '... + 'you can specify the sampling populations of the individuals by '... + 'giving two additional files: one containing the names of the '... + 'populations, the other containing the indices of the first '... + 'individuals of the populations. Do you wish to specify the '... + 'sampling populations?'], ... + 'Specify sampling populations?',... + 'Yes', 'No', 'No'); + if isequal(input_pops,'Yes') + waitALittle; + [namefile, namepath] = uigetfile('*.txt', 'Load population names'); + if namefile==0 + kysyToinen = 0; + else + kysyToinen = 1; + end + if kysyToinen==1 + waitALittle; + [indicesfile, indicespath] = uigetfile('*.txt', 'Load population indices'); + if indicesfile==0 + popnames = []; + else + popnames = initPopNames([namepath namefile],[indicespath indicesfile]); + end + else + popnames = []; + end + else + popnames = []; + end + + [data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(data); + data = data(:, 1:end-1); + npops = length(unique(PARTITION(find(PARTITION>=0)))); + + +case 'GenePop-format' + waitALittle; + [filename, pathname] = uigetfile('*.txt', 'Load data in GenePop-format'); + if filename==0 + return; + end + + kunnossa = testaaGenePopData([pathname filename]); + if kunnossa==0 + return + end + + [data,popnames]=lueGenePopData([pathname filename]); + + h0 = findobj('Tag','filename1_text'); + set(h0,'String',filename); clear h0; + + [data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(data); + data = data(:, 1:end-1); + + npops = size(popnames,1); + ninds = size(data,1)/rowsFromInd; + PARTITION = zeros(ninds,1); + ind = 1; + for pop = 1:npops-1 + while (ind < popnames{pop+1,2}) + PARTITION(ind) = pop; + ind = ind+1; + end + end + while (ind <= ninds) + PARTITION(ind) = npops; + ind = ind+1; + end + + all_in_text = questdlg(['Do you wish to use also the last population in the ',... + 'data to define one more population for admixture analysis: '],... + 'Define a population based on the last population in the data?', ... + 'Yes', 'No', 'Yes'); + if isequal(all_in_text, 'No') + PARTITION(find(PARTITION==npops)) = -1; + npops = npops-1; + end + otherwise return +end + +initialPartition = PARTITION(:,ones(1,rowsFromInd))'; +initialPartition = initialPartition(:); +[sumcounts, counts, logml] = ... + initialCounts(initialPartition, data, npops, rowsFromInd, noalle); +COUNTS = counts; SUMCOUNTS = sumcounts; + +clear('initialPartition', 'counts', 'sumcounts', ... + 'filename', 'ind', 'input_type', ... + 'logml', 'ninds', 'pathname', 'pop', 'priorTerm'); +clear('indicesfile','indicespath','input_pops','kysyToinen',... + 'namefile','namepath'); +c.PARTITION = PARTITION; c.COUNTS = COUNTS; c.SUMCOUNTS = SUMCOUNTS; +c.alleleCodes = alleleCodes; c.adjprior = adjprior; c.popnames = popnames; +c.rowsFromInd = rowsFromInd; c.data = data; c.npops = npops; c.noalle = noalle; +admix1(c); + +%------------------------------------------------------------------------ + + +function clearGlobalVars + +global COUNTS; COUNTS = []; +global SUMCOUNTS; SUMCOUNTS = []; +global PARTITION; PARTITION = []; +global POP_LOGML; POP_LOGML = []; + +%-------------------------------------------------------- + +function ninds = testaaOnkoKunnollinenBapsData(data) +%Tarkastaa onko viimeisessä sarakkeessa kaikki +%luvut 1,2,...,n johonkin n:ään asti. +%Tarkastaa lisäksi, että on vähintään 2 saraketta. +if size(data,1)<2 + ninds = 0; return; +end +lastCol = data(:,end); +ninds = max(lastCol); +if ~isequal((1:ninds)',unique(lastCol)) + ninds = 0; return; +end + +%----------------------------------------------------------------------------------- + + +function popnames = initPopNames(nameFile, indexFile) +%Palauttaa tyhjän, mikäli nimitiedosto ja indeksitiedosto +% eivät olleet yhtä pitkiä. + +popnames = []; +indices = load(indexFile); + +fid = fopen(nameFile); +if fid == -1 + %File didn't exist + msgbox('Loading of the population names was unsuccessful', ... + 'Error', 'error'); + return; +end; +line = fgetl(fid); +counter = 1; +while (line ~= -1) & ~isempty(line) + names{counter} = line; + line = fgetl(fid); + counter = counter + 1; +end; +fclose(fid); + +if length(names) ~= length(indices) + disp('The number of population names must be equal to the number of '); + disp('entries in the file specifying indices of the first individuals of '); + disp('each population.'); + return; +end + +popnames = cell(length(names), 2); +for i = 1:length(names) + popnames{i,1} = names(i); + popnames{i,2} = indices(i); +end + +%--------------------------------------------------------------------------------------- + + +function [newData, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = ... + handleData(raw_data) +% Alkuperäisen datan viimeinen sarake kertoo, miltä yksilöltä +% kyseinen rivi on peräisin. Funktio tutkii ensin, että montako +% riviä maksimissaan on peräisin yhdeltä yksilöltä, jolloin saadaan +% tietää onko kyseessä haploidi, diploidi jne... Tämän jälkeen funktio +% lisää tyhjiä rivejä niille yksilöille, joilta on peräisin vähemmän +% rivejä kuin maksimimäärä. +% Mikäli jonkin alleelin koodi on =0, funktio muuttaa tämän alleelin +% koodi pienimmäksi koodiksi, joka isompi kuin mikään käytössä oleva koodi. +% Tämän jälkeen funktio muuttaa alleelikoodit siten, että yhden lokuksen j +% koodit saavat arvoja välillä 1,...,noalle(j). +data = raw_data; +nloci=size(raw_data,2)-1; + +dataApu = data(:,1:nloci); +nollat = find(dataApu==0); +if ~isempty(nollat) + isoinAlleeli = max(max(dataApu)); + dataApu(nollat) = isoinAlleeli+1; + data(:,1:nloci) = dataApu; +end +dataApu = []; nollat = []; isoinAlleeli = []; + +noalle=zeros(1,nloci); +alleelitLokuksessa = cell(nloci,1); +for i=1:nloci + alleelitLokuksessaI = unique(data(:,i)); + alleelitLokuksessa{i,1} = alleelitLokuksessaI(find(alleelitLokuksessaI>=0)); + noalle(i) = length(alleelitLokuksessa{i,1}); +end +alleleCodes = zeros(max(noalle),nloci); +for i=1:nloci + alleelitLokuksessaI = alleelitLokuksessa{i,1}; + puuttuvia = max(noalle)-length(alleelitLokuksessaI); + alleleCodes(:,i) = [alleelitLokuksessaI; zeros(puuttuvia,1)]; +end + +for loc = 1:nloci + for all = 1:noalle(loc) + data(find(data(:,loc)==alleleCodes(all,loc)), loc)=all; + end; +end; + +nind = max(data(:,end)); +nrows = size(data,1); +ncols = size(data,2); +rowsFromInd = zeros(nind,1); +for i=1:nind + rowsFromInd(i) = length(find(data(:,end)==i)); +end +maxRowsFromInd = max(rowsFromInd); +a = -999; +emptyRow = repmat(a, 1, ncols); +lessThanMax = find(rowsFromInd < maxRowsFromInd); +missingRows = maxRowsFromInd*nind - nrows; +data = [data; zeros(missingRows, ncols)]; +pointer = 1; +for ind=lessThanMax' %Käy läpi ne yksilöt, joilta puuttuu rivejä + miss = maxRowsFromInd-rowsFromInd(ind); % Tältä yksilöltä puuttuvien lkm. + for j=1:miss + rowToBeAdded = emptyRow; + rowToBeAdded(end) = ind; + data(nrows+pointer, :) = rowToBeAdded; + pointer = pointer+1; + end +end +data = sortrows(data, ncols); % Sorttaa yksilöiden mukaisesti +newData = data; +rowsFromInd = maxRowsFromInd; + +adjprior = zeros(max(noalle),nloci); +priorTerm = 0; +for j=1:nloci + adjprior(:,j) = [repmat(1/noalle(j), [noalle(j),1]) ; ones(max(noalle)-noalle(j),1)]; + priorTerm = priorTerm + noalle(j)*gammaln(1/noalle(j)); +end + +%-------------------------------------------------------------------- + + +function kunnossa = testaaGenePopData(tiedostonNimi) +% kunnossa == 0, jos data ei ole kelvollinen genePop data. +% Muussa tapauksessa kunnossa == 1. + +kunnossa = 0; +fid = fopen(tiedostonNimi); +line1 = fgetl(fid); %ensimmäinen rivi +line2 = fgetl(fid); %toinen rivi +line3 = fgetl(fid); %kolmas + +if (isequal(line1,-1) | isequal(line2,-1) | isequal(line3,-1)) + disp('Incorrect file format 1168'); fclose(fid); + return +end +if (testaaPop(line1)==1 | testaaPop(line2)==1) + disp('Incorrect file format 1172'); fclose(fid); + return +end +if testaaPop(line3)==1 + %2 rivi tällöin lokusrivi + nloci = rivinSisaltamienMjonojenLkm(line2); + line4 = fgetl(fid); + if isequal(line4,-1) + disp('Incorrect file format 1180'); fclose(fid); + return + end + if ~any(line4==',') + % Rivin neljä täytyy sisältää pilkku. + disp('Incorrect file format 1185'); fclose(fid); + return + end + pointer = 1; + while ~isequal(line4(pointer),',') %Tiedetään, että pysähtyy + pointer = pointer+1; + end + line4 = line4(pointer+1:end); %pilkun jälkeinen osa + nloci2 = rivinSisaltamienMjonojenLkm(line4); + if (nloci2~=nloci) + disp('Incorrect file format 1195'); fclose(fid); + return + end +else + line = fgetl(fid); + lineNumb = 4; + while (testaaPop(line)~=1 & ~isequal(line,-1)) + line = fgetl(fid); + lineNumb = lineNumb+1; + end + if isequal(line,-1) + disp('Incorrect file format 1206'); fclose(fid); + return + end + nloci = lineNumb-2; + line4 = fgetl(fid); %Eka rivi pop sanan jälkeen + if isequal(line4,-1) + disp('Incorrect file format 1212'); fclose(fid); + return + end + if ~any(line4==',') + % Rivin täytyy sisältää pilkku. + disp('Incorrect file format 1217'); fclose(fid); + return + end + pointer = 1; + while ~isequal(line4(pointer),',') %Tiedetään, että pysähtyy. + pointer = pointer+1; + end + + line4 = line4(pointer+1:end); %pilkun jälkeinen osa + nloci2 = rivinSisaltamienMjonojenLkm(line4); + if (nloci2~=nloci) + disp('Incorrect file format 1228'); fclose(fid); + return + end +end +kunnossa = 1; +fclose(fid); + +%------------------------------------------------------ + + +function [data, popnames] = lueGenePopData(tiedostonNimi) + +fid = fopen(tiedostonNimi); +line = fgetl(fid); %ensimmäinen rivi +line = fgetl(fid); %toinen rivi +count = rivinSisaltamienMjonojenLkm(line); + +line = fgetl(fid); +lokusRiveja = 1; +while (testaaPop(line)==0) + lokusRiveja = lokusRiveja+1; + line = fgetl(fid); +end + +if lokusRiveja>1 + nloci = lokusRiveja; +else + nloci = count; +end + +popnames = cell(10,2); +data = zeros(100, nloci+1); +nimienLkm=0; +ninds=0; +poimiNimi=1; +digitFormat = -1; +while line ~= -1 + line = fgetl(fid); + + if poimiNimi==1 + %Edellinen rivi oli 'pop' + nimienLkm = nimienLkm+1; + ninds = ninds+1; + if nimienLkm>size(popnames,1); + popnames = [popnames; cell(10,2)]; + end + nimi = lueNimi(line); + if digitFormat == -1 + digitFormat = selvitaDigitFormat(line); + divider = 10^digitFormat; + end + popnames{nimienLkm, 1} = {nimi}; %Näin se on greedyMix:issäkin?!? + popnames{nimienLkm, 2} = ninds; + poimiNimi=0; + + data = addAlleles(data, ninds, line, divider); + + elseif testaaPop(line) + poimiNimi = 1; + + elseif line ~= -1 + ninds = ninds+1; + data = addAlleles(data, ninds, line, divider); + end +end + +data = data(1:ninds*2,:); +popnames = popnames(1:nimienLkm,:); +fclose(fid); + + +%------------------------------------------------------- + +function nimi = lueNimi(line) +%Palauttaa line:n alusta sen osan, joka on ennen pilkkua. +n = 1; +merkki = line(n); +nimi = ''; +while ~isequal(merkki,',') + nimi = [nimi merkki]; + n = n+1; + merkki = line(n); +end + +%------------------------------------------------------- + +function df = selvitaDigitFormat(line) +% line on ensimmäinen pop-sanan jälkeinen rivi +% Genepop-formaatissa olevasta datasta. funktio selvittää +% rivin muodon perusteella, ovatko datan alleelit annettu +% 2 vai 3 numeron avulla. + +n = 1; +merkki = line(n); +while ~isequal(merkki,',') + n = n+1; + merkki = line(n); +end + +while ~any(merkki == '0123456789'); + n = n+1; + merkki = line(n); +end +numeroja = 0; +while any(merkki == '0123456789'); + numeroja = numeroja+1; + n = n+1; + merkki = line(n); +end + +df = numeroja/2; + + +%------------------------------------------------------ + + +function count = rivinSisaltamienMjonojenLkm(line) +% Palauttaa line:n sisältämien mjonojen lukumäärän. +% Mjonojen välissä täytyy olla välilyönti. +count = 0; +pit = length(line); +tila = 0; %0, jos odotetaan välilyöntejä, 1 jos odotetaan muita merkkejä +for i=1:pit + merkki = line(i); + if (isspace(merkki) & tila==0) + %Ei tehdä mitään. + elseif (isspace(merkki) & tila==1) + tila = 0; + elseif (~isspace(merkki) & tila==0) + tila = 1; + count = count+1; + elseif (~isspace(merkki) & tila==1) + %Ei tehdä mitään + end +end + +%------------------------------------------------------- + +function pal = testaaPop(rivi) +% pal=1, mikäli rivi alkaa jollain seuraavista +% kirjainyhdistelmistä: Pop, pop, POP. Kaikissa muissa +% tapauksissa pal=0. + +if length(rivi)<3 + pal = 0; + return +end +if (all(rivi(1:3)=='Pop') | ... + all(rivi(1:3)=='pop') | ... + all(rivi(1:3)=='POP')) + pal = 1; + return +else + pal = 0; + return +end + +%----------------------------------------------------------------------------------- + + +function [sumcounts, counts, logml] = ... + initialCounts(partition, data, npops, rowsFromInd, noalle) + +nloci=size(data,2); +ninds = size(data,1)/rowsFromInd; + +counts = zeros(max(noalle),nloci,npops); +sumcounts = zeros(npops,nloci); +for i=1:npops + for j=1:nloci + havainnotLokuksessa = find(partition==i & data(:,j)>=0); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(havainnotLokuksessa,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + +initializeGammaln(ninds, rowsFromInd, max(noalle)); + +logml = computeLogml(counts, sumcounts, noalle, data, rowsFromInd); + + +%----------------------------------------------------------------------- + + +function logml=computeLogml(counts, sumcounts, noalle, data, rowsFromInd) +nloci = size(counts,2); +npops = size(counts,3); +adjnoalle = zeros(max(noalle),nloci); +for j=1:nloci + adjnoalle(1:noalle(j),j)=noalle(j); + if (noalle(j)0)); +end diff --git a/matlab/admixture/learn_simple_partition.m b/matlab/admixture/learn_simple_partition.m new file mode 100644 index 0000000..e9b1bed --- /dev/null +++ b/matlab/admixture/learn_simple_partition.m @@ -0,0 +1,70 @@ +function part = learn_simple_partition(ordered_points, fii) +% Goes through all the ways to divide the points into two or three groups. +% Chooses the partition which obtains highest logml. + +npoints = length(ordered_points); + +% One cluster: +val = calculatePopLogml(ordered_points,fii); +bestValue = val; +best_type = 'single'; + +% Two clusters: +for i=1:npoints-1 + % The right endpoint of the first cluster. + val_1 = calculatePopLogml(ordered_points(1:i),fii); + val_2 = calculatePopLogml(ordered_points(i+1:end),fii); + total = val_1 + val_2; + if total>bestValue + bestValue = total; + best_type = 'double'; + best_i = i; + end +end + +% Three clusters: +for i=1:npoints-2 + for j=i+1:npoints-1 + val_1 = calculatePopLogml(ordered_points(1:i),fii); + val_2 = calculatePopLogml(ordered_points(i+1:j),fii); + val_3 = calculatePopLogml(ordered_points(j+1:end),fii); + total = val_1 + val_2 + val_3; + if total>bestValue + bestValue = total; + best_type = 'triple'; + best_i = i; + best_j = j; + end + end +end + +part = zeros(npoints,1); + +switch best_type + case 'single' + part = ones(npoints,1); + case 'double' + part(1:best_i) = 1; + part(best_i+1:end) = 2; + case 'triple' + part(1:best_i) = 1; + part(best_i+1:best_j) = 2; + part(best_j+1:end) = 3; +end + + +%------------------------------------------ + + +function val = calculatePopLogml(points,fii) +% Calculates fuzzy (log) marginal likelihood for a population of real +% values using estimate "fii" for the dispersion value, and Jeffreys prior +% for the mean parameter. + +n = length(points); +fuzzy_ones = sum(points); +fuzzy_zeros = n-fuzzy_ones; + +val = gammaln(1) - gammaln(1 + n/fii) ... + + gammaln(0.5 + fuzzy_ones/fii) + gammaln(0.5 + fuzzy_zeros/fii) ... + - gammaln(0.5) - gammaln(0.5); \ No newline at end of file diff --git a/matlab/admixture/linkage_admix.m b/matlab/admixture/linkage_admix.m new file mode 100644 index 0000000..8fb4eed --- /dev/null +++ b/matlab/admixture/linkage_admix.m @@ -0,0 +1,1034 @@ +function linkage_admix(tietue) + +global COUNTS; global PARTITION; global SUMCOUNTS; +clearGlobalVars; + +PARTITION = tietue.PARTITION; +COUNTS = tietue.COUNTS; +SUMCOUNTS = tietue.SUMCOUNTS; +rowsFromInd = tietue.rowsFromInd; +data = double(tietue.data); +npops = tietue.npops; +noalle = tietue.noalle; +switch tietue.mixtureType + case 'linear_mix' + linkage_model = 'linear'; + case 'codon_mix' + linkage_model = 'codon'; +end +if isfield(tietue,'gene_lengths') + gene_lengths = tietue.gene_lengths; +else + [filename, pathname] = uigetfile('*.txt', 'Load file with lengths of the genes (same order as in data).'); + gene_lengths = load([pathname filename]); +end + +if length(unique(PARTITION(find(PARTITION>0))))==1 + disp('Only one population in the input file'); + disp('No admixture detected'); + return +end + +answers = inputdlg({['Input the minimum size of a population that will'... + ' be taken into account when admixture is estimated.']},... + 'Input minimum population size',1,{'5'}); +if isempty(answers), return; end +alaRaja = str2double(answers{1,1}); +[npops] = poistaLiianPienet(npops, rowsFromInd, alaRaja); + +nloci = size(COUNTS,2); +ninds = size(data,1)/rowsFromInd; + +answers = inputdlg({'Input number of iterations'},'Input',1,{'50'}); +if isempty(answers), return; end +iterationCount = str2double(answers{1,1}); + +answers = inputdlg({'Input number of reference individuals from each population'},'Input',1,{'50'}); +if isempty(answers), nrefIndsInPop = 50; +else nrefIndsInPop = str2double(answers{1,1}); +end + +answers = inputdlg({'Input number of iterations for reference individuals'},'Input',1,{'10'}); +if isempty(answers), return; end +iterationCountRef = str2double(answers{1,1}); + +[cliq_data, sep_data, cliq_counts, component_mat] = createCliqData(data, gene_lengths, noalle, ... + linkage_model, rowsFromInd); + +% Repeat: simulate clique frequencies and estimate proportions. Save the +% average proportions in "proportionsIt". + +proportionsIt = zeros(ninds,npops); +for iterationNum = 1:iterationCount + disp(['Iter: ' num2str(iterationNum)]); + %allfreqs = simulateAllFreqs(noalle); + [cliq_freqs, sep_freqs] = simulateCliqFreqs(cliq_counts, noalle, component_mat, gene_lengths, linkage_model); + for ind=1:ninds + + %omaFreqs = computePersonalAllFreqs(ind, data, allfreqs, rowsFromInd); + [ownCliqFreqs, ownSepFreqs] = computePersonalCliqueFreqs(ind, ... + cliq_data, cliq_freqs, sep_data, sep_freqs, rowsFromInd, gene_lengths, linkage_model); + osuusTaulu = zeros(1,npops); + if PARTITION(ind)==0 + % Outlier individual + elseif PARTITION(ind)~=0 + if PARTITION(ind)>0 + osuusTaulu(PARTITION(ind)) = 1; + else + % Individuals who are not assigned to any cluster. + arvot = zeros(1,npops); + for q=1:npops + osuusTaulu = zeros(1,npops); + osuusTaulu(q) = 1; + arvot(q) = computeIndLikelihood(ownCliqFreqs, ownSepFreqs, osuusTaulu); + end + [iso_arvo, isoimman_indeksi] = max(arvot); + osuusTaulu = zeros(1,npops); + osuusTaulu(isoimman_indeksi) = 1; + PARTITION(ind)=isoimman_indeksi; + end + logml = computeIndLikelihood(ownCliqFreqs, ownSepFreqs, osuusTaulu); + + for osuus = [0.5 0.25 0.05 0.01] + [osuusTaulu, logml] = searchBest(osuus, osuusTaulu, ownCliqFreqs, ownSepFreqs, logml); + end + end + proportionsIt(ind,:) = proportionsIt(ind,:).*(iterationNum-1) + osuusTaulu; + proportionsIt(ind,:) = proportionsIt(ind,:)./iterationNum; + end +end + +disp(['Creating ' num2str(nrefIndsInPop) ' reference individuals from ']); +disp('each population.'); + +%allfreqs = simulateAllFreqs(noalle); % Simuloidaan alleelifrekvenssisetti +%allfreqs = computeAllFreqs2(noalle); % Koitetaan tällaista. +%refData = simulateIndividuals(nrefIndsInPop,rowsFromInd,allfreqs); + +exp_cliq_freqs = ... + computeExpectedFreqs(cliq_counts, noalle, component_mat, gene_lengths, linkage_model); +[ref_cliq_data, ref_sep_data] = ... + simulateLinkageIndividuals(nrefIndsInPop, rowsFromInd, exp_cliq_freqs, ... + gene_lengths, noalle, component_mat, linkage_model); +nrefInds = npops*nrefIndsInPop; + +disp(['Analysing the reference individuals in ' num2str(iterationCountRef) ' iterations.']); +refProportions = zeros(nrefInds,npops); +for iter = 1:iterationCountRef + disp(['Iter: ' num2str(iter)]); + %allfreqs = simulateAllFreqs(noalle); + [cliq_freqs, sep_freqs] = simulateCliqFreqs(cliq_counts, noalle, component_mat, gene_lengths, linkage_model); + for ind = 1:nrefInds + %omaFreqs = computePersonalAllFreqs(ind, refData, allfreqs, rowsFromInd); + [ownCliqFreqs, ownSepFreqs] = computePersonalCliqueFreqs(ind, ... + ref_cliq_data, cliq_freqs, ref_sep_data, sep_freqs, rowsFromInd, gene_lengths, linkage_model); + osuusTaulu = zeros(1,npops); + pop = ceil(ind/nrefIndsInPop); + osuusTaulu(pop)=1; + %logml = computeIndLogml(omaFreqs, osuusTaulu); + logml = computeIndLikelihood(ownCliqFreqs, ownSepFreqs, osuusTaulu); + for osuus = [0.5 0.25 0.05 0.01] + %[osuusTaulu, logml] = searchBest(osuus, osuusTaulu, omaFreqs, logml); + [osuusTaulu, logml] = searchBest(osuus, osuusTaulu, ownCliqFreqs, ownSepFreqs, logml); + end + refProportions(ind,:) = refProportions(ind,:).*(iter-1) + osuusTaulu; + refProportions(ind,:) = refProportions(ind,:)./iter; + end +end +refTaulu = zeros(npops,100); +for ind = 1:nrefInds + pop = ceil(ind/nrefIndsInPop); + omanOsuus = refProportions(ind,pop); + if round(omanOsuus*100)==0 + omanOsuus = 0.01; + end + if abs(omanOsuus)<1e-5 + omanOsuus = 0.01; + end + refTaulu(pop, round(omanOsuus*100)) = refTaulu(pop, round(omanOsuus*100))+1; +end + +% Rounding: +proportionsIt = proportionsIt.*100; proportionsIt = round(proportionsIt); +proportionsIt = proportionsIt./100; +for ind = 1:ninds + % if sum not equal to one, fix the largest part. + if (PARTITION(ind)>0) && (sum(proportionsIt(ind,:)) ~= 1) + [isoin,indeksi] = max(proportionsIt(ind,:)); + erotus = sum(proportionsIt(ind,:))-1; + proportionsIt(ind,indeksi) = isoin-erotus; + end +end + +% "p-value" for admixture +uskottavuus = zeros(ninds,1); +for ind = 1:ninds + pop = PARTITION(ind); + if pop==0 % an outlier + uskottavuus(ind)=1; + else + omanOsuus = proportionsIt(ind,pop); + if abs(omanOsuus)<1e-5 + omanOsuus = 0.01; + end + if round(omanOsuus*100)==0 + omanOsuus = 0.01; + end + refPienempia = sum(refTaulu(pop, 1:round(100*omanOsuus))); + uskottavuus(ind) = refPienempia / nrefIndsInPop; + end +end + +tulostaAdmixtureTiedot(proportionsIt, uskottavuus, alaRaja, iterationCount); + +%viewPartition(proportionsIt, popnames); + +[filename, pathname] = uiputfile('*.mat','Save admixture results as'); +if (filename == 0) & (pathname == 0) + % Cancel was pressed + return +end + +% copy 'baps4_output.baps' into the text file with the same name. +if exist('baps4_output.baps','file') + copyfile('baps4_output.baps',[pathname filename '.txt']) + delete('baps4_output.baps') +end + +tietue.proportionsIt = proportionsIt; +tietue.pvalue = uskottavuus; % Added by Jing +tietue.admixnpops = npops; +tietue.mixtureType = 'admix'; % added by jing on 09.09.2008 +% save([pathname filename], 'tietue'); +save([pathname filename], 'tietue','-v7.3'); % added by Lu Cheng, 08.06.2012 + + +%---------------------------------------------------------------------------- + + +function [npops] = poistaLiianPienet(npops, rowsFromInd, alaraja) +% Muokkaa tulokset muotoon, jossa outlier yksilät on +% poistettu. Tarkalleen ottaen poistaa ne populaatiot, +% joissa on vähemmän kuin 'alaraja':n verran yksiläit? + +global PARTITION; +global COUNTS; +global SUMCOUNTS; + +popSize=zeros(1,npops); +for i=1:npops + popSize(i)=length(find(PARTITION==i)); +end +miniPops = find(popSize0))); +for n=1:length(korit) + kori = korit(n); + yksilot = find(PARTITION==kori); + PARTITION(yksilot) = n; +end +COUNTS(:,:,miniPops) = []; +SUMCOUNTS(miniPops,:) = []; + +npops = npops-length(miniPops); + +%------------------------------------------------------------------------ + +function clearGlobalVars + +global COUNTS; COUNTS = []; +global SUMCOUNTS; SUMCOUNTS = []; +global PARTITION; PARTITION = []; +global POP_LOGML; POP_LOGML = []; + +%-------------------------------------------------------- + + +function allFreqs = computeAllFreqs2(noalle) +% Lisää a priori jokaista alleelia +% joka populaation joka lokukseen j 1/noalle(j) verran. + +global COUNTS; +global SUMCOUNTS; + +max_noalle = size(COUNTS,1); +nloci = size(COUNTS,2); +npops = size(COUNTS,3); + +sumCounts = SUMCOUNTS+ones(size(SUMCOUNTS)); +sumCounts = reshape(sumCounts', [1, nloci, npops]); +sumCounts = repmat(sumCounts, [max_noalle, 1 1]); + +prioriAlleelit = zeros(max_noalle,nloci); +for j=1:nloci + prioriAlleelit(1:noalle(j),j) = 1/noalle(j); +end +prioriAlleelit = repmat(prioriAlleelit, [1,1,npops]); +counts = COUNTS + prioriAlleelit; +allFreqs = counts./sumCounts; + +%-------------------------------------------------------------------------- + + +function refData = simulateIndividuals(n,rowsFromInd,allfreqs) +% simuloidaan n yksilää jokaisesta populaatiosta. ( + +npops = size(allfreqs,3); +nloci = size(allfreqs,2); +ninds = n*npops; + +refData = zeros(ninds*rowsFromInd,nloci); +counter = 1; % Pitää kirjaa mille riville seuraavaksi simuloidaan. + +for ind = 1:ninds + pop = ceil(ind/n); + for loc = 1:nloci + for k=0:rowsFromInd-1 + refData(counter+k,loc) = simuloiAlleeli(allfreqs,pop,loc); + end + end + counter = counter+rowsFromInd; +end + +function all = simuloiAlleeli(allfreqs,pop,loc) +% Simuloi populaation pop lokukseen loc alleelin. +freqs = allfreqs(:,loc,pop); +cumsumma = cumsum(freqs); +arvo = rand; +isommat = find(cumsumma>arvo); +all = min(isommat); + +%--------------------------------------------------------------------------- + + +function loggis = computeIndLikelihood(ownCliqFreqs, ownSepFreqs, proportions) +% Calculates the likelihood when the origins are defined by "proportions". + +aux = proportions * ownCliqFreqs; +aux = log(aux); +loggis = sum(aux); + +clear aux; + +aux2 = proportions * ownSepFreqs; +aux2 = log(aux2); +loggis = loggis - sum(aux2); + + +%-------------------------------------------------------------------------- + + +function osuusTaulu = suoritaMuutos(osuusTaulu, osuus, indeksi) +% Päivittää osuusTaulun muutoksen jälkeen. + +global COUNTS; +npops = size(COUNTS,3); + +i1 = rem(indeksi,npops); +if i1==0, i1 = npops; end; +i2 = ceil(indeksi / npops); + +osuusTaulu(i1) = osuusTaulu(i1)-osuus; +osuusTaulu(i2) = osuusTaulu(i2)+osuus; + + +%------------------------------------------------------------------------- + + +function [osuusTaulu, logml] = searchBest(osuus, osuusTaulu, ownCliqFreqs, ownSepFreqs, logml) + +ready = 0; +while ready ~= 1 + muutokset = calcChanges(osuus, osuusTaulu, ownCliqFreqs, ownSepFreqs, logml); + [maxMuutos, indeksi] = max(muutokset(1:end)); + if maxMuutos>0 + osuusTaulu = suoritaMuutos(osuusTaulu, osuus, indeksi); + logml = logml + maxMuutos; + else + ready = 1; + end +end + + + +%--------------------------------------------------------------------------- + + +function muutokset = calcChanges(osuus, osuusTaulu, ownCliqFreqs, ownSepFreqs, logml) +% Palauttaa npops*npops taulun, jonka alkio (i,j) kertoo, mik?on +% muutos logml:ss? mikäli populaatiosta i siirretään osuuden verran +% todennäkäisyysmassaa populaatioon j. Mikäli populaatiossa i ei ole +% mitään siirrettävää, on vastaavassa kohdassa rivi nollia. + +global COUNTS; +npops = size(COUNTS,3); + +notEmpty = find(osuusTaulu>0.005); +muutokset = zeros(npops); +empties = ~notEmpty; + +for i1=notEmpty + + osuusTaulu(i1) = osuusTaulu(i1)-osuus; + + for i2 = [1:i1-1 i1+1:npops] + osuusTaulu(i2) = osuusTaulu(i2)+osuus; + loggis = computeIndLikelihood(ownCliqFreqs, ownSepFreqs, osuusTaulu); + muutokset(i1,i2) = loggis-logml; + osuusTaulu(i2) = osuusTaulu(i2)-osuus; + end + + osuusTaulu(i1) = osuusTaulu(i1)+osuus; +end + + +%--------------------------------------------------------------- + + +function dispLine +disp('---------------------------------------------------'); + + +%-------------------------------------------------------------------------- + + +function tulostaAdmixtureTiedot(proportions, uskottavuus, alaRaja, niter) +h0 = findobj('Tag','filename1_text'); +inputf = get(h0,'String'); +h0 = findobj('Tag','filename2_text'); +outf = get(h0,'String'); clear h0; + +if length(outf)>0 + fid = fopen(outf,'a'); +else + fid = -1; + diary('baps4_output.baps'); % save in text anyway. +end + +ninds = length(uskottavuus); +npops = size(proportions,2); +disp(' '); +dispLine; +disp('RESULTS OF ADMIXTURE ANALYSIS BASED'); +disp('ON MIXTURE CLUSTERING OF INDIVIDUALS'); +disp(['Data file: ' inputf]); +disp(['Number of individuals: ' num2str(ninds)]); +disp(['Results based on ' num2str(niter) ' simulations from posterior allele frequencies.']); +disp(' '); +if fid ~= -1 + fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['--------------------------------------------']); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['RESULTS OF ADMIXTURE ANALYSIS BASED']); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['ON MIXTURE CLUSTERING OF INDIVIDUALS']); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['Data file: ' inputf]); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['Number of individuals: ' num2str(ninds)]); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['Results based on ' num2str(niter) ' simulations from posterior allele frequencies.']); fprintf(fid, '\n'); + fprintf(fid, '\n'); +end + +ekaRivi = blanks(6); +for pop = 1:npops + ekaRivi = [ekaRivi blanks(3-floor(log10(pop))) num2str(pop) blanks(2)]; +end +ekaRivi = [ekaRivi blanks(1) 'p']; % Added on 29.08.06 +disp(ekaRivi); +for ind = 1:ninds + rivi = [num2str(ind) ':' blanks(4-floor(log10(ind)))]; + if any(proportions(ind,:)>0) + for pop = 1:npops-1 + rivi = [rivi proportion2str(proportions(ind,pop)) blanks(2)]; + end + rivi = [rivi proportion2str(proportions(ind,npops)) ': ']; + rivi = [rivi ownNum2Str(uskottavuus(ind))]; + end + disp(rivi); + if fid ~= -1 + fprintf(fid,'%s \n',[rivi]); fprintf(fid,'\n'); + end +end +if fid ~= -1 + fclose(fid); +else + diary off +end + +%------------------------------------------------------ + +function str = proportion2str(prob) +%prob belongs to [0.00, 0.01, ... ,1]. +%str is a 4-mark presentation of proportion. + +if abs(prob)<1e-3 + str = '0.00'; +elseif abs(prob-1) < 1e-3; + str = '1.00'; +else + prob = round(100*prob); + if prob<10 + str = ['0.0' num2str(prob)]; + else + str = ['0.' num2str(prob)]; + end; +end; + +%------------------------------------------------------- + +function g=randga(a,b) +flag = 0; +if a>1 +c1 = a-1; c2 = (a-(1/(6*a)))/c1; c3 = 2/c1; c4 = c3+2; c5 = 1/sqrt(a); +U1=-1; +while flag == 0, +if a<=2.5, +U1=rand;U2=rand; +else +while ~(U1>0 & U1<1), +U1=rand;U2=rand; +U1 = U2 + c5*(1-1.86*U1); +end %while +end %if +W = c2*U2/U1; +if c3*U1+W+(1/W)<=c4, +flag = 1; +g = c1*W/b; +elseif c3*log(U1)-log(W)+W<1, +flag = 1; +g = c1*W/b; +else +U1=-1; +end %if +end %while flag +elseif a==1 +g=sum(-(1/b)*log(rand(a,1))); +else +while flag == 0, +U = rand(2,1); +if U(1)>exp(1)/(a+exp(1)), +g = -log(((a+exp(1))*(1-U(1)))/(a*exp(1))); +if U(2)<=g^(a-1), +flag = 1; +end %if +else +g = ((a+exp(1))*U(1)/((exp(1))^(1/a))); +if U(2)<=exp(-g), +flag = 1; +end %if +end %if +end %while flag +g=g/b; +end %if; + + +%------------------------------------------------- + +function svar=randdir(counts,nc) +% Käyttäesim randdir([10;30;60],3) + +svar=zeros(nc,1); +for i=1:nc + svar(i,1)=randga(counts(i,1),1); +end +svar=svar/sum(svar); + +%-------------------------------------------------- + +function waitALittle +A = rand(500); +gammaln(A); + + +%-------------------------------------------------- + +function [cliq_data, sep_data, cliq_counts, component_mat] = ... + createCliqData(data, gene_lengths, noalle, linkage_model, ... + rowsFromInd) +% cliq_data: cell array, each cell corresponds to one gene. Element (i,j) +% in cell k is the code of the allele combination in the j:th clique in +% gene k, for individual i. + +% sep_data: like cliq_data. i:th separator separates cliques i and i+1. + +% cliq_counts: cell array, each cell corresponds to one gene. Each cell is +% a 3-dimensional array, where the element (i,j,k) is the observed count of +% allele combination i, in clique j, in population k. + +% The coding of the allele combinations: If a clique of 3 sites has +% noalle values 3,2,4, then the allele combinations are given numbers in +% lexicographic order: 111, 112, 113, 114, 121, 122, ..., 324. + +%------------------------------------------------------------------------ + +% cliq_data on cell-array, jossa kukin solu vastaa yht?geeni? Alkio (i,j) +% solussa k merkitsee sen alleelikombinaation koodia, joka yksiläll?i +% havaitaan geenin k klikiss?numero j. + +% cliq_counts on cell-array, jossa myäs kukin solu vastaa yht?geeni? +% Kukin solu on kolmiulotteinen taulukko, jonka alkio (i,j,k) on +% populaatiossa k, ko geenin j:nness?klikiss?havaitun alleelikombinaation +% i lukumäär? + +% Alleelikombinaatioiden koodaus: Jos kolmen position klikiss?on (koko +% datassa) noalle:t 3,2,4, (eli ekassa positiossa alleelit 1-3, tokassa 1-2 +% ja kolmannessa alleelit 1-4), niin alleelikombinaatiot numeroidaan +% leksikograafisessa järjestyksess? 111, 112, 113, 114, 121, 122, ..., +% 324. + +%----------------------------------------------------------------------- + +global PARTITION; + +if sum(gene_lengths) ~= size(data,2) + disp('Error 155'); +end +if ~isa(data,'double') + data = double(data); % Required in matlab 6 +end + +ninds = size(data,1); +n_genes = length(gene_lengths); +if strcmp(linkage_model,'codon') + n_cliques = gene_lengths-2; % Number of cliques in each gene +else + n_cliques = gene_lengths-1; % Use linear model +end +max_noalle = zeros(n_genes,1); % Maximum "clique noalle" in each gene. + +component_mat = zeros(n_genes, max(gene_lengths)); +cum_length = cumsum(gene_lengths); +component_mat(1,1:gene_lengths(1))=1:gene_lengths(1); +for i = 2:n_genes + component_mat(i,1:gene_lengths(i)) = cum_length(i-1)+1:cum_length(i); +end + +for i = 1:n_genes + % What is the largest number of different values that are observed for + % some clique in this gene: + number = 0; + if n_cliques(i)<1 + % the gene is shorter than a normal clique. + if gene_lengths(i)==2 + % linkage_model must be 'codon' to end up here.. + positions = component_mat(i, [1 2]); + number = prod(noalle(positions)); + else + % gene_lengths(i) == 1 + positions = component_mat(i,1); + number = noalle(positions); + end + else + for j = 1:n_cliques(i) + if strcmp(linkage_model,'codon'), positions = component_mat(i , j:j+2); + else positions = component_mat(i, j:j+1); + end + + cand = prod(noalle(positions)); + if cand>number + number=cand; + end + end + end + max_noalle(i) = number; +end + +cliq_data = cell(n_genes,1); % An array for each gene. +% (i,j) is the combination which individual i has in clique j (in haploid case..). + +for i = 1:n_genes + if n_cliques(i)<1 + cliq_data{i} = zeros(ninds, 1); + if gene_lengths(i)==2 + % linkage_model must be 'codon' to end up here.. + positions = component_mat(i, [1 2]); + rows = data(:,positions); + observations = (rows(:,1)-1) * noalle(positions(2)) + rows(:,2); + else + positions = component_mat(i,1); + rows = data(:,positions); + observations = rows; + end + cliq_data{i}(:,1) = observations; + else + cliq_data{i} = zeros(ninds, n_cliques(i)); + for j = 1:n_cliques(i) + if strcmp(linkage_model,'codon') + positions = component_mat(i,j:j+2); + rows = data(:, positions); + observations = (rows(:,1)-1) * prod(noalle(positions(2:3))) + ... + (rows(:,2)-1) * noalle(positions(3)) + rows(:,3); + else + positions = component_mat(i,j:j+1); + rows = data(:,positions); + observations = (rows(:,1)-1) * noalle(positions(2)) + rows(:,2); + end + cliq_data{i}(:,j) = observations; + end + end +end + +cliq_counts = cell(n_genes,1); +% (i,j,k) is the count of combination i, in clique j, in population k. + +npops = length(unique(PARTITION)); +for i = 1:n_genes + cliq_counts{i} = zeros(max_noalle(i), max(1,n_cliques(i)), npops); + for j = 1:npops + partition = repmat(PARTITION', [rowsFromInd 1]); + partition = partition(:); % Partition for rows in the data (instead of individuals). + inds_now = find(partition==j); + ninds_now = length(inds_now); + data_now = cliq_data{i}(inds_now,:); + + for k = 1:max(n_cliques(i),1) + apu = zeros(ninds_now, max_noalle(i)); + apu(sub2ind([ninds_now max_noalle(i)],... + (1:ninds_now)', data_now(:,k)))=1; + cliq_counts{i}(:, k, j) = (sum(apu,1))'; + end + end +end + +sep_data = cell(n_genes,1); +n_separators = n_cliques-1; +for i = 1:n_genes + sep_data{i} = zeros(ninds, n_separators(i)); + for j = 1:n_separators(i) + if strcmp(linkage_model, 'codon') + positions = component_mat(i,j+1:j+2); + rows = data(:, positions); + observations = (rows(:,1)-1) * noalle(positions(2)) + rows(:,2); + else + positions = component_mat(i,j+1); + rows = data(:,positions); + observations = rows; + end + sep_data{i}(:,j) = observations; + end +end + + +%------------------------------------------------------ + + +function [cliq_freqs, sep_freqs] = simulateCliqFreqs(cliq_counts, noalle, component_mat, ... + gene_lengths, linkage_model) + +% cliq_freqs: cell-array. Element (i,j,k) in cell m is the frequence of +% combination i, in clique j of the m:th gene, in population k. + +% sep_freqs: like cliq_freqs, but for the separators. + +%------------------------------------------------------------------------ + +% cliq_freqs: cell-array, jossa on vastaavat dimensiot kuin cliq_counts:issa. +% solun m alkio (i,j,k) on geenin m, klikin j, kombinaation i, frekvenssi +% populaatiossa k. + +% sep_freqs: cell-array, kuten cl_freqs, mutta separaattoreille. + +%------------------------------------------------------------------------- + +global PARTITION; + +n_genes = length(cliq_counts); +if strcmp(linkage_model,'codon') + n_cliques = gene_lengths-2; % Number of cliques in each gene +else + n_cliques = gene_lengths-1; % Use linear model +end +npops = length(unique(PARTITION)); + +cliq_freqs = cell(n_genes,1); +sep_freqs = cell(n_genes,1); + +for i=1:n_genes + + cliq_freqs{i} = zeros(size(cliq_counts{i})); + + positions = component_mat(i,1:gene_lengths(i)); + + if n_cliques(i)<1 + % the gene is shorter than a normal clique. + if gene_lengths(i)==2 + % linkage_model must be 'codon' to end up here.. + cliq_noalle = noalle(positions(1)) .* noalle(positions(2)); + sep_noalle = []; + else + % gene_lengths(i) == 1 + cliq_noalle = noalle(positions(1)); + sep_noalle = []; + end + sep_freqs{i} = []; + else + if strcmp(linkage_model, 'codon') + cliq_noalle = noalle(positions(1:end-2)) .* noalle(positions(2:end-1)) .* ... + noalle(positions(3:end)); + sep_noalle = noalle(positions(2:end-2)) .* noalle(positions(3:end-1)); + else + cliq_noalle = noalle(positions(1:end-1)) .* noalle(positions(2:end)); + sep_noalle = noalle(positions(2:end-1)); + end + sep_freqs{i} = zeros(max(sep_noalle), n_cliques(i)-1, npops); + end + + % First clique: + prior = (1 / cliq_noalle(1)) * ones(cliq_noalle(1),1); + counts_now = repmat(prior, [1 1 npops]) + cliq_counts{i}(1:cliq_noalle(1),1,:); + for k = 1:npops + simul = randdir(counts_now(:,1,k), cliq_noalle(1)); + cliq_freqs{i}(1:cliq_noalle(1),1,k) = simul; + end + + for j=2:n_cliques(i) + % Obtain freqs for j-1:th separator by marginalization from j-1:th + % clique, and draw values for the frequencies of the j:th clique: + + aux = cliq_freqs{i}(1:cliq_noalle(j-1), j-1, :); % Freqs of the previous clique + aux = reshape(aux, [sep_noalle(j-1), noalle(positions(j-1)), npops]); + + % Freqs for separator by marginalization from the previous clique: + sep_freqs{i}(1:sep_noalle(j-1),j-1,:) = sum(aux,2); + + prior = (1 / cliq_noalle(j)) * ones(cliq_noalle(j),1); + counts_now = repmat(prior, [1 1 npops]) + cliq_counts{i}(1:cliq_noalle(j),j,:); + for k = 1:npops + % Simulate conditional frequencies: + for m = 1:sep_noalle(j-1) + if strcmp(linkage_model, 'codon') + values = (m-1)*noalle(positions(j+2))+1:m*noalle(positions(j+2)); + else + values = (m-1)*noalle(positions(j+1))+1:m*noalle(positions(j+1)); + end + simul = randdir(counts_now(values,1,k), length(values)); + cliq_freqs{i}(values,j,k) = simul * sep_freqs{i}(m,j-1,k); % MIETI TARKKAAN! + end + end + end +end + + +%-------------------------------------------------------------------- + + +function [ownCliqFreqs, ownSepFreqs] = computePersonalCliqueFreqs(... + ind, cl_data, cl_freqs, sep_data, sep_freqs, rowsFromInd, ... + gene_lengths, linkage_model) + +% ownCliqFreqs is (npops * (n_cliques*rowsFromInd)) table, where each column +% contains the frequencies of the corresponding clique_combination, in +% different populations. + +% ownSepFreqs is (npops * (n_seps*rowsFromInd)) table, like ownCliqFreqs. + +n_genes = length(gene_lengths); +if strcmp(linkage_model,'codon') + n_cliques = gene_lengths-2; + n_cliques = max([n_cliques ones(n_genes,1)], [], 2); % for genes shorter than clique +else + n_cliques = gene_lengths-1; % Use linear model. + n_cliques = max([n_cliques ones(n_genes,1)], [], 2); +end + +total_n_cliques = sum(n_cliques); +npops = size(cl_freqs{1},3); + +ownCliqFreqsXX = zeros(1, total_n_cliques*rowsFromInd, npops); + +pointer = 1; +for i = 1:n_genes + ind_data = cl_data{i}((ind-1)*rowsFromInd+1:ind*rowsFromInd , :); + for j = 1:n_cliques(i) % MUUTA! + for k = 1:rowsFromInd + code = ind_data(k,j); + ownCliqFreqsXX(1,pointer,:) = cl_freqs{i}(code,j,:); + pointer = pointer+1; + end + end +end + +ownCliqFreqs = (squeeze(ownCliqFreqsXX))'; + +n_separators = n_cliques-1; +total_n_separators = sum(n_separators); + +ownSepFreqsXX = zeros(1, total_n_separators*rowsFromInd, npops); + +pointer = 1; +for i = 1:n_genes + ind_data = sep_data{i}((ind-1)*rowsFromInd+1:ind*rowsFromInd , :); + for j = 1:n_separators(i) + for k = 1:rowsFromInd + code = ind_data(k,j); + ownSepFreqsXX(1,pointer,:) = sep_freqs{i}(code,j,:); + pointer = pointer+1; + end + end +end + +if (total_n_separators*rowsFromInd)==1 + ownSepFreqs = (squeeze(ownSepFreqsXX)); +else + ownSepFreqs = (squeeze(ownSepFreqsXX))'; +end + + +%------------------------------------------------------------------------- + + +function exp_cliq_freqs = computeExpectedFreqs(cliq_counts, ... + noalle, component_mat, gene_lengths, linkage_model) + +% Returns the expected values for the clique and separator frequencies in +% different populations. + +% exp_cliq_freqs: cell-array. Element (i,j,k) in cell m is the expected +% frequence of combination i, in clique j of the m:th gene, in +% population k. + +n_genes = length(gene_lengths); + +if strcmp(linkage_model, 'codon') + n_cliques = gene_lengths-2; +else + n_cliques = gene_lengths-1; % Linear model +end + +npops = size(cliq_counts{1},3); +exp_cliq_freqs = cell(n_genes,1); + +for i = 1:n_genes + + exp_cliq_freqs{i} = zeros(size(cliq_counts{i})); + positions = component_mat(i,1:gene_lengths(i)); + + if n_cliques(i)<1 + % the gene is shorter than a normal clique. + if gene_lengths(i)==2 + % linkage_model must be 'codon' to end up here.. + cliq_noalle = noalle(positions(1)) .* noalle(positions(2)); + else + % gene_lengths(i) == 1 + cliq_noalle = noalle(positions(1)); + end + else + if strcmp(linkage_model, 'codon') + cliq_noalle = noalle(positions(1:end-2)) .* noalle(positions(2:end-1)) .* ... + noalle(positions(3:end)); + else + cliq_noalle = noalle(positions(1:end-1)) .* noalle(positions(2:end)); + end + end + + for j = 1:max(1, n_cliques(i)) + prior = (1 / cliq_noalle(j)) * ones(cliq_noalle(j),1); + counts_now = repmat(prior, [1 1 npops]) + cliq_counts{i}(1:cliq_noalle(j),j,:); + exp_cliq_freqs{i}(1:cliq_noalle(j),j,:) = ... + counts_now ./ repmat(sum(counts_now,1), [cliq_noalle(j) 1 1]); + end +end + + +%---------------------------------------------------------- + + +function [ref_cliq_data, ref_sep_data] = ... + simulateLinkageIndividuals(n, rowsFromInd, exp_cliq_freqs, gene_lengths, ... + noalle, component_mat, linkage_model) + +% Simulates n individuals from each population using expected frequencies +% for cliques and separators. + +% ref_cliq_data: cell array, each cell corresponds to one gene. Elements +% ((i-1)*rowsFromInd+1:i*rowsFromInd, j) in cell k are the codes of the allele +% combinations in the j:th clique in gene k, for individual i. + +% ref_sep_data: like ref_cliq_data. i:th separator separates cliques i and i+1. + +n_genes = length(gene_lengths); +if strcmp(linkage_model,'codon') + n_cliques = gene_lengths-2; +else + n_cliques = gene_lengths-1; % Linear model +end +npops = size(exp_cliq_freqs{1},3); +ninds = n*npops; + +ref_cliq_data = cell(n_genes,1); +ref_sep_data = cell(n_genes,1); + +for i = 1:n_genes + ref_cliq_data{i} = zeros(ninds*rowsFromInd, max(n_cliques(i),1)); % Added: rowsFromInd + + positions = component_mat(i,1:gene_lengths(i)); + + if strcmp(linkage_model,'codon') + sep_noalle = noalle(positions(2:end-2)) .* noalle(positions(3:end-1)); + else + sep_noalle = noalle(positions(2:end-1)); + end + ref_sep_data{i} = zeros(ninds*rowsFromInd, n_cliques(i)-1); % Added: rowsFromInd + + for ind = 1:ninds + pop = ceil(ind/n); + + % First clique: + freqs = exp_cliq_freqs{i}(:,1,pop); + freqs = repmat(freqs, [1 rowsFromInd]); + codes = simulateCodes(freqs); + ref_cliq_data{i}((ind-1)*rowsFromInd+1:ind*rowsFromInd, 1) = codes; + + for j = 2:n_cliques(i) + previous_cliq = ref_cliq_data{i}((ind-1)*rowsFromInd+1:ind*rowsFromInd, j-1); + + % Value for j-1:th separator: + sep_codes = rem(previous_cliq, sep_noalle(j-1)); + sep_codes(find(sep_codes==0)) = sep_noalle(j-1); + ref_sep_data{i}((ind-1)*rowsFromInd+1:ind*rowsFromInd, j-1) = sep_codes; + + % Value for j:th clique: + if strcmp(linkage_model,'codon') + freqs = zeros(noalle(positions(j+2)),rowsFromInd); + values = zeros(noalle(positions(j+2)),rowsFromInd); + for k = 1:rowsFromInd + values(:,k) = ((sep_codes(k)-1)*noalle(positions(j+2))+1 : sep_codes(k)*noalle(positions(j+2)))'; + freqs(:,k) = exp_cliq_freqs{i}(values(:,k), j, pop); + end + freqs = freqs ./ repmat(sum(freqs,1), [noalle(positions(j+2)) 1]); + else + freqs = zeros(noalle(positions(j+1)),rowsFromInd); + values = zeros(noalle(positions(j+1)),rowsFromInd); + for k = 1:rowsFromInd + values(:,k) = ((sep_codes(k)-1)*noalle(positions(j+1))+1 : sep_codes(k)*noalle(positions(j+1)))'; + freqs(:,k) = exp_cliq_freqs{i}(values(:,k), j, pop); + end + freqs = freqs ./ repmat(sum(freqs,1), [noalle(positions(j+1)) 1]); + end + codes = simulateCodes(freqs); + codes = values(sub2ind(size(values),codes,(1:rowsFromInd)')); + ref_cliq_data{i}((ind-1)*rowsFromInd+1:ind*rowsFromInd, j) = codes; + end + end +end + +function codes = simulateCodes(freqs) +% Freqs is a table where each column is a distribution. The +% number of columns in freqs must be equal to rowsFromInd. +% A value is drawn from each distribution in different columns. The values +% are saved in codes, which is (rowsFromInd*1) table. + +[nrows, rowsFromInd] = size(freqs); +codes = nrows+1-sum(cumsum(freqs)>repmat(rand(1,rowsFromInd),[nrows 1]),1); +codes = codes'; \ No newline at end of file diff --git a/matlab/compileBaps6.m b/matlab/compileBaps6.m new file mode 100644 index 0000000..182d481 --- /dev/null +++ b/matlab/compileBaps6.m @@ -0,0 +1,6 @@ +if ~exist('BAPS_package','dir') + mkdir('BAPS_package'); +end + +mcc -m ./general/baps6.m -a ./admixture -a ./general -a ./graph -a ./independent -a ./linkage -a ./parallel -a ./spatial -d ./BAPS_package + diff --git a/matlab/general/baps4cbf.m b/matlab/general/baps4cbf.m new file mode 100644 index 0000000..3f67d5e --- /dev/null +++ b/matlab/general/baps4cbf.m @@ -0,0 +1,993 @@ +function baps4cbf(action) + +base = findobj('Tag','base_figure'); +%setWindowOnTop(base,'false') + +switch action + + case 'mix1_button' + greedyMix(-1); + + case 'mix2_button' + greedyPopMix; + + case 'trained_button' + trainedMix; + + case 'spatial_button' + spatialMixture; + showmethebaps; + + case 'spatial2_button' + spatialPopMixture; + showmethebaps; + + case 'linkage_button' + linkageMixture_speed; + %linkageMixture_ultraspeed; + showmethebaps; + + case 'admix1_button' + admix1(-1); + showmethebaps; + case 'admix2_button' + admix2; + + case 'compare_menu' + compare; + case 'compare_admix_menu' + compare_admix; + + case 'load_mixture_menu' + loadMixture; + + case 'load_admixture_menu' + loadAdmixture; + + case 'load_spatial_menu' + loadSpatial; + + case 'output_menu' + asetaOutputFile; + + case 'remove_outputfile_menu' + poistaOutputFile; + + case 'close_menu' + closeFile; + + case 'exit_menu' + h0 = findobj('Tag','base_figure'); delete(h0); + h0 = findobj('Tag','image_figure'); delete(h0); + + case 'loadfigure_menu' + loadFigure; + + case 'plot_coordinates_menu' + plotCoordinates; + + case 'partitio_menu' + viewPartitio; + + case 'admix_menu' + viewAdmixture; + + case 'likelihood_menu' + viewLoghood; + + case 'energy_menu' + viewEnergy; + + case 'geneflow_menu' + viewGeneflow; + + case 'voronoi_menu' + voronoiTessellation; + + case 'varmuus_menu' + localUncertainty; + + case 'changecolor_menu' + changeColor; + + case 'helpdoc' + openHelpDoc; + + case 'helponline' + openHelpHtml; + + case 'about' + openAboutWindow; + + case 'calculate_kl' + calculateDis('KL'); + + case 'calculate_nei' + calculateDis('Nei'); + + case 'calculate_hamming' + calculateDis('Hamming'); + + case 'upgma_menu' + viewPhylogeny('upgma'); + + case 'nj_menu' + viewPhylogeny('nj'); + + case 'mutationplot_menu' + mutationPlot(-1); + + case 'fixk_menu' + goToFixedK; + + case 'partitioncompare_menu' + goToPartitionCompare; +end + +return + +%-------------------------------------------------------------------------- +%KUVIEN LATAAMINEN +%-------------------------------------------------------------------------- + + +function loadFigure +waitALittle; +[filename,pathname] = uigetfile('*.fig','Load Figure'); +if (sum(filename)==0) || (sum(pathname)==0) + return; +end +fig_file_name = [pathname filename]; +open(fig_file_name); + +% ---------------------------- +% Old version +% ---------------------------- +% % Loads previously saved figure. +% +% waitALittle; +% [filename,pathname] = uigetfile('*.mat','Load Figure'); +% if (sum(filename)==0) || (sum(pathname)==0) +% return; +% end +% fig_file_name = [pathname filename]; +% %Figure file format must be *.mat. Ensure it: +% isMat = isTheFileMatFile(fig_file_name); +% if isMat == 0 +% msgbox(['Only figures that have been saved in BAPS can be loaded in BAPS. ' ... +% 'Those figures have extension ".mat".'],'Error', ... +% 'error'); +% return; +% end; +% struct_array = load([pathname filename]); +% if isfield(struct_array,'tiedot') %Matlab versio +% tiedot = struct_array.tiedot; +% if ~isfield(tiedot,'info') +% disp('Incorrect file format'); +% return +% end +% elseif isfield(struct_array,'info') %Mideva versio +% tiedot = struct_array; +% else +% disp('Incorrect file format'); +% return; +% end +% +% if isfield(tiedot, 'rows') +% rows = tiedot.rows; +% partition = tiedot.info; +% popnames = tiedot.popnames; +% viewPopMixPartition(partition, rows, popnames); +% else +% popnames = tiedot.popnames; +% info = tiedot.info; +% if (size(info,2)>1) +% %info on osuudet +% osuudet = info; +% viewPartition(osuudet,popnames); +% else +% info = round(info); +% partition = info; +% viewMixPartition(partition, popnames); +% end +% end + + +function isMat = isTheFileMatFile(filename) +%Checks that the file 'filename' is of the +%*.mat format. If so, isMat = 1. Otherwise, isMat = 0. + +len = length(filename); +if len < 5 + isMat = 0; return; +end; +ending = filename(end-3:end); +if isequal(ending,'.mat') + isMat = 1; +else + isMat = 0; +end; + +%-------------------------------------------------------------------- + +function asetaOutputFile +waitALittle; +[filename, pathname] = uiputfile('*.txt', 'Specify output file'); +if filename==0 + return +end + +h0 = findobj('Tag','filename2_text'); +set(h0,'String',[pathname filename]); + + +%--------------------------------------------------- + + +function poistaOutputFile +h0 = findobj('Tag','filename2_text'); +set(h0,'String',''); + + +%----------------------------------------------------- + +function plotCoordinates + +waitALittle; +[filename, pathname] = uigetfile('*.txt', 'Load Coordinate File'); +if filename==0 + return +end +X = load([pathname filename]); +if size(X,2)~=2 + disp('Incorrect file format'); + return +end + +waitALittle; +[filename, pathname] = uigetfile('*.mat', 'Load mixture clustering of individuals'); +%load([pathname filename],'c'); +struct_array = load([pathname filename]); +if isfield(struct_array,'c') %Matlab versio + c = struct_array.c; + if ~isfield(c,'PARTITION') || ~isfield(c,'rowsFromInd') + disp('Incorrect file format'); + return + end +elseif isfield(struct_array,'PARTITION') %Mideva versio + c = struct_array; + if ~isfield(c,'rowsFromInd') + disp('Incorrect file format'); + return + end +else + disp('Incorrect file format'); + return; +end +PARTITION = c.PARTITION; +if length(PARTITION) ~= size(X,1) + disp('Incorrect number of coordinate pairs.'); + return +end +% h0 = image_figure; +hold on; +for i=1:length(PARTITION) + if X(i,1)>=0 + plot(X(i,1),X(i,2),'Color',[.8 .8 .8]); + text(X(i,1),X(i,2),num2str(PARTITION(i))); + end +end +hold off; + + +%-------------------------------------------------------------------------- +%-------------------------------------------------------------------------- + +function voronoiTessellation +% Tekee tulostiedostosta voronoi tessellaation. +h0 = findobj('Tag','load_menu'); +c = get(h0,'UserData'); +% waitALittle; +% [filename, pathname] = uigetfile('*.mat', 'Load mixture clustering'); +% %load([pathname filename],'c'); +% struct_array = load([pathname filename]); +% if isfield(struct_array,'c') %Matlab versio +% c = struct_array.c; +% if ~isfield(c,'PARTITION') || ~isfield(c,'rowsFromInd') +% disp('Incorrect file format'); +% return +% end +% elseif isfield(struct_array,'PARTITION') %Mideva versio +% c = struct_array; +% if ~isfield(c,'rowsFromInd') +% disp('Incorrect file format'); +% return +% end +% else +% disp('Incorrect file format'); +% return; +% end +% +% if ~isfield(c, 'pointers') +% disp('Coordinate data missing from the result file'); +% return; +% end + +pointers = c.pointers; vorPoints = c.vorPoints; vorCells = c.vorCells; +coordinates = c.coordinates; names = c.names; + +if isequal(c.mixtureType, 'pop') || isequal(c.mixtureType, 'spatialPop') + PARTITION = c.groupPartition; +else + PARTITION = c.PARTITION; +end + +talle = questdlg(['Do you want names to be visible in the colored ' ... + 'Voronoi tessellation?'], 'Names visible?', 'Yes', 'No', 'Yes'); + +if isequal(talle,'No') + names = -1; +end +vorPlot(vorPoints, vorCells, PARTITION, pointers, coordinates, names); + +%-------------------------------------------------------------------------- + +function localUncertainty +% Tekee tulostiedostosta kolmiulotteisen lokaalia epävarmuutta kuvaavan +% kuvan. + +h0 = findobj('Tag','load_menu'); +c = get(h0,'UserData'); +% waitALittle; +% [filename, pathname] = uigetfile('*.mat', 'Load mixture clustering'); +% %load([pathname filename],'c'); +% struct_array = load([pathname filename]); +% if isfield(struct_array,'c') %Matlab versio +% c = struct_array.c; +% if ~isfield(c,'PARTITION') || ~isfield(c,'rowsFromInd') +% disp('Incorrect file format'); +% return +% end +% elseif isfield(struct_array,'PARTITION') %Mideva versio +% c = struct_array; +% if ~isfield(c,'rowsFromInd') +% disp('Incorrect file format'); +% return +% end +% else +% disp('Incorrect file format'); +% return; +% end +% +% if ~isfield(c, 'pointers') +% disp('Coordinate data missing from the result file'); +% return; +% end + +pointers = c.pointers; vorPoints = c.vorPoints; vorCells = c.vorCells; +coordinates = c.coordinates; names = c.names; +varmuus = c.varmuus; + +if isequal(c.mixtureType, 'pop') || isequal(c.mixtureType, 'spatialPop') + PARTITION = c.groupPartition; +else + PARTITION = c.PARTITION; +end + +talle = questdlg('Do you want names to be visible in the plot?', ... + 'Names visible?', 'Yes', 'No', 'Yes'); + +if isequal(talle,'No') + names = -1; +end + +plotVarmuus(vorPoints, vorCells, pointers, varmuus, coordinates, ... + PARTITION, names); + + + +%-------------------------------------------------------------------------- + +function viewPartitio + +h0 = findobj('Tag','load_menu'); +c = get(h0,'UserData'); + +% waitALittle; +% [filename, pathname] = uigetfile('*.mat', 'Load mixture clustering'); +% %load([pathname filename],'c'); +% if (sum(filename)==0) || (sum(pathname)==0) +% return; +% end +% struct_array = load([pathname filename]); +% if isfield(struct_array,'c') %Matlab versio +% c = struct_array.c; +% if ~isfield(c,'PARTITION') || ~isfield(c,'rowsFromInd') +% disp('Incorrect file format'); +% return +% end +% elseif isfield(struct_array,'PARTITION') %Mideva versio +% c = struct_array; +% if ~isfield(c,'rowsFromInd') +% disp('Incorrect file format'); +% return +% end +% else +% disp('Incorrect file format'); +% return; +% end + +if isequal(c.mixtureType, 'pop') || isequal(c.mixtureType, 'spatialPop') + viewPopMixPartition(c.groupPartition, c.rows, c.popnames); +elseif isequal(c.mixtureType, 'trained') + viewMixPartition(c.PARTITION, []); +else + viewMixPartition(c.PARTITION, c.popnames); +end + +function openHelpDoc +% s = fileparts(which('BAPS4manual.doc')); +% helpwin(s); +if strcmp(computer,'PCWIN') + % s = fileparts(which('baps4.exe')); + % winopen([s '\BAPS4manual.doc']); + winopen('BAPS5manual.doc'); +end + +function openHelpHtml +% web http://www.rni.helsinki.fi/~jic/bapspage.html +% web('http://www.rni.helsinki.fi/~jic/bapspage.html','-browser') +% web http://www.rni.helsinki.fi/~jic/bapspage.html -new; +if strcmp(computer,'PCWIN') + dos('start http://www.abo.fi/fak/mnf/mate/jc/software/baps.html'); % For the compiled version +end + +function openAboutWindow +info{1}=''; +info{2}='Bayesian Analysis of Population Structure (BAPS)'; +info{3}=''; +info{4}='Version 6.0'; +info{5}=''; +info{6}='Author: Jukka Corander, Pekka Marttinen, Jukka Siren, Jing Tang and Lu Cheng'; +info{7}=''; +info{8}='Copyright 2005-2012. All Rights Reserved'; +info{9}=''; +info{10}='Please view the reference page when using as part of research'; +info{11}='at http://www.helsinki.fi/bsg/software/BAPS'; +info{12} =''; +helpdlg(info,'About'); + +%-------------------------------------------------------------------------- + +function viewAdmixture + +% waitALittle; +% [filename, pathname] = uigetfile('*.mat', 'Load admixture results.'); +% if (sum(filename)==0) || (sum(pathname)==0) +% return; +% end +% %load([pathname filename],'c'); +% struct_array = load([pathname filename]); +disp('---------------------------------------------------'); +disp('Viewing the admixture result...'); +h0 = findobj('Tag','load_menu'); +c = get(h0,'UserData'); +h0 = findobj('Tag','filename1_text'); +filename = get(h0,'String'); + +% if isfield(struct_array,'c') %Matlab versio +% c = struct_array.c; +% if ~isfield(c,'proportionsIt') +% disp('*** ERROR: Incorrect file format'); +% return +% end +% elseif isfield(struct_array,'proportionsIt') %Mideva versio +% c = struct_array; +% if ~isfield(c,'proportionsIt') +% disp('*** ERROR: Incorrect file format'); +% return +% end +% else +% disp('*** ERROR: Incorrect file format'); +% return; +% end + +% mixtureType = c.mixtureType; +proportionsIt = c.proportionsIt; +popnames = c.popnames; partition = c.PARTITION; +mixtureType = c.mixtureType; +% if strcmp(mixtureType,'linkage_mix') % For bacterial clustering data +% if isempty(popnames) || size(popnames,1)==size(partition,1) +%if strcmp(mixtureType, 'admix') + if isempty(popnames) + ninds = size(partition,1); + popnames=cell(ninds,2); + for ind=1:ninds + popnames{ind,1}=cellstr(num2str(ind)); + end + popnames(:,2)=num2cell((1:ninds)'); + end + + npops = c.npops; % all the clusters including outliers + admixnpops = c.admixnpops; + + if ~isfield(c,'pvalue') % compatiable with old data + disp('*** WARNING: pvalue is not found in the admixture result.'); + disp('*** WARNING: Old admixture file.'); + pvalue = ones(size(partition,1),1); + else + pvalue = c.pvalue; + end + view_admixture(proportionsIt,npops,admixnpops, ... + popnames,partition,pvalue,filename); +%else +% disp('*** ERROR: incorrect admixture data.'); + % put which variable as the input? + % admixnpops = c.admixnpops; +% npops = c.npops; +% talle = questdlg(['Do you want individual names to be visible in the admixture ' ... +% 'result graphics?'], 'Names visible?', 'Yes', 'No', 'Yes'); +% if isequal(talle,'No') +% viewPartition2(proportionsIt, [], npops, partition, filename); +% else +% viewPartition2(proportionsIt, popnames, npops, partition, filename); +% end +% end + +%-------------------------------------------------------------------------- +function viewLoghood + view_loglikelihood; + +function viewEnergy + view_energy; + %-------------------------------------------------------------------------- + function viewGeneflow + view_geneflow; + +%-------------------------------------------------------------------------- +function changeColor() + h0 = findobj('Tag','base_figure'); + c = uisetcolor(h0,'Change color'); + h1 = findobj('Tag','datafile_text'); + h2 = findobj('Tag','outputfile_text'); + h3 = findobj('Tag','filename1_text'); + h4 = findobj('Tag','filename2_text'); + set(h1,'BackGroundColor',c); + set(h2,'BackGroundColor',c); + set(h3,'BackGroundColor',c); + set(h4,'BackGroundColor',c); + drawnow; +%----------------------------------------------------------------------- +function showmethebaps() +h0 = findobj('Tag','base_figure'); +%setWindowOnTop(h0,'true') +goToDefault +h0 = findobj('Tag','load_menu'); +set(h0,'UserData',[]); +%setWindowOnTop(h0,'false') + + +%----------------------------------------------------------------------- +function loadMixture + +waitALittle; +[filename, pathname] = uigetfile('*.mat', 'Load mixture result'); +%load([pathname filename],'c'); +if (sum(filename)==0) || (sum(pathname)==0) + return; +end +disp('---------------------------------------------------'); +disp('In loading the mixture result...'); +struct_array = load([pathname filename]); +if isfield(struct_array,'c') %Matlab versio + c = struct_array.c; + if ~isfield(c,'PARTITION') || ~isfield(c,'rowsFromInd') + disp('*** ERROR: Incorrect file format'); + return + end +elseif isfield(struct_array,'PARTITION') %Mideva versio + c = struct_array; + if ~isfield(c,'rowsFromInd') + disp('*** ERROR: Incorrect file format'); + return + end +else + disp('*** ERROR: Incorrect file format'); + return; +end +% Save gathered information to 'mixture_menu's UserData: +h0 = findobj('Tag','load_menu'); +set(h0,'UserData',c); +clear c; +%Set the name of the datafile to screen. +h1 = findobj('Tag','filename1_text'); +if exist('pathname') + filename = [pathname filename]; +end +set(h1,'String',filename); +h1 = findobj('Tag','datafile_text'); +set(h1,'String','Mixture result:'); +disp('Mixture result loaded.'); +goToMixtureAnalysis + +%-------------------------------------------------------------------------- +function goToMixtureAnalysis +set(findobj('Tag','graph_menu'), 'Enable','on'); +set(findobj('Tag','partitio_menu'), 'Enable','on'); +set(findobj('Tag','likelihood_menu'), 'Enable','on'); +set(findobj('Tag','energy_menu'), 'Enable','on'); +set(findobj('Tag','distances_menu'), 'Enable','on'); +set(findobj('Tag','kl_menu'), 'Enable','on'); +set(findobj('Tag','nei_menu'), 'Enable','on'); +set(findobj('Tag','close_menu'), 'Enable','on'); +set(findobj('Tag','phylogeny_menu'), 'Enable','on'); +set(findobj('Tag','upgma_menu'), 'Enable','on'); +set(findobj('Tag','nj_menu'), 'Enable','on'); +set(findobj('Tag','geneflow_menu'), 'Enable','off'); +set(findobj('Tag','admix_menu'), 'Enable','off'); +set(findobj('Tag','mutationplot_menu'), 'Enable','on'); + + + +%-------------------------------------------------------------------------- +function goToDefault +set(findobj('Tag','graph_menu'), 'Enable','off'); +set(findobj('Tag','distances_menu'), 'Enable','off'); + + + + +%-------------------------------------------------------------------------- +function loadSpatial +% Tekee tulostiedostosta voronoi tessellaation. + +waitALittle; +[filename, pathname] = uigetfile('*.mat', 'Load spatial mixture/admixture clustering'); +%load([pathname filename],'c'); +if (sum(filename)==0) || (sum(pathname)==0) + return; +end +struct_array = load([pathname filename]); +disp('---------------------------------------------------'); +disp('In loading the spatial mixture/admixture result...'); +if isfield(struct_array,'c') %Matlab versio + c = struct_array.c; + if ~isfield(c,'PARTITION') || ~isfield(c,'mixtureType') + disp('*** ERROR: Incorrect file format'); + return + end + if ~strcmp(c.mixtureType,'spatial') && ~strcmp(c.mixtureType,'spatialPop') + disp('*** ERROR: Incorrect file format'); + return + end +elseif isfield(struct_array,'PARTITION') %Mideva versio + c = struct_array; + if ~isfield(c,'rowsFromInd') + disp('*** ERROR: Incorrect file format'); + return + end +else + disp('*** ERROR: Incorrect file format'); + return; +end + +if ~isfield(c, 'pointers') + disp('*** ERROR: Coordinate data missing from the result file'); + return; +end + +% Save gathered information to 'spatialmixture_menu's UserData: +h0 = findobj('Tag','load_menu'); +set(h0,'UserData',c); + +%Set the name of the datafile to screen. +h1 = findobj('Tag','filename1_text'); +if exist('pathname') + filename = [pathname filename]; +end +set(h1,'String',filename); +h1 = findobj('Tag','datafile_text'); +if isfield(c,'admixnpops') + set(h1,'String','Spatial AdMixture Result:'); + disp('Spatial admixture result loaded.'); + goToSpatialAdMixtureAnalysis +else + set(h1,'String','Spatial Mixture Result:'); + disp('Spatial mixture result loaded.'); + goToSpatialMixtureAnalysis +end +clear c; + + +%-------------------------------------------------------------------------- +function goToSpatialMixtureAnalysis +set(findobj('Tag','graph_menu'), 'Enable','on'); +set(findobj('Tag','plot_coordinates_menu'), 'Enable','on'); +set(findobj('Tag','voronoi_menu'), 'Enable','on'); +set(findobj('Tag','varmuus_menu'), 'Enable','on'); +set(findobj('Tag','distances_menu'), 'Enable','on'); +set(findobj('Tag','kl_menu'), 'Enable','on'); +set(findobj('Tag','nei_menu'), 'Enable','on'); +set(findobj('Tag','close_menu'), 'Enable','on'); +set(findobj('Tag','geneflow_menu'), 'Enable','off'); +set(findobj('Tag','admix_menu'), 'Enable','off'); +set(findobj('Tag','likelihood_menu'), 'Enable','on'); +set(findobj('Tag','partitio_menu'), 'Enable','off'); + +%-------------------------------------------------------------------------- +function goToSpatialAdMixtureAnalysis +set(findobj('Tag','graph_menu'), 'Enable','on'); +set(findobj('Tag','plot_coordinates_menu'), 'Enable','on'); +set(findobj('Tag','voronoi_menu'), 'Enable','on'); +set(findobj('Tag','varmuus_menu'), 'Enable','on'); +set(findobj('Tag','distances_menu'), 'Enable','on'); +set(findobj('Tag','kl_menu'), 'Enable','on'); +set(findobj('Tag','nei_menu'), 'Enable','on'); +set(findobj('Tag','close_menu'), 'Enable','on'); +set(findobj('Tag','geneflow_menu'), 'Enable','off'); +set(findobj('Tag','admix_menu'), 'Enable','on'); +set(findobj('Tag','likelihood_menu'), 'Enable','on'); +set(findobj('Tag','partitio_menu'), 'Enable','off'); +%-------------------------------------------------------------------------- +function closeFile +h0 = findobj('Tag','load_menu'); +set(h0,'UserData',[]); +h0 = findobj('Tag','datafile_text'); +set(h0,'String','Data File:'); +h0 = findobj('Tag','filename1_text'); +set(h0,'String',''); + +set(findobj('Tag','close_menu'), 'Enable','off'); +set(findobj('Tag','graph_menu'), 'Enable','off'); +set(findobj('Tag','plot_coordinates_menu'), 'Enable','off'); +set(findobj('Tag','partitio_menu'), 'Enable','off'); +set(findobj('Tag','likelihood_menu'), 'Enable','off'); +set(findobj('Tag','admix_menu'), 'Enable','off'); +set(findobj('Tag','geneflow_menu'), 'Enable','off'); +set(findobj('Tag','voronoi_menu'), 'Enable','off'); +set(findobj('Tag','varmuus_menu'), 'Enable','off'); + +set(findobj('Tag','distances_menu'), 'Enable','off'); +set(findobj('Tag','kl_menu'), 'Enable','off'); +set(findobj('Tag','nei_menu'), 'Enable','off'); + + +%----------------------------------------------------------------------- +function loadAdmixture +waitALittle; +[filename, pathname] = uigetfile('*.mat', 'Load admixture results.'); +if (sum(filename)==0) || (sum(pathname)==0) + return; +end +%load([pathname filename],'c'); +disp('---------------------------------------------------'); +disp('In loading the admixture result...'); +struct_array = load([pathname filename]); + +if isfield(struct_array,'c') %Matlab versio + c = struct_array.c; + if ~isfield(c,'proportionsIt') + disp('*** ERROR: Incorrect file format'); + return + end +elseif isfield(struct_array,'proportionsIt') %Mideva versio + c = struct_array; + if ~isfield(c,'proportionsIt') + disp('*** ERROR: Incorrect file format'); + return + end +elseif isfield(struct_array,'tietue') + c = struct_array.tietue; + if ~isfield(c,'proportionsIt') + disp('*** ERROR: Incorrect file format'); + return + end +else + disp('*** ERROR: Incorrect file format'); + return; +end + +% Save gathered information to 'mixture_menu's UserData: +h0 = findobj('Tag','load_menu'); +set(h0,'UserData',c); +clear c; +%Set the name of the datafile to screen. +h1 = findobj('Tag','filename1_text'); +if exist('pathname') + filename = [pathname filename]; +end +set(h1,'String',filename); +h1 = findobj('Tag','datafile_text'); +set(h1,'String','Admixture result:'); +disp('Admixture result loaded.'); +goToAdmixtureAnalysis + +%-------------------------------------------------------------------------- +function goToAdmixtureAnalysis +set(findobj('Tag','graph_menu'), 'Enable','on'); +set(findobj('Tag','admix_menu'), 'Enable','on'); +set(findobj('Tag','geneflow_menu'), 'Enable','on'); +% set(findobj('Tag','distances_menu'), 'Enable','on'); +% set(findobj('Tag','kl_menu'), 'Enable','on'); +% set(findobj('Tag','nei_menu'), 'Enable','on'); +set(findobj('Tag','close_menu'), 'Enable','on'); +% set(findobj('Tag','likelihood_menu'), 'Enable','on'); + + +%-------------------------------------------------------------------------- +function calculateDis(type) +if exist('baps4_output.baps','file') + delete('baps4_output.baps') +else + diary('baps4_output.baps') +end + +h0 = findobj('Tag','load_menu'); +c = get(h0,'UserData'); +npops = c.npops; +COUNTS = c.COUNTS; +adjprior = c.adjprior; +data = c.data; +partition = c.PARTITION; +clear c; + +if npops > 1 + dist_mat = zeros(npops, npops); + maxnoalle = size(COUNTS,1); + nloci = size(COUNTS,2); + d = zeros(maxnoalle, nloci, npops); + + + switch type + case 'KL' + prior = adjprior; + prior(find(prior==1))=0; + nollia = find(all(prior==0)); %Lokukset, joissa oli havaittu vain yht?alleelia. + prior(1,nollia)=1; + for pop1 = 1:npops + d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))+prior) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))+prior),maxnoalle,1); + %dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1); + end + + ekarivi = num2str(npops); + disp('--------------------------------------'); + disp('KL-divergence matrix in PHYLIP format:'); + disp('--------------------------------------'); + disp(ekarivi); + for pop1 = 1:npops + % rivi = [blanks(2-floor(log10(pop1))) num2str(pop1) ' ']; + for pop2 = 1:pop1-1 + dist1 = d(:,:,pop1); dist2 = d(:,:,pop2); + div12 = sum(sum(dist1.*log2((dist1+10^-10) ./ (dist2+10^-10))))/nloci; + div21 = sum(sum(dist2.*log2((dist2+10^-10) ./ (dist1+10^-10))))/nloci; + div = (div12+div21)/2; + dist_mat(pop1,pop2) = div; + end + end + + case 'Nei' + for pop1 = 1:npops + d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))),maxnoalle,1); + %dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1); + end + + ekarivi = num2str(npops); + disp('--------------------------------------'); + disp('Nei-divergence matrix in PHYLIP format:'); + disp('--------------------------------------'); + disp(ekarivi); + for pop1 = 1:npops + % rivi = [blanks(2-floor(log10(pop1))) num2str(pop1) ' ']; + for pop2 = 1:pop1-1 + dist1 = d(:,:,pop1); dist2 = d(:,:,pop2); + div1 = sum(sum(dist1.*dist2)); + div2 = sqrt(sum(sum(dist1.^2)))*sqrt(sum(sum(dist2.^2))); + div = -log(div1/div2); + dist_mat(pop1,pop2) = div; + end + end + case 'Hamming' + ekarivi = num2str(npops); + disp('--------------------------------------'); + disp('Hamming distance matrix in PHYLIP format:'); + disp('--------------------------------------'); + disp(ekarivi); + for pop1 = 1:npops + for pop2 = 1:pop1-1 + dist_mat(pop1,pop2) = hamming_dist(data(logical(partition==pop1),[1:end-1]),... + data(logical(partition==pop2),[1:end-1])); + end + end + end + +end + +dist_mat = dist_mat + dist_mat'; % make it symmetric +for pop1 = 1:npops + rivi = ['Cluster_' num2str(pop1) ' ']; + for pop2 = 1:npops + rivi = [rivi kldiv2str(dist_mat(pop1,pop2)) ' ']; + end + disp(rivi); +end +diary off + +% --------------------------------------------------------------------- +% Save the result. +% Jing - 26.12.2005 +talle = questdlg(['Do you want to save the distance matrix in PHYLIP format? '], ... + 'Save distance matrix?','Yes','No','Yes'); +if isequal(talle,'Yes') + %%%waitALittle; + [filename, pathname] = uiputfile('*.txt','Save results as'); + + if (sum(filename)==0) || (sum(pathname)==0) + % Cancel was pressed + return + else % copy 'baps4_output.baps' into the text file with the same name. + if exist('baps4_output.baps','file') + copyfile('baps4_output.baps',[pathname filename]) + delete('baps4_output.baps') + end + end +else + delete('baps4_output.baps') +end + +% ------------------------------------------------------------------------- +function mjono = kldiv2str(div) +mjono = ' '; +if abs(div)<100 + %Ei tarvita e-muotoa + mjono(6) = num2str(rem(floor(div*1000),10)); + mjono(5) = num2str(rem(floor(div*100),10)); + mjono(4) = num2str(rem(floor(div*10),10)); + mjono(3) = '.'; + mjono(2) = num2str(rem(floor(div),10)); + arvo = rem(floor(div/10),10); + if arvo>0 + mjono(1) = num2str(arvo); + end + +else + suurinYks = floor(log10(div)); + mjono(6) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(div),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(div),suurinYks); +end + +% ------------------------------------------------------------------------- +function dist = hamming_dist(data1,data2) +[length1,nloci] = size(data1); +length2 = size(data2,1); +dist1 = 0; +for i = 1:length1 + dist2 = 0; + for j = 1:length2 + dist2 = dist2 + sum(data1(i,:)~=data2(j,:))/nloci; + end + dist1 = dist1 + dist2/length2; +end +dist = dist1/length1; +%-------------------------------------------------------------------------- +function digit = palautaYks(num,yks) +% palauttaa luvun num 10^yks termin kertoimen +% string:in? +% yks täytyy olla kokonaisluku, joka on +% vähintään -1:n suuruinen. Pienemmill? +% luvuilla tapahtuu jokin pyöristysvirhe. + +if yks>=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); + + + + diff --git a/matlab/general/baps6.m b/matlab/general/baps6.m new file mode 100644 index 0000000..7586873 --- /dev/null +++ b/matlab/general/baps6.m @@ -0,0 +1,377 @@ +function baps5(varargin) +% This is the machine-generated representation of a Handle Graphics object +% and its children. Note that handle values may change when these objects +% are re-created. This may cause problems with any callbacks written to +% depend on the value of the handle at the time the object was saved. +% +% To reopen this object, just type the name of the M-file at the MATLAB +% prompt. The M-file and its associated MAT-file must be on your path. + +% NEW: script version was added. 09.07.2007 + +% Script function has been added by Lu Cheng, 11.03.2010 +% The script function is designed for semi-supervised clustering in the +% trained clustering module, it adopts a codon linkage model for the DNA +% sequence data +% The calling command: baps5 -semi script_file + +global PARAMETERS; PARAMETERS = []; +global SCRIPT_MODE; SCRIPT_MODE = []; + +if nargin == 1 % script version + readScript(varargin{1}); + return +else if nargin==2 + %----------------added by Lu Cheng, 11.03.2010 + if isequal(varargin{1},'-semi'); + script_file = varargin{2}; + PARAMETERS = semiReadScript(script_file); + SCRIPT_MODE = true; + trainedMix + SCRIPT_MODE = []; + PARAMETERS = []; + else + disp(cat(2,'Unknown option: ',varargin{1})); + end + return + %--------------------------------------------- + end +end + +% load baps4 + +% Base frame +h0 = figure('Color',[0.8 0.8 0.8], ... + 'MenuBar','none', ... + 'Position',[364 175 750 500], ... + 'Resize','on', ... + 'Tag','base_figure',... + 'Name','Bayesian Analysis of Population Structure',... + 'NumberTitle','off',... + 'Color',[.7 .9 .7]); + +% File menu +h1 = uimenu('Parent',h0, ... + 'Label','File', ... + 'Tag','file_menu'); +h2 = uimenu('Parent',h1, ... + 'Label','Load Result', ... + 'Tag','load_menu'); +h3 = uimenu('Parent',h2, ... + 'Callback','baps4cbf load_mixture_menu', ... + 'Label','Mixture result', ... + 'Tag','mixture_menu'); +h3 = uimenu('Parent',h2, ... + 'Callback','baps4cbf load_spatial_menu', ... + 'Label','Spatial mixture result', ... + 'Tag','spatialmixture_menu'); +h3 = uimenu('Parent',h2, ... + 'Callback','baps4cbf load_admixture_menu', ... + 'Label','Admixture result', ... + 'Tag','admixture_menu'); +h2 = uimenu('Parent',h1, ... + 'Enable','on',... + 'Label','Compare Results', ... + 'Tag','compare_menu'); +h3 = uimenu('Parent',h2, ... + 'Callback','baps4cbf compare_menu', ... + 'Label','Mixture results', ... + 'Tag','compare_menu'); +h3 = uimenu('Parent',h2, ... + 'Callback','baps4cbf compare_admix_menu', ... + 'Label','AdMixture results', ... + 'Tag','compare_admix_menu'); +h2 = uimenu('Parent',h1, ... + 'Enable','off',... + 'Callback','baps4cbf close_menu', ... + 'Label','Close Results', ... + 'Tag','close_menu'); +h2 = uimenu('Parent',h1, ... + 'Label','Output File', ... + 'Tag','output_menu'); +h3 = uimenu('Parent',h2, ... + 'Callback','baps4cbf output_menu', ... + 'Label','Set', ... + 'Tag','set_menu'); +h3 = uimenu('Parent',h2, ... + 'Callback','baps4cbf remove_outputfile_menu', ... + 'Label','Remove', ... + 'Tag','remove_menu'); +h2 = uimenu('Parent',h1, ... + 'Callback','baps4cbf exit_menu', ... + 'Label','Exit', ... + 'Tag','exit_menu'); + +% Distances menu +h1 = uimenu('Parent',h0, ... + 'Enable','off',... + 'Label','Distances', ... + 'Tag','distances_menu'); +h2 = uimenu('Parent',h1, ... + 'Callback','baps4cbf calculate_kl', ... + 'Label','KL distance', ... + 'Tag','kl_menu'); +h2 = uimenu('Parent',h1, ... + 'Callback','baps4cbf calculate_nei', ... + 'Label','Nei distance', ... + 'Tag','nei_menu'); +h2 = uimenu('Parent',h1, ... + 'Callback','baps4cbf calculate_hamming', ... + 'Label','Hamming distance', ... + 'Tag','hamming_menu'); + +% Graph menu +h1 = uimenu('Parent',h0, ... + 'Enable','off', ... + 'Label','Graph', ... + 'Tag','graph_menu'); +h2 = uimenu('Parent',h1, ... + 'Enable','off', ... + 'Callback','baps4cbf plot_coordinates_menu', ... + 'Label','Plot Coordinates', ... + 'Tag','plot_coordinates_menu'); +h2 = uimenu('Parent',h1, ... + 'Enable','off', ... + 'Callback','baps4cbf partitio_menu', ... + 'Label','View Partition', ... + 'Tag','partitio_menu'); +h2 = uimenu('Parent',h1, ... + 'Enable','off', ... + 'Callback','baps4cbf admix_menu', ... + 'Label','View Admixture Results', ... + 'Tag','admix_menu'); +h2 = uimenu('Parent',h1, ... + 'Enable','off', ... + 'Callback','baps4cbf likelihood_menu', ... + 'Label','Changes of Log Likelihood', ... + 'Tag','likelihood_menu'); +h2 = uimenu('Parent',h1, ... + 'Enable','off', ... + 'Callback','baps4cbf energy_menu', ... + 'Label','Energy landscape', ... + 'Tag','energy_menu'); +h2 = uimenu('Parent',h1, ... + 'Enable','off', ... + 'Callback','baps4cbf geneflow_menu', ... + 'Label','Plot Gene Flow', ... + 'Tag','geneflow_menu'); +h2 = uimenu('Parent',h1, ... + 'Enable','off', ... + 'Callback','baps4cbf voronoi_menu', ... + 'Label','Voronoi Tessellation', ... + 'Tag','voronoi_menu'); +h2 = uimenu('Parent',h1, ... + 'Enable','off', ... + 'Callback','baps4cbf varmuus_menu', ... + 'Label','Local Uncertainty', ... + 'Tag','varmuus_menu'); +h2 = uimenu('Parent',h1, ... + 'Enable','off', ... + 'Callback','baps4cbf phylogeny_menu', ... + 'Label','Phylogeny', ... + 'Tag','phylogeny_menu'); +h3 = uimenu('Parent',h2, ... + 'Enable','off', ... + 'Callback','baps4cbf upgma_menu',... + 'Label', 'UPGMA', ... + 'Tag','upgma_menu'); +h3 = uimenu('Parent',h2, ... + 'Enable','off', ... + 'Callback','baps4cbf nj_menu',... + 'Label', 'Neighbor-Joining', ... + 'Tag','nj_menu'); +h2 = uimenu('Parent',h1, ... + 'Enable','off', ... + 'Callback','baps4cbf mutationplot_menu', ... + 'Label','Mutation Plot', ... + 'Tag','mutationplot_menu'); + +% Tools menu +h1 = uimenu('Parent',h0, ... + 'Label','Tools', ... + 'Tag','tools_menu'); +h2 = uimenu('Parent',h1, ... + 'Callback','baps4cbf fixk_menu', ... + 'Label','Enable Fixed-K Clustering', ... + 'Tag','fixk_menu', ... + 'Userdata', 0); +h2 = uimenu('Parent',h1, ... + 'Callback','baps4cbf partitioncompare_menu', ... + 'Label','Partition Compare', ... + 'Tag','partitioncompare_menu', ... + 'Userdata', []); +h2 = uimenu('Parent',h1, ... + 'Callback','baps4cbf loadfigure_menu', ... + 'Label','Load Figure', ... + 'Tag','loadfigure_menu'); +h2 = uimenu('Parent',h1, ... + 'Callback','baps4cbf changecolor_menu',... + 'Label', 'Change BAPS Color', ... + 'Tag','changecolor_menu'); + + +% Help menu +h1 = uimenu('Parent',h0, ... + 'Label','Help', ... + 'Tag','help_menu'); +h2 = uimenu('Parent',h1, ... + 'Callback','baps4cbf helpdoc', ... + 'Enable','on', ... + 'Label','Help contents (doc)', ... + 'Tag','helpman_menu'); +h2 = uimenu('Parent',h1, ... + 'Callback','baps4cbf helponline', ... + 'Enable','on', ... + 'Label','BAPS on the web', ... + 'Tag','online_menu'); +h2 = uimenu('Parent',h1, ... + 'Callback','baps4cbf about', ... + 'Enable','on', ... + 'Label','About', ... + 'Tag','about_menu'); + + +% uicontrol +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'BackgroundColor',[0.82 0.82 0.77], ... + 'ListboxTop',0, ... + 'Position',[38.25 154.268575851393 381 148.721320743034], ... + 'Style','frame', ... + 'Tag','mix_frame'); +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'BackgroundColor',[0.82 0.82 0.77], ... + 'ListboxTop',0, ... + 'Position',[38.25 56 381 79.721320743034], ... + 'Style','frame', ... + 'Tag','admix_frame'); +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'BackgroundColor',[0.82 0.82 0.77], ... + 'ListboxTop',0, ... + 'Position',[156.75 279.545624148607 158 11.9465944272446], ... + 'String','Population mixture analysis', ... + 'Style','text', ... + 'Tag','mix_text'); +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'BackgroundColor',[0.82 0.82 0.77], ... + 'ListboxTop',0, ... + 'Position',[156.75 111.545624148607 138 11.9465944272446], ... + 'String','Population admixture analysis', ... + 'Style','text', ... + 'Tag','admix_text'); +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'Callback','baps4cbf mix1_button', ... + 'ListboxTop',0, ... + 'Position',[51.75 245.412497213622 163.5 22.7554179566563], ... + 'String','Clustering of individuals', ... + 'Tag','mix1_button'); +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'Callback','baps4cbf mix2_button', ... + 'ListboxTop',0, ... + 'Position',[51.75 212.75 163.5 23.25], ... + 'String','Clustering of groups of individuals', ... + 'Tag','mix2_button'); +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'Callback','baps4cbf trained_button', ... + 'ListboxTop',0, ... + 'Position',[51.75 178.284055727554 163.5 22.7554179566563], ... + 'String','Trained clustering', ... + 'Tag','trained_button'); +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'Callback','baps4cbf admix1_button', ... + 'ListboxTop',0, ... + 'Position',[51.7 75.412497213622 163.5 22.7554179566563], ... + 'String','Admixture based on mixture clustering', ... + 'Tag','admix1_button'); +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'Callback','baps4cbf admix2_button', ... + 'ListboxTop',0, ... + 'Position',[245.75 75.412497213622 163.5 22.7554179566563], ... + 'String','Admixture based on pre-defined populations', ... + 'Tag','admix2_button'); +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'BackgroundColor',[.7 .9 .7], ... + 'FontSize',10, ... + 'HorizontalAlignment','left', ... + 'ListboxTop',0, ... + 'Position',[36 28.5 100.75 14.25], ... + 'String','Data File:', ... + 'Style','text', ... + 'Tag','datafile_text'); +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'BackgroundColor',[.7 .9 .7], ... + 'FontSize',10, ... + 'HorizontalAlignment','left', ... + 'ListboxTop',0, ... + 'Position',[36 9.75 51 14.25], ... + 'String','Output File:', ... + 'Style','text', ... + 'Tag','outputfile_text'); +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'BackgroundColor',[.7 .9 .7], ... + 'FontSize',10, ... + 'HorizontalAlignment','left', ... + 'ListboxTop',0, ... + 'Position',[140.75 28.5 314.5 14.25], ... + 'Style','text', ... + 'Tag','filename1_text'); +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'BackgroundColor',[.7 .9 .7], ... + 'FontSize',10, ... + 'HorizontalAlignment','left', ... + 'ListboxTop',0, ... + 'Position',[93.75 9.75 314.5 14.25], ... + 'Style','text', ... + 'Tag','filename2_text'); +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'Callback','baps4cbf spatial_button', ... + 'ListboxTop',0, ... + 'Position',[245.75 245.412497213622 163.5 22.7554179566563], ... + 'String','Spatial clustering of individuals', ... + 'Tag','spatial_button'); +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'Callback','baps4cbf spatial2_button', ... + 'ListboxTop',0, ... + 'Position',[245.75 212.75 163.5 23.25], ... + 'String','Spatial clustering of groups', ... + 'Tag','spatial2_button'); +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'Callback','baps4cbf linkage_button', ... + 'ListboxTop',0, ... + 'Position',[245.75 178.284055727554 163.5 23.25], ... + 'String','Clustering with linked loci', ... + 'Tag','linkage_button'); + +% Starting text in the log window: +disp('**********************************************************'); +disp('********************** BAPS 6.0 *************************'); +disp('**********************************************************'); +disp(' '); +disp('Closing this window will end the execution of the program.'); +disp('However, it is recommended that you use ''Exit'' on the ''File'''); +disp('menu of the main window or use the close-button in the upper'); +disp('right-hand corner of the main window to exit the program.'); +disp(' '); + +if strcmp(computer,'PCWIN') || strcmp(computer,'PCWIN64') + if nargout > 0, fig = h0; end + drawnow + % seticon(h0, ['baps5.ico']); +end +% uimenufcn(gcf,'WindowCommandWindow') + diff --git a/matlab/general/distance.m b/matlab/general/distance.m new file mode 100644 index 0000000..7d42e27 --- /dev/null +++ b/matlab/general/distance.m @@ -0,0 +1,46 @@ +function d = distance(a,b) +% DISTANCE - computes Euclidean distance matrix +% +% E = distance(A,B) +% +% A - (DxM) matrix +% B - (DxN) matrix +% +% Returns: +% E - (MxN) Euclidean distances between vectors in A and B +% +% +% Description : +% This fully vectorized (VERY FAST!) m-file computes the +% Euclidean distance between two vectors by: +% +% ||A-B|| = sqrt ( ||A||^2 + ||B||^2 - 2*A.B ) +% +% Example : +% A = rand(400,100); B = rand(400,200); +% d = distance(A,B); + +% Author : Roland Bunschoten +% University of Amsterdam +% Intelligent Autonomous Systems (IAS) group +% Kruislaan 403 1098 SJ Amsterdam +% tel.(+31)20-5257524 +% bunschot@wins.uva.nl +% Last Rev : Oct 29 16:35:48 MET DST 1999 +% Tested : PC Matlab v5.2 and Solaris Matlab v5.3 +% Thanx : Nikos Vlassis + +% Copyright notice: You are free to modify, extend and distribute +% this code granted that the author of the original code is +% mentioned as the original author of the code. + +if (nargin ~= 2) + error('Not enough input arguments'); +end + +if (size(a,1) ~= size(b,1)) + error('A and B should be of same dimensionality'); +end + +aa=sum(a.*a,1); bb=sum(b.*b,1); ab=a'*b; +d = sqrt(abs(repmat(aa',[1 size(bb,2)]) + repmat(bb,[size(aa,2) 1]) - 2*ab)); diff --git a/matlab/general/fixKWarning.m b/matlab/general/fixKWarning.m new file mode 100644 index 0000000..d83fed9 --- /dev/null +++ b/matlab/general/fixKWarning.m @@ -0,0 +1,14 @@ +%------------------------------------------------------------------------- +function fixedK = fixKWarning() +% The function is to ask further confirmation of running fixed K clustering +button = questdlg('You have selected a clustering analysis with the Fixed-K Mode. Are you sure to continue?',... + 'Fixed-K Mode'); +switch button + case 'Yes' + fixedK = 1; + case 'No' + fixedK = 0; + goToFixedK + case 'Cancel' + fixedK = 0.5; +end \ No newline at end of file diff --git a/matlab/general/goToFixedK.m b/matlab/general/goToFixedK.m new file mode 100644 index 0000000..8510d05 --- /dev/null +++ b/matlab/general/goToFixedK.m @@ -0,0 +1,28 @@ +%------------------------------------------------------------------------- + +function goToFixedK +% The function to do fix K algorithm + +h0 = findobj('Tag','fixk_menu'); +h1 = findobj('Tag','mix_text'); +old = get(h0, 'UserData'); +if old == 0 + set(h0,'UserData',1, 'label', 'Disable Fixed-K Clustering'); + set(h1,'String', 'Population mixture analysis (Fixed-K Mode)'); + % disable all the non-relevant buttons + set(findobj('Tag','partitioncompare_menu'),'Enable','off'); +% set(findobj('Tag','file_menu'),'Enable','off'); +% set(findobj('Tag','admix_text'),'Enable','off'); +% set(findobj('Tag','admix1_button'),'Enable','off'); +% set(findobj('Tag','admix2_button'),'Enable','off'); + disp('Fixed-K Mode is enabled.'); +else + set(h0,'UserData',0, 'label', 'Enable Fixed-K Clustering'); + set(h1,'String', 'Population mixture analysis'); + set(findobj('Tag','partitioncompare_menu'),'Enable','on'); +% set(findobj('Tag','file_menu'),'Enable','on'); +% set(findobj('Tag','admix_text'),'Enable','on'); +% set(findobj('Tag','admix1_button'),'Enable','on'); +% set(findobj('Tag','admix2_button'),'Enable','on'); + disp('Fixed-K Mode is disabled.'); +end diff --git a/matlab/general/goToPartitionCompare.m b/matlab/general/goToPartitionCompare.m new file mode 100644 index 0000000..6523a88 --- /dev/null +++ b/matlab/general/goToPartitionCompare.m @@ -0,0 +1,138 @@ +function goToPartitionCompare +% GOTOPARTITIONCOMPARE goes to the partition comaparing mode + + +% Load the partition result +[filename, pathname] = uigetfile('*.*', 'Load the priori specified partitions'); +if (sum(filename)==0) || (sum(pathname)==0) + return; +end +disp('---------------------------------------------------'); +disp('In loading the partition result...'); +try + c = load([pathname filename]); +catch + fprintf(1,'***ERROR: incorrect partition result.\n'); + return +end +if sum(c(1,:))~=1 + fprintf(1,'***ERROR: invalid prior density.\n'); + return +else + prior = c(1,:); + c = c([2:end],:); +end +[ninds, npartitions] = size(c); +fprintf(1,'# of sampling units: %d\n', ninds); +fprintf(1,'# of partitions in comparision: %d\n', npartitions); + + + +h1 = findobj('Tag','partitioncompare_menu'); + +% Choose data type and model type +items(1).name = 'Model:'; +items(1).default = 1; +items(1).indent = 1; +items(1).values = {'Independent';'Spatial';'Linkage'}; +items(1).linked = [2 3 4]; + +items(2).name = 'Data type'; +items(2).indent = 1; +items(2).values = {1}; +items(3).name = 'individual level'; +items(3).default = 1; +items(3).exclusive = 4; +items(3).indent = 2; +items(4).name = 'group level'; +items(4).default = 0; +items(4).exclusive = 3; +items(4).indent = 2; + +title = 'Specify data and model types'; +out = CSEFlagDialog1(items, title); +if isempty(out) + disp(['cancelled.']); + return +end + +userdata.partitions = c; +userdata.logmls = []; +set(h1,'UserData',userdata); + +try + if out(1).answer == 1 + if out(3).answer == 1 % independent model with individual level + disp(['Model: Independent clustering - Individual level']); + greedyMix(-1); + else % independent model with group level + disp(['Model: Independent clustering - Group level']); + greedyPopMix(); + end + + elseif out(1).answer == 2 + if out(3).answer == 1 % spatial model with individual level + disp(['Model: Spatial clustering - Individual level']); + spatialMixture; + else + disp(['Model: Spatial clustering - Group level']); + spatialPopMixture; + end + + elseif out(1).answer == 3 + disp(['Model: Linkage clustering']); + linkageMixture_speed; + + end + + userdata = get(h1,'UserData'); + logmls = userdata.logmls; + if isempty(logmls) + disp(['*** ERROR: program stopped.']); + set(findobj('Tag','filename1_text'),'String',[]); + else + diary('baps5_partitioncompare.out'); + disp('---------------------------------------------------'); + disp(['Partition Prior LogLikelihood Posterior']); + posterior = zeros(1, npartitions); + sum_posterior = exp(logmls)*prior'; + if sum_posterior == 0 % meaning that one partition dominates + dominate_partition = find(logmls==max(logmls)); + posterior(dominate_partition) = 1; + else + for i = 1:npartitions + posterior(i) = exp(userdata.logmls(i))*prior(i)/sum_posterior; + end + end + + for i = 1:npartitions + disp([' ' ownNum2Str(i) ' ' ownNum2Str(prior(i)) ' ' ... + ownNum2Str(logmls(i)) ' ' ownNum2Str(posterior(i))]); + end + diary off + + save_preproc = questdlg('Do you wish to save the partition compare result?',... + 'Save result?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + waitALittle; + [filename, pathname] = uiputfile('*.txt','Save result as'); + if (sum(filename)==0) || (sum(pathname)==0) + % Cancel was pressed + return; + else + % copy 'baps4_output.baps' into the text file with the same name. + if exist('baps5_partitioncompare.out','file') + copyfile('baps5_partitioncompare.out',[pathname filename]) + delete('baps5_partitioncompare.out') + disp('result saved.'); + else + disp('*** ERROR: result cannot be saved.'); + end + end; + end + end +catch + disp('*** ERROR: incorrect format. Check model and data types'); +end +set(h1,'UserData',[]); diff --git a/matlab/general/iconn.m b/matlab/general/iconn.m new file mode 100644 index 0000000..d458c64 --- /dev/null +++ b/matlab/general/iconn.m @@ -0,0 +1,18 @@ +function iconn +% Change the icon of all open figures to match the figure number +% +% iconn +% +% See also: seticon +% +% In order to have this feature automatically on for all new figures +% execute the following command: +% set(0,'defaultfigurecreatefcn','iconn') +% You may want to insert it into your startup.m file + +% Copyright 2000-2002, Research and Development + +h = get(0,'children'); +for i=1:min(length(h), 9) + seticon(i, which(sprintf('icon%d.ico', i))) +end diff --git a/matlab/general/imageCbf.m b/matlab/general/imageCbf.m new file mode 100644 index 0000000..d574601 --- /dev/null +++ b/matlab/general/imageCbf.m @@ -0,0 +1,67 @@ +function imageCbf(action) +%Tahan funktioon ohjataan image_figuren callback:it. + +switch action + +case 'save_image' + saveImage; + +case 'export_bmp' + export('bmp'); + +case 'export_jpg' + export('jpg'); + +end + +function saveImage +%Saves information needed to reconstruct the image later. + +[filename,pathname] = uiputfile('*.mat','Save Figure'); +if (filename == 0) & (pathname == 0) + %Cancel was clicked. + return; +end; +image_file_name = [pathname filename]; +tiedot = get(gcbf,'UserData'); +% tiedot on tietue, joka sisältää info:n ja popnames:in + +% save(image_file_name,'tiedot'); +save(image_file_name,'tiedot','-v7.3'); % added by Lu Cheng, 08.06.2012 + + +%------------------------------------------------------------------- + + +function export(format) +%Saves a figure in a format which has been given +%as a parameter. Exported images cannot be opened using BAPS. + +[filename,pathname] = uiputfile(['*.' format], ['Export to ' format]); +if (filename == 0) & (pathname == 0) + %Cancel was pressed: + return; +end; +filename = checkTheFormat(format,filename); +resultfilename = [pathname filename]; +print(resultfilename); + +%--------------------------------------------------------- + +function newfilename = checkTheFormat(format,oldfilename) +%Checks if the 'oldfilename' has ending *.'format'. If not, ending +%will be added to newfilename. + +if length(oldfilename) < 4 + newfilename = [oldfilename '.' format]; +elseif isequal(oldfilename(end-3: end), ['.' format]) + newfilename = oldfilename; +elseif any(oldfilename == '.') + n = 1; + while ~isequal(oldfilename(n),'.') + n = n+1; + end; + newfilename = [oldfilename(1:n) format]; +else + newfilename = [oldfilename '.' format]; +end; \ No newline at end of file diff --git a/matlab/general/image_figure.m b/matlab/general/image_figure.m new file mode 100644 index 0000000..9e42c34 --- /dev/null +++ b/matlab/general/image_figure.m @@ -0,0 +1,38 @@ +function fig = image_figure() +% This is the machine-generated representation of a Handle Graphics object +% and its children. Note that handle values may change when these objects +% are re-created. This may cause problems with any callbacks written to +% depend on the value of the handle at the time the object was saved. +% +% To reopen this object, just type the name of the M-file at the MATLAB +% prompt. The M-file and its associated MAT-file must be on your path. + +load image_figure + +h0 = figure('Color',[0.8 0.8 0.8], ... + 'Colormap',mat0, ... + 'MenuBar','none', ... + 'NumberTitle','off', ... + 'PointerShapeCData',mat1, ... + 'Position',[73 27 896 672], ... + 'Resize','on', ... + 'Tag','image_figure'); +h1 = uimenu('Parent',h0, ... + 'Label','File', ... + 'Tag','image_fig_file_menu'); +h2 = uimenu('Parent',h1, ... + 'Callback','imageCbf save_image', ... + 'Label','Save Figure', ... + 'Tag','save_image_menu'); +h2 = uimenu('Parent',h1, ... + 'Label','Export', ... + 'Tag','export_image_menu'); +h3 = uimenu('Parent',h2, ... + 'Callback','imageCbf export_jpg', ... + 'Label','*.jpg', ... + 'Tag','jpg_menu'); +h3 = uimenu('Parent',h2, ... + 'Callback','imageCbf export_bmp', ... + 'Label','*.bmp', ... + 'Tag','bmp_menu'); +if nargout > 0, fig = h0; end diff --git a/matlab/general/kstest2.m b/matlab/general/kstest2.m new file mode 100644 index 0000000..a20ea83 --- /dev/null +++ b/matlab/general/kstest2.m @@ -0,0 +1,178 @@ +function [H, pValue, KSstatistic] = kstest2(x1, x2, alpha, tail) +%KSTEST2 Two-sample Kolmogorov-Smirnov goodness-of-fit hypothesis test. +% H = KSTEST2(X1,X2,ALPHA,TAIL) performs a Kolmogorov-Smirnov (K-S) test +% to determine if independent random samples, X1 and X2, are drawn from +% the same underlying continuous population. ALPHA and TAIL are optional +% scalar inputs: ALPHA is the desired significance level (default = 0.05); +% TAIL indicates the type of test (default = 0). H indicates the result of +% the hypothesis test: +% H = 0 => Do not reject the null hypothesis at significance level ALPHA. +% H = 1 => Reject the null hypothesis at significance level ALPHA. +% +% Let S1(x) and S2(x) be the empirical distribution functions from the +% sample vectors X1 and X2, respectively, and F1(x) and F2(x) be the +% corresponding true (but unknown) population CDFs. The two-sample K-S +% test tests the null hypothesis that F1(x) = F2(x) for all x, against the +% alternative specified by TAIL: +% 'unequal' -- "F1(x) not equal to F2(x)" (two-sided test) +% 'larger' -- "F1(x) > F2(x)" (one-sided test) +% 'smaller' -- "F1(x) < F2(x)" (one-sided test) +% +% For TAIL = 'unequal', 'larger', and 'smaller', the test statistics are +% max|S1(x) - S2(x)|, max[S1(x) - S2(x)], and max[S2(x) - S1(x)], +% respectively. +% +% The decision to reject the null hypothesis occurs when the significance +% level, ALPHA, equals or exceeds the P-value. +% +% X1 and X2 are vectors of lengths N1 and N2, respectively, and represent +% random samples from some underlying distribution(s). Missing +% observations, indicated by NaNs (Not-a-Number), are ignored. +% +% [H,P] = KSTEST2(...) also returns the asymptotic P-value P. +% +% [H,P,KSSTAT] = KSTEST2(...) also returns the K-S test statistic KSSTAT +% defined above for the test type indicated by TAIL. +% +% The asymptotic P-value becomes very accurate for large sample sizes, and +% is believed to be reasonably accurate for sample sizes N1 and N2 such +% that (N1*N2)/(N1 + N2) >= 4. +% +% See also KSTEST, LILLIETEST, CDFPLOT. +% + +% Copyright 1993-2007 The MathWorks, Inc. +% $Revision: 1.5.2.5 $ $ Date: 1998/01/30 13:45:34 $ + +% References: +% Massey, F.J., (1951) "The Kolmogorov-Smirnov Test for Goodness of Fit", +% Journal of the American Statistical Association, 46(253):68-78. +% Miller, L.H., (1956) "Table of Percentage Points of Kolmogorov Statistics", +% Journal of the American Statistical Association, 51(273):111-121. +% Stephens, M.A., (1970) "Use of the Kolmogorov-Smirnov, Cramer-Von Mises and +% Related Statistics Without Extensive Tables", Journal of the Royal +% Statistical Society. Series B, 32(1):115-122. +% Conover, W.J., (1980) Practical Nonparametric Statistics, Wiley. +% Press, W.H., et. al., (1992) Numerical Recipes in C, Cambridge Univ. Press. + +if nargin < 2 + error('stats:kstest2:TooFewInputs','At least 2 inputs are required.'); +end + +% +% Ensure each sample is a VECTOR. +% + +if ~isvector(x1) || ~isvector(x2) + error('stats:kstest2:VectorRequired','The samples X1 and X2 must be vectors.'); +end + +% +% Remove missing observations indicated by NaN's, and +% ensure that valid observations remain. +% + +x1 = x1(~isnan(x1)); +x2 = x2(~isnan(x2)); +x1 = x1(:); +x2 = x2(:); + +if isempty(x1) + error('stats:kstest2:NotEnoughData', 'Sample vector X1 contains no data.'); +end + +if isempty(x2) + error('stats:kstest2:NotEnoughData', 'Sample vector X2 contains no data.'); +end + +% +% Ensure the significance level, ALPHA, is a scalar +% between 0 and 1 and set default if necessary. +% + +if (nargin >= 3) && ~isempty(alpha) + if ~isscalar(alpha) || (alpha <= 0 || alpha >= 1) + error('stats:kstest2:BadAlpha',... + 'Significance level ALPHA must be a scalar between 0 and 1.'); + end +else + alpha = 0.05; +end + +% +% Ensure the type-of-test indicator, TAIL, is a scalar integer from +% the allowable set, and set default if necessary. +% + +if (nargin >= 4) && ~isempty(tail) + if ischar(tail) + tail = strmatch(lower(tail), {'smaller','unequal','larger'}) - 2; + if isempty(tail) + error('stats:kstest2:BadTail',... + 'Type-of-test indicator TAIL must be ''unequal'', ''smaller'', or ''larger''.'); + end + elseif ~isscalar(tail) || ~((tail==-1) || (tail==0) || (tail==1)) + error('stats:kstest2:BadTail',... + 'Type-of-test indicator TAIL must be ''unequal'', ''smaller'', or ''larger''.'); + end +else + tail = 0; +end + +% +% Calculate F1(x) and F2(x), the empirical (i.e., sample) CDFs. +% + +binEdges = [-inf ; sort([x1;x2]) ; inf]; + +binCounts1 = histc (x1 , binEdges, 1); +binCounts2 = histc (x2 , binEdges, 1); + +sumCounts1 = cumsum(binCounts1)./sum(binCounts1); +sumCounts2 = cumsum(binCounts2)./sum(binCounts2); + +sampleCDF1 = sumCounts1(1:end-1); +sampleCDF2 = sumCounts2(1:end-1); + +% +% Compute the test statistic of interest. +% + +switch tail + case 0 % 2-sided test: T = max|F1(x) - F2(x)|. + deltaCDF = abs(sampleCDF1 - sampleCDF2); + + case -1 % 1-sided test: T = max[F2(x) - F1(x)]. + deltaCDF = sampleCDF2 - sampleCDF1; + + case 1 % 1-sided test: T = max[F1(x) - F2(x)]. + deltaCDF = sampleCDF1 - sampleCDF2; +end + +KSstatistic = max(deltaCDF); + +% +% Compute the asymptotic P-value approximation and accept or +% reject the null hypothesis on the basis of the P-value. +% + +n1 = length(x1); +n2 = length(x2); +n = n1 * n2 /(n1 + n2); +lambda = max((sqrt(n) + 0.12 + 0.11/sqrt(n)) * KSstatistic , 0); + +if tail ~= 0 % 1-sided test. + + pValue = exp(-2 * lambda * lambda); + +else % 2-sided test (default). +% +% Use the asymptotic Q-function to approximate the 2-sided P-value. +% + j = (1:101)'; + pValue = 2 * sum((-1).^(j-1).*exp(-2*lambda*lambda*j.^2)); + pValue = min(max(pValue, 0), 1); + +end + +H = (alpha >= pValue); diff --git a/matlab/general/logml2String.m b/matlab/general/logml2String.m new file mode 100644 index 0000000..edb9710 --- /dev/null +++ b/matlab/general/logml2String.m @@ -0,0 +1,67 @@ +function mjono = logml2String(logml) +% Palauttaa logml:n string-esityksen. + +mjono = ' '; + +if isequal(logml,-Inf) + mjono(7) = '-'; + return +end + +if abs(logml)<10000 + %Ei tarvita e-muotoa + mjono(7) = palautaYks(abs(logml),-1); + mjono(6) = '.'; + mjono(5) = palautaYks(abs(logml),0); + mjono(4) = palautaYks(abs(logml),1); + mjono(3) = palautaYks(abs(logml),2); + mjono(2) = palautaYks(abs(logml),3); + pointer = 2; + while mjono(pointer)=='0' & pointer<7 + mjono(pointer) = ' '; + pointer=pointer+1; + end + if logml<0 + mjono(pointer-1) = '-'; + end +else + suurinYks = 4; + while abs(logml)/(10^(suurinYks+1)) >= 1 + suurinYks = suurinYks+1; + end + if suurinYks<10 + mjono(7) = num2str(suurinYks); + mjono(6) = 'e'; + mjono(5) = palautaYks(abs(logml),suurinYks-1); + mjono(4) = '.'; + mjono(3) = palautaYks(abs(logml),suurinYks); + if logml<0 + mjono(2) = '-'; + end + elseif suurinYks>=10 + mjono(6:7) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(logml),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(logml),suurinYks); + if logml<0 + mjono(1) = '-'; + end + end +end + +function digit = palautaYks(num,yks) +% palauttaa luvun num 10^yks termin kertoimen +% string:inä +% yks täytyy olla kokonaisluku, joka on +% vähintään -1:n suuruinen. Pienemmillä +% luvuilla tapahtuu jokin pyöristysvirhe. + +if yks>=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); \ No newline at end of file diff --git a/matlab/general/noIndex.m b/matlab/general/noIndex.m new file mode 100644 index 0000000..6241f4e --- /dev/null +++ b/matlab/general/noIndex.m @@ -0,0 +1,11 @@ +function puredata = noIndex(data, noalle) +% NOINDEX Check that the data contains no index column. +% Input: two variables from a mixture/admixture result structure. +% Output: +% puredata: a data contains no index column. + +if size(data,2) == length(noalle) + 1 + puredata = data(:,[1:end-1]); % remove the index column +else + puredata = data; +end diff --git a/matlab/general/ownNum2Str.m b/matlab/general/ownNum2Str.m new file mode 100644 index 0000000..391b072 --- /dev/null +++ b/matlab/general/ownNum2Str.m @@ -0,0 +1,35 @@ +function str = ownNum2Str(number) + +absolute = abs(number); + +if absolute < 1000 + str = num2str(number); +elseif absolute < 10000000 + first_three = rem(number,1000); + next_four = (number - first_three) /1000; + first_three = abs(first_three); + if first_three<10 + first_three = ['00' num2str(first_three)]; + elseif first_three<100 + first_three = ['0' num2str(first_three)]; + else + first_three = num2str(first_three); + end; + str = [num2str(next_four) first_three]; +elseif absolute < 100000000 + first_four = rem(number,10000); + next_four = (number - first_four) /10000; + first_four = abs(first_four); + if first_four<10 + first_four = ['000' num2str(first_four)]; + elseif first_four<100 + first_four = ['00' num2str(first_four)]; + elseif first_four<1000 + first_four = ['0' num2str(first_four)]; + else + first_four = num2str(first_four); + end; + str = [num2str(next_four) first_four]; +else + str = num2str(number); +end; \ No newline at end of file diff --git a/matlab/general/setWindowOnTop.m b/matlab/general/setWindowOnTop.m new file mode 100644 index 0000000..aafafeb --- /dev/null +++ b/matlab/general/setWindowOnTop.m @@ -0,0 +1,124 @@ +function setWindowOnTop(h,state) +% SETWINDOWONTOP sets a figures Always On Top state on or off +% +% Copyright (C) 2006 Matt Whitaker +% +% This program is free software; you can redistribute it and/or modify it +% under +% the terms of the GNU General Public License as published by the Free +% Software Foundation; either version 2 of the License, or (at your +% option) any later version. +% +% This program is distributed in the hope that it will be useful, but +% WITHOUT ANY WARRANTY; without even the implied warranty of +% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +% General Public License for more details. +% +% SETWINDOWONTOP(H,STATE): H is figure handle or a vector of figure handles +% STATE is a string or cell array of strings- +% 'true' - set figure to be always on top +% 'false' - set figure to normal +% if STATE is a string the state is applied to +% all H. If state is a cell array the length STATE +% must equal that of H and each state is applied +% individually. +% Examples: +% h= figure; +% s = 'true'; +% setWindowOnTop(h,s) %sets h to be on top +% +% h(1) = figure; +% h(2) = figure; +% s = 'true'; +% setWindowOnTop(h,s) %sets both figures to be on top +% +% h(1) = figure; +% h(2) = figure; +% s = {'true','false'}; +% setWindowOnTop(h,s) %sets h(1) on top, h(2) normal +% Notes: +% 1. Figures must have 'Visible' set to 'on' and not be docked for +% setWindowOnTop to work. +% 2. Routine does not work for releases prior to R14SP2 +% 3. The Java calls are undocumented by Mathworks +% +% Revisions: 09/28/06- Corrected call to warning and uopdated for R2006b + +drawnow; %need to make sure that the figures have been rendered or Java error can occur + +% setWindowOnTop disabled to ensure that compiled version runs properly +% Modified: 11/06/07 Jukka Siren +return + +%check input argument number +error(nargchk(2, 2, nargin, 'struct')); + +%is JVM available +if ~usejava('jvm') + error('setWindowOnTop requires Java to run.'); +end + +[j,s] = parseInput; +setOnTop; %set the on top state + + function [j,s] = parseInput + % is h all figure handles + if ~all(ishandle(h)) || ~isequal(length(h),length(findobj(h,'flat','Type','figure'))) + error('All input handles must be valid figure handles'); + end %if + + %handle state argument + if ischar(state) + %make it a cell + s = cellstr(repmat(state,[length(h),1])); + + elseif iscellstr(state) + if length(state) ~= length(h) + error('Cell array of strings: state must be same length as figure handle input'); + end %if + s = state; + else + error('state must be a character array or a cell array of strings'); + end %if + + %check that the states are all valid + if ~all(ismember(s,{'true','false'})) + error('Invalid states entered') + end %if + + if length(h) == 1 + j{1} = get(h,'javaframe'); + else + j = get(h,'javaframe'); + end %if + + end %parseInput + + function setOnTop + %get version so we know which method to call + v = ver('matlab'); + %anticipating here that Mathworks will continue to change these + %undocumented calls + switch v(1).Release + case {'(R14SP2)','(R14SP3)'} + on_top = 1; + case {'(R2006a)','(R2006b)'} + on_top = 2; + otherwise %warn but try method 2 + % warning('setWindowOnTop:UntestedVersion',['setWindowOnTop has not been tested with release: ',v.Release]); + on_top = 2; + end %switch + for i = 1:length(j) + switch on_top + case 1 %R14SP2-3 + w = j{i}.fClientProxy.getFrameProxy.getClientFrame; + case 2 %R2006a+ + w= j{i}.fFigureClient.getWindow; + otherwise %should not happen + error('Invalid on top method'); + end %switch + awtinvoke(w,'setAlwaysOnTop',s{i}); + end %for j + end %setOnTop + +end %setWindowOnTop \ No newline at end of file diff --git a/matlab/general/seticon.m b/matlab/general/seticon.m new file mode 100644 index 0000000..1f02fe6 --- /dev/null +++ b/matlab/general/seticon.m @@ -0,0 +1,59 @@ +function seticon(window, useicon) +% Set icon for window. +% +% seticon(window, useicon) +% +% Parameters: +% window: Figure number or name of window +% useicon: Icon number to use or file name of icon +% =1 : Application +% =2 : Hand (x) +% =3 : Question (?) +% =4 : Exclamation /!\ +% =5 : Asterisk (i) +% =6 : Winlogo +% +% Examples: +% seticon(2,3) +% seticon(1, 'iconfile.ico') +% seticon('Microsoft Internet', 6) + +% Copyright 2000-2002, Research and Development + +if nargin~=2 + error('Two arguments required') +end + +if ~isstr(window) + window = wgetname(window); +end + +if ~any(window) + warning('Window specification insufficient') + return +end + +if isstr(useicon) + switch lower(useicon) + case 'application' + icon(1, window); + case 'hand' + icon(2, window); + case 'question' + icon(3, window); + case 'exclamation' + icon(4, window); + case 'asterisk' + icon(5, window); + case 'winlogo' + icon(6, window); + otherwise + icon(101, window, useicon); + end +else + if useicon>=1 & useicon<=6 + icon(useicon, window); + else + error('Icon number out of range') + end +end diff --git a/matlab/general/waitALittle.m b/matlab/general/waitALittle.m new file mode 100644 index 0000000..2d2571b --- /dev/null +++ b/matlab/general/waitALittle.m @@ -0,0 +1,3 @@ +function waitALittle +A = rand(500); +gammaln(A); \ No newline at end of file diff --git a/matlab/general/wgetname.m b/matlab/general/wgetname.m new file mode 100644 index 0000000..47dac3d --- /dev/null +++ b/matlab/general/wgetname.m @@ -0,0 +1,21 @@ +function s=wgetname(h) +% Returns the window name of a window +% +% s=wgetname(h) +% +% Parameters: +% h : window number +% +% See also: seticon + +% Copyright 2000-2002, Research and Development + +if strcmp(get(h,'numbertitle'), 'on') + if get(h,'name') + s = sprintf('Figure %d: %s', h, get(h,'name')); + else + s = sprintf('Figure %d', h); + end +else + s = get(h,'name'); +end diff --git a/matlab/general/winontop.mexw32 b/matlab/general/winontop.mexw32 new file mode 100644 index 0000000000000000000000000000000000000000..e6afebdbfa6894d8eb3c12d46c5114c95e3c5088 GIT binary patch literal 6144 zcmeHLe{3699e;Lj!D(-;mxj@y5U!QWBL1*qpN*zAk({R2c2c@UvD37LEOG2}Y|lPh z=es0mb%|t(>fKDr25L18q^@X_(3sFVi8756EumG$CIyKf4W%$(z)6YF1ZEA$%JF%3 zc6L=XgajHeCen(MW*w9Hgzy(vr4%sEugeS1U0nnwwl^~z!<7pv#qs6p$U zdK@86uV{-88_oFc-Jln`6WNshS1NFYHqe}JHl+1=nA<%JZ8>F!cF^O}A|7|yd4=}I zT=>`!Aa{RyE!l+DiyJ{2f4sbxPFUfz<1Q`Fmeve~+pl;$%Ev2#35~rBqb}_TTRL@_ zA?nYK$Ys~^kK;4+$dl4MJMze9=LXBahw;HQNt5ISm^M_G9zh=rW3jw4JXeE4X~ji# zv|&~gT(?7JjB2(ltZIV0EYvE(YPq+j6anoiZVE1J)0i2pm&FDbw5S}IS$@GcyL=zDlQzQoU)Jb8 zHoZ;1A3?%GBTg%iQjvA2cj#bNhaglmMDxSY5 z=7^cde08a~me7Q?={MK}Zpu=&c5}G8lwH$=>Ox&-r4_gFx=@89qQ|hgJXaomXL&gFe!t+;9gx~UUA z4u#UZt0b&4+F-5HwsSp}XSKoA>&!=~79D0I(K|aw>tRa6jhoyV9=awN4z1`e8RYb~ zlCh=OR4TfN(vrM@sFOd$Mfai;;YlmDHhjBHgt;|*<_5Ne3biCpth@cLWv5hhJJojT ze3H@6n;0A~-Vzp{(%azpsk6hzj;lqZWH^+MAq#{aShrEIY`}e*=+8nZ3sv}0LY3m% zLxx(_p9VHvG#u)#w1c*~q3VYEcG{q8k7I55-aGI$ip#)9XgP_o*V9AX=gt8D-(lke zHvRz{f6&GsvGIL2{32lh>-q@1MW@*5-GMBc9gFNEG(dFB8ic+!$rZP<&z@RTKQ*s#}t;R#FLvSGgj zh9@ei+OR4xJVF0s;{t4b(`N4_@YiheBKTi{R&5xA7+OGkK#k!C-vxRIbR0AW%77+7 zr$J{yPl6ibzv1^gBs5@h*kVf?{(j}(vKJMsX< zpn^PzFqKmJw?ao4d@l(!%BFFNdM=q(c?36d2$Um*$shy zzjuPVK%3*kT>;$hQDo@re4ud4v@%#O+`=|Lc`~ttk z-{x!l{y-@3T;Qd^)xhh4p9Ov!_*3BTfu>+fus!(c;Aesdf)52x1`|O!cslqv{ieb> zGXI96=P6_ztLOaR^1tH0>R 1 + disp(tr) +elseif n==0 + disp(' Empty array of phylogenetic tree objects') +else + n = length(tr.dist); + switch n + case 0 + disp(' Empty phylogenetic tree object') + case 1 + disp(' Phylogenetic tree object with 1 leaf (0 branches)') + case 3 + disp(' Phylogenetic tree object with 2 leaves (1 branch)') + otherwise + disp([' Phylogenetic tree object with ' num2str((n+1)/2) ... + ' leaves (' num2str((n-1)/2) ' branches)']) + end +end + diff --git a/matlab/graph/@phyTree/GET.m b/matlab/graph/@phyTree/GET.m new file mode 100644 index 0000000..89bd433 --- /dev/null +++ b/matlab/graph/@phyTree/GET.m @@ -0,0 +1,89 @@ +function varargout = get(tr,varargin) +%GET Get information about a phylogenetic tree object. +% [VALUE1,VALUE2, ...] = GET(TREE,'NAME1','NAME2', ...) returns the +% contents of the specified fields for the PHYTREE object TREE. +% +% The valid choices for 'NAME' are: +% 'POINTERS' : branch to leaf/branch connectivity list +% 'DISTANCES' : edge length for every leaf/branch +% 'NUMLEAVES' : number of leaves +% 'NUMBRANCHES' : number of branches +% 'NUMNODES' : number of nodes (numleaves + numbranches) +% 'LEAFNAMES' : names of the leaves +% 'BRANCHNAMES' : names of the branches +% 'NODENAMES' : names of all the nodes +% +% GET(TREE) displays all property names and their current values for +% the PHYTREE object TREE. +% +% V = GET(TREE) returns a structure where each field name is the name of +% a property of TREE and each field contains the value of that property. +% +% Examples: +% tr = phytreeread('pf00002.tree') +% protein_names = get(tr,'LeafNames') +% +% See also PHYTREE, PHYTREEREAD, PHYTREE/SELECT, PHYTREE/GETBYNAME. + +% Copyright 2003-2005 The MathWorks, Inc. +% $Revision: 1.1.6.7 $ $Author: batserve $ $Date: 2005/06/09 21:55:54 $ + +if numel(tr)~=1 + error('Bioinfo:phytree:get:NoMultielementArrays',... + 'Phylogenetic tree must be an 1-by-1 object.'); +end + +numBranches = size(tr.tree,1); +numLeaves = numBranches + 1; +numLabels = numBranches + numLeaves; + +% get input without arguments displays a summary +if nargin == 1 + s.NumLeaves = numLeaves; + s.NumBranches = numBranches; + s.NumNodes = numLabels; + s.Pointers = tr.tree; + s.Distances = tr.dist; + s.LeafNames = tr.names(1:numLeaves); + s.BranchNames = tr.names(numLeaves+1:numLabels); + s.NodeNames = tr.names; + if nargout == 0 + disp(s) + else + varargout{1} = s; + end + return; +end + +okargs = {'pointers','distances','numleaves','numbranches',... + 'numnodes','leafnames','branchnames','nodenames'}; +for ind = 2 : nargin + pname = varargin{ind-1}; + k = find(strncmpi(pname,okargs,numel(pname))); + if isempty(k) + error('Bioinfo:phytree:get:UnknownParameterName',... + 'Unknown parameter name: %s.',pname); + elseif length(k)>1 + error('Bioinfo:phytree:get:AmbiguousParameterName',... + 'Ambiguous parameter name: %s.',pname); + else + switch(k) + case 1 % pointers + varargout{ind-1} = tr.tree; %#ok + case 2 % distances + varargout{ind-1} = tr.dist; + case 3 % numleaves + varargout{ind-1} = numLeaves; + case 4 % numbranches + varargout{ind-1} = numBranches; + case 5 % numNodes + varargout{ind-1} = numLabels; + case 6 % leafnames + varargout{ind-1} = tr.names(1:numLeaves); + case 7 % branchnames + varargout{ind-1} = tr.names(numLeaves+1:numLabels); + case 8 % nodenames + varargout{ind-1} = tr.names; + end + end +end diff --git a/matlab/graph/@phyTree/Plot.m b/matlab/graph/@phyTree/Plot.m new file mode 100644 index 0000000..18a553d --- /dev/null +++ b/matlab/graph/@phyTree/Plot.m @@ -0,0 +1,629 @@ +function handles = Plot(tr,varargin) +%PLOT renders a phylogenetic tree. +% +% PLOT(TREE) renders a phylogenetic tree object into a MATLAB figure as a +% phylogram. The significant distances between branches and nodes are in +% horizontal direction, vertical coordinates are accommodated only for +% display purposes. Handles to graph elements are stored in the +% 'UserData' figure field, such that graphic properties can be easily +% modified. +% +% PLOT(TREE,ACTIVEBRANCHES) hides the non'active branches and all their +% descendants. ACTIVEBRANCHES is a logical array of size +% [numBranches x 1] indicating the active branches. +% +% PLOT(...,'TYPE',type) selects the method to render the phylogenetic +% tree. Options are: 'square' (default), 'angular', and 'radial'. +% +% PLOT(...,'ORIENTATION',orient) will orient the phylogenetic tree within +% the figure window. Options are: 'top', 'bottom', 'left' (default), and, +% 'right'. Orientation parameter is valid only for phylograms or +% cladograms. +% +% PLOT(...,'BRANCHLABELS',value) hides/unhides branch labels. Options are +% true or false. Branch labels are placed next to the branch node. +% Defaults to false (true) when TYPE is (is not) 'radial'. +% +% PLOT(...,'LEAFLABELS',value) hides/unhides leaf labels. Options are +% true or false. Leaf labels are placed next to the leaf nodes. Defaults +% to false (true) when TYPE is (is not) 'radial'. +% +% PLOT(...,'TERMINALLABELS',value) hides/unhides terminal labels. Options +% are true (default) or false. Terminal labels are placed over the axis +% tick labels, ignored when 'radial' type is used. +% +% H = PLOT(...) returns a structure with handles to the graph elements. +% +% Example: +% +% tr = phytreeread('pf00002.tree') +% plot(tr,'type','radial') +% +% % Graph element properties can be modified as follows: +% +% h=get(gcf,'UserData') +% set(h.branchNodeLabels,'FontSize',6,'Color',[.5 .5 .5]) +% +% See also PHYTREE, PHYTREE/VIEW, PHYTREEREAD, PHYTREETOOL, SEQLINKAGE. + +% Copyright 2003-2006 The MathWorks, Inc. +% $Revision: 1.1.6.10 $ $Author: batserve $ $Date: 2006/06/16 20:06:45 $ + +if numel(tr)~=1 + error('Bioinfo:phytree:plot:NoMultielementArrays',... + 'Phylogenetic tree must be an 1-by-1 object.'); +end + +% set defaults +dispBranchLabels = NaN; +dispLeafLabels = NaN; +dispTerminalLabels = true; +renderType = 'square'; +orientation = 'left'; +rotation = 0; + +tr = struct(tr); +tr.numBranches = size(tr.tree,1); + +if nargin>1 && islogical(varargin{1}) + activeBranches = varargin{1}; + argStart = 2; +else + activeBranches = true(tr.numBranches,1); + argStart = 1; +end + +if nargin - argStart > 0 + if rem(nargin - argStart,2) == 1 + error('Bioinfo:phytree:plot:IncorrectNumberOfArguments',... + 'Incorrect number of arguments to %s.',mfilename); + end + okargs = {'type','orientation','rotation',... + 'branchlabels','leaflabels','terminallabels'}; + for j = argStart:2:nargin-argStart + pname = varargin{j}; + pval = varargin{j+1}; + k = find(strncmpi(pname,okargs,numel(pname))); + if isempty(k) + error('Bioinfo:phytree:plot:UnknownParameterName',... + 'Unknown parameter name: %s.',pname); + elseif length(k)>1 + error('Bioinfo:phytree:plot:AmbiguousParameterName',... + 'Ambiguous parameter name: %s.',pname); + else + switch(k) + case 1 % type + oktypes={'square','angular','radial'}; + l = strmatch(lower(pval),oktypes); %#ok + if isempty(l) + error('Bioinfo:phytree:plot:UnknownTypeName',... + 'Unknown option for %s.',upper(okargs{k})); + else + if l==4 + l=1; + end + renderType = oktypes{l}; + end + case 2 % orientation + oktypes={'left','right','top','bottom'}; + l = strmatch(lower(pval),oktypes); %#ok + if isempty(l) + error('Bioinfo:phytree:plot:UnknownOrientation',... + 'Unknown option for %s.',upper(okargs{k})); + else + orientation = oktypes{l}; + end + case 3 % rotation + if isreal(pval(1)) + rotation = double(pval(1)); + else + error('Bioinfo:phytree:plot:NotValidType',... + 'ROTATION must be numeric and real'); + end + case 4 % branch labels + dispBranchLabels = opttf(pval); + case 5 % leaf labels + dispLeafLabels = opttf(pval); + case 6 % terminal labels + dispTerminalLabels = opttf(pval); + end + end + end +end + +% set dependent defaults +if isnan(dispBranchLabels) + if isequal(renderType,'radial') + dispBranchLabels = true; + else + dispBranchLabels = false; + end +end +if isnan(dispLeafLabels) + if isequal(renderType,'radial') + dispLeafLabels = true; + else + dispLeafLabels = false; + end +end + +tr = doBasicCalculations(tr,activeBranches,renderType); + +nodeIndex = 1:tr.numLabels; +leafIndex = 1:tr.numLeaves; +branchIndex = tr.numLeaves+1:tr.numLabels; + + +% check empty names +for ind = nodeIndex + if isempty(tr.names{ind}) + if ind > tr.numLeaves + tr.names{ind} = ['Branch ' num2str(ind-tr.numLeaves)]; + else + tr.names{ind} = ['Leaf ' num2str(ind)]; + end + end +end + +% rendering graphic objects +fig = gcf; +% fig = figure('Renderer','ZBuffer'); +h.fig = fig; +h.axes = axes; hold on; +sepUnit = max(tr.x)*[-1/20 21/20]; + +% setting the axes +switch renderType + case {'square','angular'} + switch orientation + case 'left' + set(h.axes,'YTick',1:numel(tr.terminalNodes),'Ydir','reverse',... + 'YtickLabel','','YAxisLocation','Right') + if dispTerminalLabels + set(h.axes,'Position',[.05 .10 .7 .85]) + else + set(h.axes,'Position',[.05 .10 .9 .85]) + end + xlim(sepUnit); + ylim([0 numel(tr.terminalNodes)+1]); + case 'right' + set(h.axes,'YTick',1:numel(tr.terminalNodes),'Xdir','reverse','Ydir','reverse',... + 'YtickLabel','','YAxisLocation','Left') + if dispTerminalLabels + set(h.axes,'Position',[.25 .10 .7 .85]) + else + set(h.axes,'Position',[.05 .10 .9 .85]) + end + xlim(sepUnit); + ylim([0 numel(tr.terminalNodes)+1]); + case 'top' + set(h.axes,'XTick',1:numel(tr.terminalNodes),... + 'XtickLabel','','XAxisLocation','Top') + if dispTerminalLabels + set(h.axes,'Position',[.10 .05 .85 .7]) + else + set(h.axes,'Position',[.10 .05 .85 .9]) + end + ylim(sepUnit); + xlim([0 numel(tr.terminalNodes)+1]); + case 'bottom' + set(h.axes,'XTick',1:numel(tr.terminalNodes),'Ydir','reverse',... + 'XtickLabel','','XAxisLocation','Bottom') + if dispTerminalLabels + set(h.axes,'Position',[.10 .25 .85 .7]) + else + set(h.axes,'Position',[.10 .05 .85 .9]) + end + ylim(sepUnit); + xlim([0 numel(tr.terminalNodes)+1]); + end + case 'radial' + set(h.axes,'XTick',[],'YTick',[]) + set(h.axes,'Position',[.05 .05 .9 .9]) + dispTerminalLabels = false; + axis equal +end + +% drawing lines +switch renderType + case 'square' + X = tr.x([nodeIndex;repmat([tr.par(1:tr.numLabels-1) tr.numLabels],2,1)]); + Y = tr.y([repmat(nodeIndex,2,1);[tr.par(1:tr.numLabels-1) tr.numLabels]]); + switch orientation + case {'left','right'} + h.BranchLines = plot(X,Y,'-k'); + delete(h.BranchLines(~tr.activeNodes)) + h.BranchLines = h.BranchLines(tr.activeNodes); + case {'top','bottom'} + h.BranchLines = plot(Y,X,'-k'); + delete(h.BranchLines(~tr.activeNodes)) + h.BranchLines = h.BranchLines(tr.activeNodes); + end + case 'angular' + X = tr.x([nodeIndex;[tr.par(1:tr.numLabels-1) tr.numLabels]]); + Y = tr.y([nodeIndex;[tr.par(1:tr.numLabels-1) tr.numLabels]]); + switch orientation + case {'left','right'} + h.BranchLines = plot(X,Y,'-k'); + delete(h.BranchLines(~tr.activeNodes)) + h.BranchLines = h.BranchLines(tr.activeNodes); + case {'top','bottom'} + h.BranchLines = plot(Y,X,'-k'); + delete(h.BranchLines(~tr.activeNodes)) + h.BranchLines = h.BranchLines(tr.activeNodes); + end + case 'radial' + R = tr.x; + A = tr.y / numel(tr.terminalNodes)*2*pi+rotation*pi/180; + tr.x = R .* sin(A); + tr.y = R .* cos(A); + X = tr.x([nodeIndex;[tr.par(1:tr.numLabels-1) tr.numLabels]]); + Y = tr.y([nodeIndex;[tr.par(1:tr.numLabels-1) tr.numLabels]]); + h.BranchLines = plot(X,Y,'-k'); + delete(h.BranchLines(~tr.activeNodes)) + h.BranchLines = h.BranchLines(tr.activeNodes); +end + +% drawing nodes +switch renderType + case {'square','angular'} + switch orientation + case {'left','right'} + h.BranchDots = plot(tr.x(branchIndex(tr.activeNodes(branchIndex))),... + tr.y(branchIndex(tr.activeNodes(branchIndex))),'o',... + 'MarkerSize',5,'MarkerEdgeColor','k',... + 'MarkerFaceColor','b'); + h.LeafDots = plot(tr.x(leafIndex(tr.activeNodes(leafIndex))),... + tr.y(leafIndex(tr.activeNodes(leafIndex))),'square',... + 'MarkerSize',4,'MarkerEdgeColor','k',... + 'MarkerFaceColor','w'); + case {'top','bottom'} + h.BranchDots = plot(tr.y(branchIndex(tr.activeNodes(branchIndex))),... + tr.x(branchIndex(tr.activeNodes(branchIndex))),'o',... + 'MarkerSize',5,'MarkerEdgeColor','k',... + 'MarkerFaceColor','b'); + h.LeafDots = plot(tr.y(leafIndex(tr.activeNodes(leafIndex))),... + tr.x(leafIndex(tr.activeNodes(leafIndex))),'square',... + 'MarkerSize',4,'MarkerEdgeColor','k',... + 'MarkerFaceColor','w'); + end + case 'radial' + h.BranchDots = plot(tr.x(branchIndex(tr.activeNodes(branchIndex))),... + tr.y(branchIndex(tr.activeNodes(branchIndex))),'o',... + 'MarkerSize',5,'MarkerEdgeColor','k',... + 'MarkerFaceColor','b'); + h.LeafDots = plot(tr.x(leafIndex(tr.activeNodes(leafIndex))),... + tr.y(leafIndex(tr.activeNodes(leafIndex))),'square',... + 'MarkerSize',4,'MarkerEdgeColor','k',... + 'MarkerFaceColor','w'); +end + +% resize figure if needed +switch renderType + case {'square','angular'} + switch orientation + case {'left','right'} + correctFigureSize(fig, 15 * numel(tr.terminalNodes),0); + fontRatio = max(get(fig,'Position').*[0 0 0 1])/numel(tr.terminalNodes); + case {'top','bottom'} + correctFigureSize(fig, 0, 15 * numel(tr.terminalNodes)); + fontRatio = max(get(fig,'Position').*[0 0 1 0])/numel(tr.terminalNodes); + end + case 'radial' + temp = 10/pi*numel(tr.terminalNodes); + correctFigureSize(fig,temp,temp); + fontRatio = max(get(fig,'Position').*[0 0 1 0])/numel(tr.terminalNodes); +end + +set(h.axes,'Fontsize',min(9,ceil(fontRatio/1.5))); + +% set branch node labels +X = tr.x(branchIndex(tr.activeNodes(tr.numLeaves+1:tr.numLabels))); +Y = tr.y(branchIndex(tr.activeNodes(tr.numLeaves+1:tr.numLabels))); +switch renderType + case {'square','angular'} + switch orientation + case {'left'} + h.branchNodeLabels = text(X+sepUnit(1)/2,Y,tr.names(branchIndex(tr.activeNodes(tr.numLeaves+1:tr.numLabels)))); + set(h.branchNodeLabels,'color',[0 0 .8],'clipping','on') + set(h.branchNodeLabels,'vertical','bottom') + set(h.branchNodeLabels,'horizontal','right') + set(h.branchNodeLabels,'Fontsize',min(8,ceil(fontRatio/2))); + case {'right'} + h.branchNodeLabels = text(X+sepUnit(1)/2,Y,tr.names(branchIndex(tr.activeNodes(tr.numLeaves+1:tr.numLabels)))); + set(h.branchNodeLabels,'color',[0 0 .8],'clipping','on') + set(h.branchNodeLabels,'vertical','bottom') + set(h.branchNodeLabels,'Fontsize',min(8,ceil(fontRatio/2))); + case {'top'} + h.branchNodeLabels = text(Y,X-sepUnit(1)/2,tr.names(branchIndex(tr.activeNodes(tr.numLeaves+1:tr.numLabels)))); + set(h.branchNodeLabels,'color',[0 0 .8],'clipping','on') + set(h.branchNodeLabels,'vertical','bottom','Rotation',30) + set(h.branchNodeLabels,'Fontsize',min(8,ceil(fontRatio/2))); + case {'bottom'} + h.branchNodeLabels = text(Y,X+sepUnit(1)/2,tr.names(branchIndex(tr.activeNodes(tr.numLeaves+1:tr.numLabels)))); + set(h.branchNodeLabels,'color',[0 0 .8],'clipping','on') + set(h.branchNodeLabels,'vertical','bottom','Rotation',30) + set(h.branchNodeLabels,'Fontsize',min(8,ceil(fontRatio/2))); + end + case 'radial' + h.branchNodeLabels = text(X,Y,tr.names(branchIndex(tr.activeNodes(tr.numLeaves+1:tr.numLabels)))); + set(h.branchNodeLabels,'color',[0 0 .8],'clipping','on') + set(h.branchNodeLabels,'vertical','bottom') + set(h.branchNodeLabels,'Fontsize',min(8,ceil(fontRatio*1.2))); + for ind = 1:numel(h.branchNodeLabels) + if X(ind)<0 + set(h.branchNodeLabels(ind),'horizontal','right') + set(h.branchNodeLabels(ind),'Position',get(h.branchNodeLabels(ind),'Position')+[sepUnit(1)/2 0 0]) + else + set(h.branchNodeLabels(ind),'horizontal','left') + set(h.branchNodeLabels(ind),'Position',get(h.branchNodeLabels(ind),'Position')-[sepUnit(1)/2 0 0]) + end + end +end + +% set leaf nodes labels +X = tr.x(leafIndex(tr.activeNodes(1:tr.numLeaves))); +Y = tr.y(leafIndex(tr.activeNodes(1:tr.numLeaves))); +switch renderType + case {'square','angular'} + switch orientation + case {'left'} + h.leafNodeLabels = text(X-sepUnit(1)/2,Y,tr.names(leafIndex(tr.activeNodes(1:tr.numLeaves)))); + set(h.leafNodeLabels,'color',[.5 .5 .5],'clipping','on') + set(h.leafNodeLabels,'horizontal','left') + set(h.leafNodeLabels,'Fontsize',min(8,ceil(fontRatio/2))); + case {'right'} + h.leafNodeLabels = text(X-sepUnit(1)/2,Y,tr.names(leafIndex(tr.activeNodes(1:tr.numLeaves)))); + set(h.leafNodeLabels,'color',[.5 .5 .5],'clipping','on') + set(h.leafNodeLabels,'horizontal','right') + set(h.leafNodeLabels,'Fontsize',min(8,ceil(fontRatio/2))); + case {'top'} + h.leafNodeLabels = text(Y,X-sepUnit(1)/2,tr.names(leafIndex(tr.activeNodes(1:tr.numLeaves)))); + set(h.leafNodeLabels,'color',[.5 .5 .5],'clipping','on') + set(h.leafNodeLabels,'horizontal','left','Rotation',60) + set(h.leafNodeLabels,'Fontsize',min(8,ceil(fontRatio/2))); + case {'bottom'} + h.leafNodeLabels = text(Y,X-sepUnit(1),tr.names(leafIndex(tr.activeNodes(1:tr.numLeaves)))); + set(h.leafNodeLabels,'color',[.5 .5 .5],'clipping','on') + set(h.leafNodeLabels,'horizontal','right','Rotation',60) + set(h.leafNodeLabels,'Fontsize',min(8,ceil(fontRatio/2))); + end + case 'radial' + h.leafNodeLabels = text(X,Y,tr.names(leafIndex(tr.activeNodes(1:tr.numLeaves)))); + set(h.leafNodeLabels,'color',[.5 .5 .5],'clipping','on') + set(h.leafNodeLabels,'Fontsize',min(8,ceil(fontRatio*1.2))); + % textHeight = mean(cell2mat(get(h.leafNodeLabels,'Extent')))*[0 0 0 1]'; + for ind = 1:numel(h.leafNodeLabels) + if X(ind)<0 + set(h.leafNodeLabels(ind),'horizontal','right') + set(h.leafNodeLabels(ind),'Position',get(h.leafNodeLabels(ind),'Position')+[sepUnit(1) 0 0]) + else + set(h.leafNodeLabels(ind),'horizontal','left') + set(h.leafNodeLabels(ind),'Position',get(h.leafNodeLabels(ind),'Position')-[sepUnit(1) 0 0]) + end + % a=atan(Y(ind)/X(ind))*180/pi; + % if a > 0 a = max(0,a-60)/2; else + % a = min(0,a+60)/2; end + % set(h.leafNodeLabels(ind),'Rotation',a) + end + [sortedY,hsY]=sort(Y); + idx=hsY(X(hsY)>0 & sortedY>0); + if numel(idx) + extentY = get(h.leafNodeLabels(idx(1)),'Extent')*[0;0;0;1]; + positionY = get(h.leafNodeLabels(idx(1)),'Position')*[0;1;0]; + for i = 2:numel(idx) + position = get(h.leafNodeLabels(idx(i)),'Position'); + positionY = max(positionY+extentY,position(2)); + position(2) = positionY; + set(h.leafNodeLabels(idx(i)),'Position',position) + end + end + idx=hsY(X(hsY)<0 & sortedY>0); + if numel(idx) + extentY = get(h.leafNodeLabels(idx(1)),'Extent')*[0;0;0;1]; + positionY = get(h.leafNodeLabels(idx(1)),'Position')*[0;1;0]; + for i = 2:numel(idx) + position = get(h.leafNodeLabels(idx(i)),'Position'); + positionY = max(positionY+extentY,position(2)); + position(2) = positionY; + set(h.leafNodeLabels(idx(i)),'Position',position) + end + end + idx=flipud(hsY(X(hsY)>0 & sortedY<0)); + if numel(idx) + extentY = get(h.leafNodeLabels(idx(1)),'Extent')*[0;0;0;1]; + positionY = get(h.leafNodeLabels(idx(1)),'Position')*[0;1;0]; + for i = 2:numel(idx) + position = get(h.leafNodeLabels(idx(i)),'Position'); + positionY = min(positionY-extentY,position(2)); + position(2) = positionY; + set(h.leafNodeLabels(idx(i)),'Position',position) + end + end + idx=flipud(hsY(X(hsY)<0 & sortedY<0)); + if numel(idx) + extentY = get(h.leafNodeLabels(idx(1)),'Extent')*[0;0;0;1]; + positionY = get(h.leafNodeLabels(idx(1)),'Position')*[0;1;0]; + for i = 2:numel(idx) + position = get(h.leafNodeLabels(idx(i)),'Position'); + positionY = min(positionY-extentY,position(2)); + position(2) = positionY; + set(h.leafNodeLabels(idx(i)),'Position',position) + end + end + +end + +% correct axis limits given the extent of labels +if dispBranchLabels + E = cell2mat(get(h.branchNodeLabels,'Extent')); + if strcmp(get(gca,'XDir'),'reverse') + E(:,1) = E(:,1) - E(:,3); + end + if strcmp(get(gca,'YDir'),'reverse') + E(:,2) = E(:,2) - E(:,4); + end + E=[E;[xlim*[1;0] ylim*[1;0] diff(xlim) diff(ylim)]]; + mins = min(E(:,[1 2])); + maxs = max([sum(E(:,[1 3]),2) sum(E(:,[2 4]),2)]); + axis([mins(1) maxs(1) mins(2) maxs(2)]) +end + +if dispLeafLabels + E = cell2mat(get(h.leafNodeLabels,'Extent')); + if strcmp(get(gca,'XDir'),'reverse') + E(:,1) = E(:,1) - E(:,3); + end + if strcmp(get(gca,'YDir'),'reverse') + E(:,2) = E(:,2) - E(:,4); + end + E=[E;[xlim*[1;0] ylim*[1;0] diff(xlim) diff(ylim)]]; + mins = min(E(:,[1 2])); + maxs = max([sum(E(:,[1 3]),2) sum(E(:,[2 4]),2)]); + axis([mins(1) maxs(1) mins(2) maxs(2)]) +end + +% set terminal nodes labels +switch renderType + case {'square','angular'} + X = tr.x(tr.terminalNodes) * 0; + Y = tr.y(tr.terminalNodes); + switch orientation + case {'left'} + X = X + max(xlim) - sepUnit(1)/2; + h.terminalNodeLabels = text(X,Y,tr.names(tr.terminalNodes)); + case {'right'} + X = X + max(xlim) - sepUnit(1)/2; + h.terminalNodeLabels = text(X,Y,tr.names(tr.terminalNodes)); + set(h.terminalNodeLabels,'Horizontal','right') + case {'top'} + X = X + max(ylim) - sepUnit(1)/2; + h.terminalNodeLabels = text(Y,X,tr.names(tr.terminalNodes)); + set(h.terminalNodeLabels,'Rotation',90) + case {'bottom'} + X = X + max(ylim) - sepUnit(1)/2; + h.terminalNodeLabels = text(Y,X,tr.names(tr.terminalNodes)); + set(h.terminalNodeLabels,'Rotation',270) + end + case 'radial' + h.terminalNodeLabels = text(0,0,' '); +end + +if dispTerminalLabels + set(h.terminalNodeLabels,'Fontsize',min(9,ceil(fontRatio/1.5))); +end + +if ~dispBranchLabels + set(h.branchNodeLabels,'visible','off'); +end +if ~dispLeafLabels + set(h.leafNodeLabels,'visible','off'); +end +if ~dispTerminalLabels + set(h.terminalNodeLabels,'visible','off'); +end + +box on +hold off + +% store handles +set(fig,'UserData',h) +if nargout + handles = h; +end + +% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function tr = doBasicCalculations(tr,activeBranches,renderType) + +% helper function to compute and find some features of the tree +tr.numLeaves = tr.numBranches + 1; +tr.numLabels = tr.numBranches + tr.numLeaves; + +% remove uderscores from names +for ind = 1:tr.numLabels + tr.names{ind}(tr.names{ind}=='_')=' '; +end + +% obtain parents for every node +tr.par(tr.tree(:)) = tr.numLeaves + [1:tr.numBranches 1:tr.numBranches]; + +% find active nodes +tr.activeNodes = true(tr.numLabels,1); +for ind =tr.numBranches:-1:1 + tr.activeNodes(tr.tree(ind,:)) = tr.activeNodes(tr.numLeaves+ind) & activeBranches(ind); +end + +% propagate last leaf +tr.lastleaf = 1:tr.numLabels; +for ind = tr.numBranches:-1:1 + if ~tr.activeNodes(tr.tree(ind,1)) + tr.lastleaf(tr.tree(ind,:))=tr.lastleaf(ind+tr.numLeaves); + end +end + +tr.activeBranches = tr.activeNodes(tr.numLeaves+1:tr.numLabels)&activeBranches; +tr.activeLeaves = tr.activeNodes(1:tr.numLeaves); + +% find x coordinates of branches +tr.x = tr.dist; +for ind = tr.numBranches:-1:1 + tr.x(tr.tree(ind,:)) = tr.x(tr.tree(ind,:)) + tr.x(ind+tr.numLeaves); +end + +% find y coordinates of branches +tr.terminalNodes = tr.lastleaf([true,diff(tr.lastleaf(1:tr.numLeaves))~=0]); +tr.y=zeros(tr.numLabels,1); +tr.y(tr.terminalNodes)=1:length(tr.terminalNodes); +switch renderType + case 'square' + for ind = 1:tr.numBranches + if tr.activeBranches(ind) + tr.y(ind+tr.numLeaves) = mean(tr.y(tr.tree(ind,:))); + end + end + case {'angular','radial'} + for ind = 1:tr.numBranches + if tr.activeBranches(ind) + if tr.x(tr.tree(ind,1))/tr.x(tr.tree(ind,2))>3 + tr.y(ind+tr.numLeaves) = tr.y(tr.tree(ind,1)); + elseif tr.x(tr.tree(ind,2))/tr.x(tr.tree(ind,1))>3 + tr.y(ind+tr.numLeaves) = tr.y(tr.tree(ind,2)); + else + tr.y(ind+tr.numLeaves) = mean(tr.y(tr.tree(ind,:))); + end + end + end +end + +% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function correctFigureSize(fig,recommendedHeight,recommendedWidth) +% helper function to increase initial figure size depending on the screen & +% tree sizes +screenSize = diff(reshape(get(0,'ScreenSize'),2,2),[],2)-[0;100]; +% 100 gives extra space for the figure header and win toolbar +position = get(fig,'Position'); +if recommendedHeight > position(4) + if recommendedHeight < sum(position([2 4])) + position(2) = sum(position([2 4])) - recommendedHeight; + position(4) = recommendedHeight; + elseif recommendedHeight < screenSize(2) + position(2) = 30; + position(4) = recommendedHeight; + else + position(2) = 30; + position(4) = screenSize(2); + end +end +if recommendedWidth > position(3) + if recommendedWidth < sum(position([1 3])) + position(1) = sum(position([1 3])) - recommendedWidth; + position(3) = recommendedWidth; + elseif recommendedWidth < screenSize(1) + position(1) = 0; + position(3) = recommendedHeight; + else + position(1) = 0; + position(3) = screenSize(1); + end +end +set(fig,'Position',position) diff --git a/matlab/graph/@phyTree/Prune.m b/matlab/graph/@phyTree/Prune.m new file mode 100644 index 0000000..fdd26a1 --- /dev/null +++ b/matlab/graph/@phyTree/Prune.m @@ -0,0 +1,197 @@ +function tr = prune(tr,sel,varargin) +%PRUNE Reduces a phylogenetic tree by removing branch and leaf nodes. +% T2 = PRUNE(T1,NODES) prunes the nodes listed in the NODES vector from +% the tree T1. Any branch (or leaf) node listed in NODES and all their +% descendants will disappear. The respective 'parent' nodes will be +% connected to the respective 'brother' nodes as required. NODES in the +% tree are indexed as [1:NUMLEAVES] for the leaves and as +% [NUMLEAVES+1 : NUMLEAVES+NUMBRANCHES] for the branches. NODES can also +% be a logical array of following sizes: [NUMLEAVES+NUMBRANCHES x 1], +% [NUMLEAVES x 1] or [NUMBRANCHES x 1]. +% +% T2 = PRUNE(T1,NODES,'MODE','EXCLUSIVE') changes the pruning mode to +% 'EXCLUSIVE', i.e. only the descendants of NODES will be pruned. Then +% NODES will become leaves as long as they do not have a predecessor in +% the list NODES. In this case pruning is the process of reducing a tree +% by turning some branch nodes into leaf nodes, and removing the leaf +% nodes under the original branch. Default is 'INCLUSIVE' and it behaves +% as explained above, i.e. the listed NODES are also pruned. +% +% Examples: +% +% % Load a phylogenetic tree created from a protein family: +% tr = phytreeread('pf00002.tree'); +% view(tr) +% +% % To remove all the 'mouse' proteins use: +% ind = getbyname(tr,'mouse'); +% tr = prune(tr,ind); +% view(tr) +% +% % To remove potential outliers in the tree use: +% [sel,sel_leaves] = select(tr,'criteria','distance','threshold',.3,... +% 'reference','leaves','exclude','leaves','propagate','toleaves'); +% tr = prune(tr,~sel_leaves) +% view(tr) +% +% See also PHYTREE, PHYTREE/SELECT, PHYTREE/GET, PHYTREETOOL. + +% Copyright 2003-2005 The MathWorks, Inc. +% $Revision: 1.1.4.8.2.1 $ $Date: 2004/11/30 03:45:24 $ + +% set default +exclusiveMode = false; + +if numel(tr)~=1 + error('Bioinfo:phytree:prune:NoMultielementArrays',... + 'Phylogenetic tree must be an 1-by-1 object.'); +end + +btr = tr; +numBranches = size(tr.tree,1); +numLeaves = numBranches + 1; +numLabels = numBranches + numLeaves; + +% validate sel +if islogical(sel) + if numel(sel)==numLabels + sel = sel(:)==true; + elseif numel(sel)==numLeaves + sel = [sel(:);false(numBranches,1)]; + elseif numel(sel)==numBranches + sel = [false(numLeaves,1);sel(:)]; + else + error('Bioinfo:IncorrectLogical',... + 'Logical vector must have the same number of elements as nodes in the Phylogenetic Tree'); + end +elseif isnumeric(sel) && isreal(sel) && all(sel>=1) && all(sel<=numLabels) + tem(numLabels)=false; + tem(floor(sel))=true; + sel=tem(:); +else + error('Bioinfo:IncorrectTypeofArguments','Invalid value for NODES'); +end + +nvarargin = numel(varargin); +if nvarargin + if rem(nvarargin,2) + error('Bioinfo:IncorrectNumberOfArguments',... + 'Incorrect number of arguments to %s.',mfilename); + end + okargs = {'mode',''}; + for j=1:2:nvarargin + pname = varargin{j}; + pval = varargin{j+1}; + k = find(strncmpi(pname,okargs,numel(pname))); + if isempty(k) + error('Bioinfo:UnknownParameterName',... + 'Unknown parameter name: %s.',pname); + elseif length(k)>1 + error('Bioinfo:AmbiguousParameterName',... + 'Ambiguous parameter name: %s.',pname); + else + switch(k) + case 1 % classifiers + modeOptions = {'exclusive','inclusive'}; + modeSelected = strmatch(lower(pval),modeOptions); %#ok + if isempty(modeSelected) + error('Bioinfo:NotValidMode','Not a valid mode.') + end + exclusiveMode = modeSelected==1; + end + end + end +end + +% shortcut for an empty sel +if ~sum(sel) + return; +end + +% when inclusiveMode if the two chidren of a branch are selected then the +% parent node should also be selected +if ~exclusiveMode + for ind = 1:numBranches + if all(sel(tr.tree(ind,:))) + sel(ind+numLeaves) = true; + end + end +end + +% find descendants not selected under selected nodes +for ind = numBranches:-1:1 + if sel(ind+numLeaves) + sel(tr.tree(ind,:))=true; + end +end + + if sel(numLabels) + warning('Bioinfo:PrunedRoot',... + 'Can not prune the root node in a Phylogenetic Tree.') + tr=btr; return + end + +% obtain parents for every node +parents(tr.tree(:)) = repmat(numLeaves+1:numLabels,2,1)'; + +if ~exclusiveMode % (the selected nodes are deleted with their descendants) + % find the top selected nodes in order to edit branches + htop = find(~[sel(parents);0]&sel); + % for every top node do the junction + for ind = 1:length(htop) + g=htop(ind); + mypar = parents(g); + if mypar < numLabels % my parent is NOT the root + % then connect brother to granparent + mygrpar = parents(mypar); % grandparent + myuncle = setxor(tr.tree(mygrpar-numLeaves,:),mypar); % uncle + mybro = setxor(tr.tree(mypar-numLeaves,:),g); % brother + tr.tree(mygrpar-numLeaves,:) = [myuncle mybro]; + tr.dist(mybro) = tr.dist(mybro) + tr.dist(mypar); + parents(mybro) = mygrpar; + end + sel(mypar) = true; %also delete my par + + end + if sum(~sel) == 1 + warning('Bioinfo:NotAMinimumTree',... + 'The selected nodes lead to only one leaf, Phylogenetic Tree not pruned.') + tr=btr; return + end + % find indexes to change tree + permuta = 1:numLabels; + permuta(sel) = []; + ipermuta(permuta) = 1:length(permuta); + permutaBranches = permuta(permuta>numLeaves)-numLeaves; + % update all tree structure fields + tr.names = tr.names(permuta); + tr.dist = tr.dist(permuta); + tr.dist(end) = 0; + tr.tree = tr.tree(permutaBranches,:); + tr.tree = ipermuta(tr.tree); + +else % exclusiveMode (the selected nodes are not deleted, only their descendants) + + % unselect leaves which are already in the top + sel(1:numLeaves)=sel(parents(1:numLeaves)); + % find the top selected nodes in order to edit branches + top = [~sel(parents);1] & sel; + % find the new leaves (no deleted leaves + branches that become leaves) + newLeaves = [~sel(1:numLeaves);top(numLeaves+1:end)]; + % find which branches will stay + stayingBranches = ~sel(numLeaves + 1 : numLabels); + % setting new indexes to change the tree architecture + permuta = [find(newLeaves);numLeaves+find(stayingBranches)]; + ipermuta(permuta) = 1:length(permuta); + % update all tree structure fields + tr.names = tr.names(permuta); + tr.dist = tr.dist(permuta); + tr.dist(end) = 0; + tr.tree = tr.tree(stayingBranches,:); + tr.tree = ipermuta(tr.tree); + % calling phytree with this format to force edge-crossing check + tr = phytree(tr.tree,tr.dist,tr.names); + +end % if ~exclusiveMode + + diff --git a/matlab/graph/@phyTree/SET.m b/matlab/graph/@phyTree/SET.m new file mode 100644 index 0000000..e2ab488 --- /dev/null +++ b/matlab/graph/@phyTree/SET.m @@ -0,0 +1,21 @@ +function tr = set(tr,varargin) %#ok +%SET Set object properties of a phylogenetic tree object. +% +% Properties in a phylogenetic tree object can not be manually set. +% A PHYTREE object must be created by its constructor method PHYTREE +% or by using one of the functions: PHYTREEREAD, SEQLINKAGE, SEQNEIGHJOIN. +% +% See also: PHYTREE, PHYTREEREAD, SEQLINKAGE, SEQNEIGHJOIN. + +% Copyright 2003-2006 The MathWorks, Inc. +% $Revision: 1.1.8.2.2.1 $ $Author: batserve $ $Date: 2006/07/27 21:37:51 $ + +% error('Bioinfo:phytree:set:NotAllowedMethod',... +% ['Properties in a phylogenetic tree object can not be manually set.\n'... +% 'A PHYTREE object must be created by its constructor method PHYTREE\n'... +% 'or by using one of the functions: PHYTREEREAD, SEQLINKAGE, SEQNEIGHJOIN.']) + +% numBranches = size(tr.tree,1); +% numLeaves = numBranches + 1; +% +% tr.names(1:numLeaves)= varargin{2}'; \ No newline at end of file diff --git a/matlab/graph/@phyTree/Select.m b/matlab/graph/@phyTree/Select.m new file mode 100644 index 0000000..aaf28fb --- /dev/null +++ b/matlab/graph/@phyTree/Select.m @@ -0,0 +1,313 @@ +function [sel,sell,selb] = select(tr,varargin) +%SELECT Selects tree branches and leaves. +% +% S = SELECT(T,N) returns a logical vector S of size [NUMNODES x 1] +% indicating N closest nodes to the root node of the phylogenetic tree +% object T (NUMNODES = NUMLEAVES + NUMBRANCHES). The first criteria used +% is branch levels, then patristic distance, also known as tree distance. +% By default SELECT, uses INF as the value of N, therefore SELECT(T) will +% return a vector of 'trues'. +% +% S = SELECT(...,'REFERENCE',R) changes the reference point(s) to measure +% the closeness. R can be 'root' (default) or 'leaves'. When using +% 'leaves', a node to be tested may have different distances to its +% descendant leaves, which are the references (e.g. a non-ultrametric +% tree), if this the case the minimum distance to any descendant leaf +% will be considered. R may also be an index which points to any node of +% the tree. +% +% S = SELECT(...,'CRITERIA',C) changes the criteria used to measure +% closeness. If C='levels' (default) then the first criteria is branch +% levels and then patristic distance. If C='distance' then the first +% criteria is patristic distance and then branch levels. +% +% S = SELECT(...,'THRESHOLD',V) selects all the nodes which closeness is +% less or equal than the threshold value V. Observe that either +% 'criteria' or either 'reference' can be used. If N is not specified +% N = INF, otherwise the output can be further size limited by N. +% +% S = SELECT(...,'EXCLUDE',E) sets a post-filter which excludes all the +% branch nodes from S when E=='branches' or all the leaf nodes when +% E=='leaves'. The default is 'none'. +% +% S = SELECT(...,'PROPAGATE',P) activates a post-functionality which +% propagates the selected nodes to the leaves when P=='toleaves' or +% towards the root finding a common ancestor when P=='toroot'. The +% default is 'none', P may also be 'both'. 'PROPAGATE' switch acts after +% 'EXCLUDE' switch. +% +% [S,SELLEAVES,SELBRANCHES] = SELECT(...) returns two additional logical +% vectors, one for the selected leaves and one for the selected branches. +% +% Examples: +% +% % Load a phylogenetic tree created from a protein family: +% tr = phytreeread('pf00002.tree'); +% +% % To find close products for a given protein (e.g. vips_human): +% ind = getbyname(tr,'vips_human'); +% [sel,sel_leaves] = select(tr,'criteria','distance',... +% 'threshold',0.6,'reference',ind); +% view(tr,sel_leaves) +% +% % To find potential outliers in the tree use: +% [sel,sel_leaves] = select(tr,'criteria','distance','threshold',.3,... +% 'reference','leaves','exclude','leaves','propagate','toleaves'); +% view(tr,~sel_leaves) +% +% +% See also PHYTREE, PHYTREE/GET, PHYTREE/PDIST, PHYTREE/PRUNE, PHYTREETOOL. + +% +% Copyright 2003-2006 The MathWorks, Inc. +% $Revision: 1.1.6.7.2.1 $ $Author: batserve $ $Date: 2006/07/27 21:37:50 $ + +if numel(tr)~=1 + error('Bioinfo:phytree:select:NoMultielementArrays',... + 'Phylogenetic tree must be an 1-by-1 object.'); +end + +% set defaults +V=inf; +CriteriaIsDistance = false; +ReferenceIs = 'root'; +ExcludeSwitch = false; +PostPropagate = false; + +% check is first argument is N, otherwise set N to default +if (nargin>1 && isnumeric(varargin{1}) && isreal(varargin{1})) + N = floor(varargin{1}); + first_arg = 3; +else + N = inf; + first_arg = 2; +end + +numBranches = size(tr.tree,1); +numLeaves = numBranches + 1; +numLabels = numBranches + numLeaves; + +% identify input arguments +if nargin - first_arg + 1 > 0 + if rem(nargin - first_arg,2) == 0 + error('Bioinfo:phytree:select:IncorrectNumberOfArguments',... + 'Incorrect number of arguments to %s.',mfilename); + end + okargs = {'reference','criteria','threshold','exclude','propagate'}; + for j=first_arg - 1 : 2 : nargin - first_arg + 1 + pname = varargin{j}; + pval = varargin{j+1}; + k = find(strncmpi(pname,okargs,numel(pname))); + if isempty(k) + error('Bioinfo:phytree:select:UnknownParameterName',... + 'Unknown parameter name: %s.',pname); + elseif length(k)>1 + error('Bioinfo:phytree:select:AmbiguousParameterName',... + 'Ambiguous parameter name: %s.',pname); + else + switch(k) + case 1 % reference + if islogical(pval) + if numel(pval)==numLabels + pval = pval(:)==true; + elseif numel(pval)==numLeaves + pval = [pval(:);false(numBranches,1)]; + elseif numel(pval)==numBranches + pval = [false(numLeaves,1);pval(:)]; + else + error('Bioinfo:phytree:select:InvalidSizeLogicalReferenceNode',... + 'When reference node is a logical vector it must contain NUMNODES, NUMLEAVES or NUMBRANCHES elements.') + end + pval = find(pval); + ReferenceIs = 'node'; + if numel(pval) ~= 1 + error('Bioinfo:phytree:select:InvalidValuesLogicalReferenceNode',... + 'When reference node is a logical vector one element must be true and all others false.') + else + ReferenceNode = pval; + end + elseif isnumeric(pval) + ReferenceIs = 'node'; + if numel(pval) ~= 1 + error('Bioinfo:phytree:select:InvalidSizeReferenceNode',... + 'Reference node must be scalar.') + elseif all(pval~=1:numLabels) + error('Bioinfo:phytree:select:InvalidValueReferenceNode',... + 'Incorrect reference node.') + else + ReferenceNode = pval; + end + else + h = strmatch(lower(pval),{'root','leaves'}); %#ok + if numel(h) + switch(h) + case 1 + ReferenceIs = 'root'; + case 2 + ReferenceIs = 'leaves'; + end + else error('Bioinfo:phytree:select:InvalidStringReferenceNode',... + 'Invalid string for the reference node.'); + end + end + case 2 % criteria + h = strmatch(lower(pval),{'distance','levels'}); %#ok + if numel(h) + CriteriaIsDistance = (h == 1); + else error('Bioinfo:phytree:select:InvalidCriteria',... + 'Invalid string for criteria.'); + end + case 3 % threshold + V = pval; + if (~isnumeric(V) || ~isreal(V) || numel(V)>1) + error('Bioinfo:phytree:select:InvalidThreshold',... + 'Invalid value for V.'); + end + case 4 % exclude + h = strmatch(lower(pval),{'branches','leaves','none'}); %#ok + if numel(h) + switch(h) + case 1 + ExcludeSwitch = true; + ExcludeType = 'branches'; + case 2 + ExcludeSwitch = true; + ExcludeType = 'leaves'; + case 3 + ExcludeSwitch = false; + end + else error('Bioinfo:phytree:select:InvalidExcludeOption',... + 'Invalid string for exclude switch.'); + end + case 5 % propagate + h = strmatch(lower(pval),{'toleaves','toroot','both','none'}); %#ok + if numel(h) + switch(h) + case 1 + PostPropagate = true; + PostPropagateType = 'toleaves'; + case 2 + PostPropagate = true; + PostPropagateType = 'toroot'; + case 3 + PostPropagate = true; + PostPropagateType = 'both'; + case 4 + PostPropagate = false; + end + else error('Bioinfo:phytree:select:InvalidPostPropagate',... + 'Invalid string for post-propagate switch.'); + end + end % switch(k) + end % if ... + end % for j=... +end % nargin + +% calculate the distance (and levels) of every node to the reference +levels2Ref = zeros(numLabels,1); +switch ReferenceIs + case 'root' + % calculate the distance to the root for every node + dist2Ref = tr.dist; + for ind = numBranches:-1:1 + dist2Ref(tr.tree(ind,:)) = ... + dist2Ref(tr.tree(ind,:)) + dist2Ref(ind+numLeaves); + levels2Ref(tr.tree(ind,:)) = levels2Ref(ind+numLeaves) + 1; + end + case 'leaves' + % calculate the distance to the closest leaf for every node + dist2Ref = zeros(numLabels,1); + for ind = 1:numBranches + dist2Ref(ind+numLeaves) = ... + min(dist2Ref(tr.tree(ind,:))+tr.dist(tr.tree(ind,:))); + levels2Ref(ind+numLeaves) = min(levels2Ref(tr.tree(ind,:))) + 1; + end + case 'node' + refVector = zeros(numLabels,1); + refVector(ReferenceNode)=1; + dist2Ref = pdist(tr,'SquareForm',true,'nodes','all')... + * refVector; + tr.dist = ones(numLabels,1); % to count now levels ! + levels2Ref = pdist(tr,'SquareForm',true,'nodes','all') ... + * refVector; +end + +% applies the threshold value +if CriteriaIsDistance + sel = dist2Ref < V; +else % ~CriteriaIsDistance + sel = levels2Ref < V; +end % if CriteriaIsDistance + +% needs to remove additional nodes because of N +if sum(sel)>N + if CriteriaIsDistance + [dum,h]=sortrows([dist2Ref levels2Ref]); %#ok + else % ~CriteriaIsDistance + [dum,h]=sortrows([levels2Ref dist2Ref]); %#ok + end % if CriteriaIsDistance + g=h(sel(h)); + sel(g(N+1:end))=false; +end + +% exclude option +if ExcludeSwitch + switch ExcludeType + case 'branches' + sel((1+numLeaves):numLabels) = false; + case 'leaves' + sel(1:numLeaves) = false; + end +end + +% post-propagate option +if PostPropagate + + % expands all the current nodes towards the leaves + if any(strcmp({'toleaves','both'},PostPropagateType)) + for ind = numBranches:-1:1 + if sel(ind+numLeaves) + sel(tr.tree(ind,:))=true; + end + end + end + + % propagates towards the root finding the common ancestors + if any(strcmp({'toroot','both'},PostPropagateType)) + + % find closest common branch for every pair of nodes + % diagonal is invalid ! but not needed + + % initializing full matrix + commf = zeros(numLabels,'int16'); + children = false(1,numLabels); + for ind = numBranches:-1:1 + children(:) = false; + children(ind+numLeaves) = true; + for ind2 = ind:-1:1 + if children(ind2+numLeaves) + children(tr.tree(ind2,:))=true; + end + end + commf(children,children)=int16(ind); + end + commf = commf(sel,sel); + commf = commf - diag(diag(commf)); + commf = unique(commf(commf(:)>0)); + sel(commf+numLeaves) = true; + + % now propagates towards the common ancestor + for ind = 1:double(max(commf)) + if any(sel(tr.tree(ind,:))) + sel(ind+numLeaves) = true; + end + end + end +end + +if nargout > 1 + sell = sel(1:numLeaves); +end +if nargout > 2 + selb = sel(numLeaves+1:numLabels); +end diff --git a/matlab/graph/@phyTree/Subtree.m b/matlab/graph/@phyTree/Subtree.m new file mode 100644 index 0000000..f119bde --- /dev/null +++ b/matlab/graph/@phyTree/Subtree.m @@ -0,0 +1,91 @@ +function subtr = subtree(tr,nodes) +%SUBTREE Extracts a subtree. +% +% T2 = SUBTREE(T1,NODES) Extracts a new subtree T2 in which the new root +% is the first common ancestor of the NODES vector from T1. NODES in the +% tree are indexed as [1:NUMLEAVES] for the leaves and as [NUMLEAVES+1 : +% NUMLEAVES+NUMBRANCHES] for the branches. NODES can also be a logical +% array of following sizes: [NUMLEAVES+NUMBRANCHES x 1], [NUMLEAVES x 1] +% or [NUMBRANCHES x 1]. +% +% Example: +% +% % Load a phylogenetic tree created from a protein family: +% tr = phytreeread('pf00002.tree') +% +% % Get the subtree that contains the VIPS and CGRR human proteins: +% sel = getbyname(tr,{'vips_human','cgrr_human'}); +% sel = any(sel,2); +% tr = subtree(tr,sel) +% view(tr); +% +% See also PHYTREE, PHYTREE/PRUNE, PHYTREE/SELECT, PHYTREE/GET, +% PHYTREE/GETBYNAME. + +% Copyright 2003-2005 The MathWorks, Inc. +% $Revision: 1.1.8.1 $ $Author: batserve $ $Date: 2005/06/09 21:57:04 $ + +if numel(tr)~=1 + error('Bioinfo:phytree:subtree:NoMultielementArrays',... + 'Phylogenetic tree must be an 1-by-1 object.'); +end +numBranches = size(tr.tree,1); +numLeaves = numBranches + 1; +numLabels = numBranches + numLeaves; + +% validate nodes +if islogical(nodes) + if numel(nodes)==numLabels + nodes = nodes(:)==true; + elseif numel(nodes)==numLeaves + nodes = [nodes(:);false(numBranches,1)]; + elseif numel(nodes)==numBranches + nodes = [false(numLeaves,1);nodes(:)]; + else + error('Bioinfo:phytree:subtree:IncorrectSizeInputVector',... + 'Logical vector must have the same number of elements as nodes in the Phylogenetic Tree.'); + end +elseif isnumeric(nodes) && isreal(nodes) && all(nodes(:)>=1) && all(nodes(:)<=numLabels) + tem = false(numLabels,1); + tem(floor(nodes(:))) = true; + nodes=tem(:); +else + error('Bioinfo:phytree:subtree:InvalidInputNodes',... + 'Invalid value for NODES.'); +end + +% at this point NODES should only be a logical vector + +if (~any(nodes(numLeaves+1:numLabels)) && (sum(nodes(1:numLeaves))<2)) + error('Bioinfo:phytree:subtree:InvalidSubtree',... + 'Subtree must contain at least two leaves.'); +end + + +% look for the first common ancestor that contains all selected nodes, +% accumulating the selected nodes towards the root, the common ancestor +% will be the first sum equal to the number of selected nodes +branchWidth = double(nodes); +for ind = 1:numBranches + branchWidth(numLeaves+ind) = branchWidth(numLeaves+ind) + ... + sum(branchWidth(tr.tree(ind,:))); +end +commonAncestor = find(branchWidth==sum(nodes),1); + +% now propagate the ancestor new) root towards the leaves to find all the +% nodes that should stay for the subtree +sel = false(1,numLabels); +sel(commonAncestor) = true; +for ind = commonAncestor:-1:numLeaves+1 + sel(tr.tree(ind-numLeaves,:)) = sel(ind); +end + +% extract the subtree +permuta = find(sel); +subtr=phytree; +ipermuta(permuta) = 1:length(permuta); +subtrNumLeaves = (ipermuta(end) + 1)/2; +subtr.tree = ipermuta(tr.tree(permuta(subtrNumLeaves+1:end)-numLeaves,:)); +subtr.dist = tr.dist(permuta); +subtr.names = tr.names(permuta); +subtr.dist(end) = 0; diff --git a/matlab/graph/@phyTree/View.m b/matlab/graph/@phyTree/View.m new file mode 100644 index 0000000..2fef37e --- /dev/null +++ b/matlab/graph/@phyTree/View.m @@ -0,0 +1,1618 @@ +function View(tr,sel,propsForFigure) +%VIEW views a phylogenetic tree in phytreetool. +% +% VIEW(TREE) shows a phylogenetic tree object. The significant distances +% between branches and nodes are in horizontal direction, vertical +% coordinates are accommodated only for display purposes. Tree +% Edit/Analysis tools are accessible through the mouse left/right buttons +% and also using the 'Tree' menu. +% +% VIEW(TREE,SEL) starts the viewer with an initial selection of nodes +% specified by SEL. SEL can be a logical array of any of the following +% sizes: [NUMLEAVES+NUMBRANCHES x 1], [NUMLEAVES x 1], or [NUMBRANCHES x +% 1]. SEL may also be a list of indices. +% +% Examples: +% +% tr = phytreeread('pf00002.tree') +% view(tr) +% +% See also PHYTREE, PHYTREE/PLOT, PHYTREEREAD, PHYTREETOOL, SEQLINKAGE, +% SEQNEIGHJOIN. + +% Copyright 2003-2006 The MathWorks, Inc. +% $Revision: 1.1.6.19.2.1 $ $Author: batserve $ $Date: 2006/07/24 13:54:28 $ + +if numel(tr)~=1 + error('Bioinfo:phytree:view:NoMultielementArrays',... + 'Phylogenetic tree must be an 1-by-1 object.'); +end + +tr = doBasicCalculations(tr); + +nodeIndex = 1:tr.numLabels; +leafIndex = 1:tr.numLeaves; +branchIndex = tr.numLeaves+1:tr.numLabels; + +% check empty names +for ind = nodeIndex + if isempty(tr.names{ind}) + if ind > tr.numLeaves + tr.names{ind} = ['Branch ' num2str(ind-tr.numLeaves)]; + else + tr.names{ind} = ['Leaf ' num2str(ind)]; + end + end +end + +% initial drawing +if nargin<3 + propsForFigure.Name = ['Phylogenetic Tree Tool ' getphytreetoolnumber]; +end +propsForFigure.PruneWarning = getacceptedwarningfromothertools; +fig = figure('Renderer','ZBuffer','Name',propsForFigure.Name,... + 'NumberTitle','off','IntegerHandle','off','tag','PhyTreeTool'); +setappdata(fig,'propsForFigure',propsForFigure) +setappdata(fig,'backupTree',tr) +tr.ha = axes; hold on; +set(tr.ha,'Position',[.05 .05 .7 .9],'YTick',leafIndex,'FontSize',9,'Ydir','reverse',... + 'YAxisLocation','Right','YTickLabel',char(tr.names{leafIndex})) +tr.hlines = plot( ... + tr.x([nodeIndex;repmat([tr.par(1:tr.numLabels-1) tr.numLabels],2,1)]),... + tr.y([repmat(nodeIndex,2,1);[tr.par(1:tr.numLabels-1) tr.numLabels]]),... + '-k'); +tr.hpathline = plot(1,1,'--r','LineWidth',2,'Visible','off'); +tr.hdragbox = plot(1,1,':k','LineWidth',1,'Visible','off'); +tr.hdots(1,1) = plot(tr.x(branchIndex),tr.y(branchIndex),'o',... + 'MarkerSize',5,'MarkerEdgeColor','k','MarkerFaceColor','b'); +tr.hdots(1,2) = plot(tr.x(leafIndex),tr.y(leafIndex),'square',... + 'MarkerSize',4,'MarkerEdgeColor','k','MarkerFaceColor','w'); +tr.hseldots(1,1) = plot(tr.x(branchIndex),tr.y(branchIndex),'o',... + 'MarkerSize',5,'MarkerEdgeColor','r','MarkerFaceColor','r'); +tr.hseldots(1,2) = plot(tr.x(leafIndex),tr.y(leafIndex),'square',... + 'MarkerSize',4,'MarkerEdgeColor','r','MarkerFaceColor','r'); +tr.hldots(1,1) = plot(tr.x(branchIndex),tr.y(branchIndex),'o',... + 'MarkerSize',5,'MarkerEdgeColor',[.5 .5 .5],... + 'MarkerFaceColor',[.6 .6 1]); +tr.hldots(1,2) = plot(tr.x(leafIndex),tr.y(leafIndex),'square',... + 'MarkerSize',4,'MarkerEdgeColor',[.5 .5 .5],... + 'MarkerFaceColor','w'); +set(tr.hldots(1),'Xdata',[],'Ydata',[]) +set(tr.hldots(2),'Xdata',[],'Ydata',[]) +tr.axhold = plot([-eps -eps],[0 0],'.','MarkerSize',eps,'Color','w'); +tr.datatip = text(0,1,1,'k','Tag','TreeTag','BackgroundColor',[1 1 .93],... + 'Color', [0 0 0],'EdgeColor', [0.8 0.8 0.8],... + 'VerticalAlignment','Top','Clipping','off',... + 'Visible','off','Fontsize',8,'Interpreter','none'); + +if nargin == 1 || isempty(sel) + tr.selected = false(tr.numLabels,1); % selected nodes +else + % validate sel + if islogical(sel) + if numel(sel)==tr.numLabels + sel = sel(:)==true; + elseif numel(sel)==tr.numLeaves + sel = [sel(:);false(tr.numBranches,1)]; + elseif numel(sel)==tr.numBranches + sel = [false(tr.numLeaves,1);sel(:)]; + else + close(fig) + error('Bioinfo:phytree:view:IncorrectLogical',... + 'Logical vector must have the same number of elements as nodes in the Phylogenetic Tree'); + end + elseif isnumeric(sel) && isreal(sel) && all(sel>=1) && all(sel<=tr.numLabels) + tem(tr.numLabels)=false; + tem(floor(sel))=true; + sel=tem(:); + else + close(fig) + error('Bioinfo:phytree:view:IncorrectTypeofArguments',... + 'Invalid value for NODES'); + end + tr.selected =sel; +end + +% save more figure data needed for the gui functionality +tr.activeNodes = true(tr.numLabels,1); % active nodes +tr.activeBranches = true(tr.numBranches,1); % active Branches +tr.sel2root = false(tr.numLabels,1); % path sel-node to root +tr.editMode = 'Select'; % initial edit mode +tr.indicativeMode = false; % data-tip flag +tr.lastThresholdValue = []; % remembers last cut + +% create uicontrols (will appear as needed, initially invisible) +tr.editBox = uicontrol(fig,'Background',[1 1 1],'style','edit',... + 'visible','off','callback',@doneRenaming); +tr.slider = uicontrol(fig,'style','slider','SliderStep',[.1 .1],... + 'visible','off','callback',@sliderCallback); +tr.slidertx = uicontrol(fig,'style','text','visible','off'); +tr.sliderok = uicontrol(fig,'style','pushbutton','visible','off',... + 'string','OK','callback',@doThresholdCut); + +% setup callback for click over nodes +set([tr.hseldots,tr.hdots,tr.hldots],'ButtonDownFcn',@toggleNode) +% setup figure callback functions +set(fig,'WindowButtonDownFcn',@mouseClickOnFigure); +set(fig,'WindowButtonUpFcn',@mouseRelease); +set(fig,'WindowButtonMotionFcn',@localWindowButtonMotion); + +% setup UIMenus, context menus and toolbar +tr.hToggleUIMenu = makePhyTreeViewerUIMenus(fig); +tr.hToggleToolbar = makePhyTreeViewerToolbar(fig); +[tr.hToggleContextMenu,tr.hAxisContextMenu,tr.hDotsContextMenu] = ... + makePhyTreeViewerContextMenus(fig); +% activate Context Menus +set(tr.ha,'UIContextMenu',tr.hAxisContextMenu); +set([tr.hdots tr.hldots tr.hseldots],'UIContextMenu',tr.hDotsContextMenu); + +set(fig,'UserData',tr) % save figure data + +correctFigureSize(fig, 15 * tr.numLeaves); % resize figure if needed +setupYLabelsListeners; % listeners for YLabels +updateTree(fig,[],[]) % updates figure after all initializations +set(gca,'xLim',[0 max(tr.x)] + max(tr.x) * [-.1 .05]); +tr.yLim = get(tr.ha,'Ylim');tr.xLim = get(tr.ha,'Xlim'); +set(fig,'UserData',tr) % save figure data +toolsmenufcn(fig,'PanY') % set zoom mode to vertical constraining +toolsmenufcn(fig,'ZoomY') % set pan mode to vertical constraining +set(fig,'HandleVisibility','callback') % after all init, make it invisible + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function ylabelsListener(hSrc,event,hf,ha) %#ok +% Auto sizes the ylabels +ratio = max(get(hf,'Position').*[0 0 0 1])/diff(get(ha,'YLim')); +set(ha,'Fontsize',min(9,ceil(ratio/1.7))); % the gold formula +% Also verify if we need to re-position the slidebar of threshold cut +tr=get(hf,'Userdata'); +if any(strcmp(tr.editMode,{'Distance to Leaves','Distance to Root'})) + wS = get(hf,'Position'); % window dimensions + aP = get(tr.ha,'Position'); % axes position + set(tr.slider, 'Position',[aP(1)*wS(3) wS(4)-20 aP(3)*wS(3) 20]) + set(tr.slidertx,'Position',[sum(aP([1 3]))*wS(3) wS(4)-20 60 20]) + set(tr.sliderok,'Position',[sum(aP([1 3]))*wS(3)+60 wS(4)-20 30 20]) +end + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function mouseClickOnFigure(h,varargin) +% This callback function is activated when a mouse button is pressed in any +% location of the figure and under any of my edit modes +tr = get(gcbf,'Userdata'); +switch tr.editMode + case 'Renaming'; doneRenaming(h,varargin); + case 'Distance to Leaves'; cancelThresholdCut(h,varargin); + case 'Distance to Root'; cancelThresholdCut(h,varargin); + case 'Select'; + switch get(gcbf,'SelectionType') + case {'normal','extend'} + tr = get(gcbf,'userdata'); + cp = get(tr.ha,'CurrentPoint'); + xPos = cp(1,1); yPos = cp(1,2); + set(tr.hdragbox,'Visible','on',... + 'Xdata',repmat(xPos,5,1),'Ydata',repmat(yPos,5,1)) + case 'open' + autoFit(h) + end +end + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function toggleNode(h,varargin) +% This callback function is activated when a mouse button is pressed over +% any of the displayed nodes under any of my edit modes +hideActiveIndicators(h,varargin) +tr = get(gcbf,'Userdata'); +switch get(gcbf,'SelectionType') + case 'normal' + switch tr.editMode + case 'Select'; selectNode(h,varargin); + case 'Inspect'; inspectNode(h,varargin); + case 'Collapse/Expand'; collapseExpand(h,varargin); + case 'Rotate Branch'; rotateBranch(h,varargin); + case 'Rename'; renameNode(h,varargin); + case 'Renaming'; doneRenaming(h,varargin); + case 'Prune'; pruneTree(h,varargin); + case 'Distance to Leaves'; cancelThresholdCut(h,varargin); + case 'Distance to Root'; cancelThresholdCut(h,varargin); + end + case 'extend' + switch tr.editMode + case 'Select'; selectNode(h,varargin); + end + case 'alt' + case 'open' + end + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function changeEditMode(h,varargin) %#ok +% Callback function to change the edit mode, this function is +% called from the toolbar, the context menu or the uimenu. +tr = get(gcbf,'Userdata'); +myModes = {'Inspect','Collapse/Expand','Rotate Branch','Rename','Prune'}; + +% first, disable any present edit mode +switch tr.editMode + case {myModes{:},'Select'}; + disableMyContextMenus(h) + disableMyWindowButtonActions(h) + ind = strmatch(tr.editMode,myModes); %#ok + set(tr.hToggleToolbar(ind), 'State','off') + set(tr.hToggleUIMenu(ind), 'Checked','off') + set(tr.hToggleContextMenu(ind),'Checked','off') + %case '&Zoom In'; toolsmenufcn(gcbf,'ZoomIn'); + case '&Zoom In'; zoom(gcbf,'off') + %case 'Zoom &Out'; toolsmenufcn(gcbf,'ZoomOut'); + case 'Zoom &Out'; zoom(gcbf,'off') + %case '&Pan'; toolsmenufcn(gcbf,'Pan'); + case '&Pan'; pan(gcbf,'off'); + case {'Distance to Leaves','Distance to Root'} + enableAllUI(h) + disableMyWindowButtonActions(h) +end + +% depending on the caller instance, determine the new edit mode +switch get(h,'Type') + case 'uimenu'; newEditMode = get(h,'Label'); + case 'uitoggletool' + newEditMode = get(h,'Tag'); + switch newEditMode + case 'Exploration.ZoomIn'; newEditMode = '&Zoom In'; + case 'Exploration.ZoomOut'; newEditMode = 'Zoom &Out'; + case 'Exploration.Pan'; newEditMode = '&Pan'; + end + otherwise; newEditMode = 'Select'; +end +%disp( [tr.editMode ' --> ' newEditMode] ) +% if new mode is the same then we are toggling off +if strcmp(newEditMode,tr.editMode) + newEditMode = 'Select'; +end + +% if changing to Prune, verify the warnign has been accepted +if strcmp(newEditMode,'Prune') + propsForFigure = getappdata(gcbf,'propsForFigure'); + if isequal(propsForFigure.PruneWarning,'NotDone') + warndlg(['Pruning nodes cannot be undone. Before continuing,',... + ' you may want to export the current tree to a new tool.'],... + 'Warning','modal') + setacceptedwarningtoothertools + end +end + +switch newEditMode + case '&Zoom In'; toolsmenufcn(gcbf,'ZoomIn'); + case 'Zoom &Out'; toolsmenufcn(gcbf,'ZoomOut'); + %case '&Pan'; toolsmenufcn(gcbf,'Pan'); + case '&Pan'; pan(gcbf,'on'); + case myModes; enableMyContextMenus(h) + enableMyWindowButtonActions(h) + ind = strmatch(newEditMode,myModes); %#ok + set(tr.hToggleToolbar(ind), 'State','on') + set(tr.hToggleUIMenu(ind), 'Checked','on') + set(tr.hToggleContextMenu(ind),'Checked','on') + case 'Select'; enableMyContextMenus(h) + enableMyWindowButtonActions(h) + case {'Distance to Leaves','Distance to Root'} + disableAllUI(h) + enableMyWindowButtonActions(h) + set(gcbf,'WindowButtonMotionFcn',[]) + set(gcbf,'WindowButtonUpFcn',[]) + set([tr.hseldots,tr.hdots,tr.hldots],'ButtonDownFcn',[]) +end + +switch newEditMode + case 'Inspect'; if sum(tr.selected(:)) ~= 1 + tr.selected(:) = false; + tr.selected(end) = true; + end + otherwise +end + +tr.sel2root = path2root(tr, tr.selected); +tr.editMode = newEditMode; +set(gcbf,'userdata',tr) +updateTree(gcbf,[],[]) + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function hideActiveIndicators(h,varargin) %#ok +tr = get(gcbf,'userdata'); +set([tr.hpathline,tr.datatip],'visible','off') + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function disableAllUI(h,varargin) %#ok +hw = findall(gcbf,'Type','uimenu','Parent',gcbf); +gw = findall(gcbf,'Type','UIToggleTool'); +set([hw;gw],'Enable','off') + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function enableAllUI(h,varargin) %#ok +hw = findall(gcbf,'Type','uimenu','Parent',gcbf); +gw = findall(gcbf,'Type','UIToggleTool'); +set([hw;gw],'Enable','on') + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function disableMyContextMenus(h,varargin) %#ok +tr = get(gcbf,'userdata'); +set(tr.ha,'UIContextMenu',[]); +set([tr.hdots tr.hseldots],'UIContextMenu',[]); + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function enableMyContextMenus(h,varargin) %#ok +tr = get(gcbf,'userdata'); +set(tr.ha,'UIContextMenu',tr.hAxisContextMenu); +set([tr.hdots tr.hldots tr.hseldots],'UIContextMenu',tr.hDotsContextMenu); + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function disableMyWindowButtonActions(h,varargin) %#ok +set(gcbf,'WindowButtonDownFcn',[]); +set(gcbf,'WindowButtonUpFcn',[]) +set(gcbf,'WindowButtonMotionFcn',[]); +tr = get(gcbf,'userdata'); +set([tr.hseldots,tr.hdots,tr.hldots],'ButtonDownFcn',[]) + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function enableMyWindowButtonActions(h,varargin) %#ok +set(gcbf,'WindowButtonDownFcn',@mouseClickOnFigure); +set(gcbf,'WindowButtonUpFcn',@mouseRelease); +set(gcbf,'WindowButtonMotionFcn',@localWindowButtonMotion); +tr = get(gcbf,'userdata'); +set([tr.hseldots,tr.hdots,tr.hldots],'ButtonDownFcn',@toggleNode) +set(gcbf,'KeyPressFcn',[]); +set(gcbf,'Pointer','arrow'); + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function localWindowButtonMotion(h,varargin) %#ok +% Callback function activated when moving over the axes, checks location of +% the mouse and puts datatip if over an active node. + +tr = get(h,'userdata'); +% set a virtual grid to get the point +xThres=diff(get(tr.ha,'Xlim'))/100; +yThres=diff(get(tr.ha,'Ylim'))/100; +cp = get(tr.ha,'CurrentPoint'); +xPos = cp(1,1); yPos = cp(1,2); +hp = tr.x<(xPos+xThres) & tr.x>(xPos-xThres) & ... + tr.y<(yPos+yThres) & tr.y>(yPos-yThres); +hp = find (hp & tr.activeNodes); + +% shortcut out when dragging a box in select mode +if strcmp(get(tr.hdragbox,'Visible'),'on') + xdata = get(tr.hdragbox,'XData');xdata([3,4]) = xPos; + ydata = get(tr.hdragbox,'YData');ydata([2,3]) = yPos; + set(tr.hdragbox,'XData',xdata,'YData',ydata) +% shortcut out when turning off 'indicative' mode +elseif tr.indicativeMode && isempty(hp) %&& isempty(tr.highligth) + set([tr.datatip tr.hpathline],'visible','off') + set(tr.hlines,'color','black') + set(tr.hldots(1),'Xdata',[],'Ydata',[]) + set(tr.hldots(2),'Xdata',[],'Ydata',[]) +% turn on or update 'indicative' mode +elseif numel(hp) % && isempty(tr.highligth) + % find leaves (children) below this branch + children = false(1,tr.numLabels); + children(hp(1)) = true; + for ind = hp(1)-tr.numLeaves:-1:1 + if children(ind+tr.numLeaves) + children(tr.tree(ind,:))=true; + end + end + + % find and draw path to selected + if strcmp(tr.editMode,'Inspect') + [pathA,pathB] = path2sel(tr,hp(1)); + dis2sel = tr.x(find(pathA,1))+tr.x(find(pathB,1))... + -2*tr.x(find(pathA,1,'last')); + if any(pathB) + xx = [tr.x(pathA);NaN;tr.x(pathB)]; + yy = [tr.y(pathA);NaN;tr.y(pathB)]; + hh=zeros(2*numel(xx),1); hh(1:2:end)=1; hh=cumsum(hh); + set(tr.hpathline,'XData',xx(hh(2:end)),... + 'YData',yy(hh(1:end-1)),'Visible','on'); + end + end + + % place text + name = [tr.names{hp(1)} ' ']; + name(name=='_')=' '; + children(hp(1)) = false; + numChil = sum(children(1:tr.numLeaves)); + childrenNames = char(tr.names(children(1:tr.numLeaves))); + childrenNames(childrenNames=='_')=' '; + childrenNames=[repmat(' ',size(childrenNames,1),1) childrenNames]; + switch tr.editMode + case 'Inspect' + if numChil + set(tr.datatip,'string',char( [ {name; ... + ['Dist to parent: ' num2str(tr.dist(hp(1)))];... + ['Dist to root: ' num2str(tr.x(hp(1))-tr.x(end))];... + ['Path length: ' num2str(dis2sel)];... + ['Samples: ' num2str(numChil)]};... + mat2cell(childrenNames,ones(size(childrenNames,1),1),... + size(childrenNames,2))])) + extraLines = 5; + else + set(tr.datatip,'string',char( {name; ... + ['Dist to parent: ' num2str(tr.dist(hp(1)))];... + ['Dist to root: ' num2str(tr.x(hp(1))-tr.x(end))];... + ['Path length: ' num2str(dis2sel)]})) + extraLines = 4; + end + case {'Collapse/Expand','Rotate Branch','Rename','Prune','Select'} + if numChil + set(tr.datatip,'string',char([ + {[name ' (' num2str(numChil) ' samples)']};... + mat2cell(childrenNames,ones(size(childrenNames,1),1),... + size(childrenNames,2))])) + else + set(tr.datatip,'string', name) + end + extraLines = 1; + otherwise % all other modes + end + + %compute some values before adjusting data tip + fp = get(gcbf,'Position'); % fig position in points + fh = fp(4);%fw = fp(3); % fig size (height & width) in points + ap = get(tr.ha,'Position'); % axis position normalized + yl = ylim(tr.ha); yl = yl - ... + [ap(2) ap(2)+ap(4)-1]*diff(yl)/ap(4); % fig height limits in axis units + xl = xlim(tr.ha); xl = xl - ... + [ap(1) ap(1)+ap(3)-1]*diff(xl)/ap(3); % fig width limits in axis units + yPosPt = (-yPos -4*yThres + yl(2))*fh/diff(yl); % datatip position in pts + reqPt = (numChil+extraLines)*14+2; % required datatip height in pts + % adjust if other fontsize is used + + %adjust string of datatip if it will not fit (i.e. remove names) + if reqPt > fh + str = get(tr.datatip,'String'); + set(tr.datatip,'String',str(1:extraLines,:)); + reqPt = extraLines*14+2; + end + + %adjust vertical position of datatip just below cp + topEdge = yl(2)-min(fh,max(yPosPt,reqPt))*diff(yl)/fh; + switch tr.editMode + case {'Collapse/Expand','Rotate Branch','Prune'} + % datatip usually to the left of cp to see shadowing of branches + datatipExtent = get(tr.datatip,'Extent'); + datatipWidth = datatipExtent(3); + rightEdge = max(xPos-3*xThres,xl(1)+datatipWidth); + % is the datatip over cp ? + if rightEdge>xPos && topEdgemin(xdata) & ... + tr.ymin(ydata) & tr.activeNodes) ; +if (strcmp(get(gcbf,'SelectionType'),'normal') && ... + strcmp(get(tr.hdragbox,'visible'),'on')) + tr.selected(:) = false; +end +tr.selected(hp) = true; +tr.sel2root = path2root(tr,tr.selected); +set(tr.hdragbox,'Visible','off') +set(gcbf,'userdata',tr) +updateTree(gcbf,[],[]) + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function selectNode(h,varargin) %#ok +% Callback function to select a Node. +% Entry points: from 1) the dots context menu or 2) toggle node +tr = get(gcbf,'userdata'); +% set a virtual grid to get the point +xThres=diff(get(tr.ha,'Xlim'))/100; +yThres=diff(get(tr.ha,'Ylim'))/100; +cp = get(tr.ha,'CurrentPoint'); +xPos = cp(1,1); yPos = cp(1,2); +hp = find(tr.x<(xPos+xThres) & tr.x>(xPos-xThres) & ... + tr.y<(yPos+yThres) & tr.y>(yPos-yThres)) ; + +if numel(hp) + set(tr.hdragbox,'visible','off') + temp = tr.selected(hp(1)); + switch get(gcbf,'SelectionType') + case 'normal'; tr.selected(:) = false; + case 'alt'; if ~temp + tr.selected(:) = false; + end + temp=false; + end + tr.selected(hp(1)) = ~temp; + tr.sel2root = path2root(tr,tr.selected); + set(gcbf,'userdata',tr) + updateTree(gcbf,[],[]) +end + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function inspectNode(h,varargin) %#ok +% Callback function to inspect the reference Node. +% Entry points: from 1) the dots context menu or 2) toggle node +tr = get(gcbf,'userdata'); +% set a virtual grid to get the point +xThres=diff(get(tr.ha,'Xlim'))/100; +yThres=diff(get(tr.ha,'Ylim'))/100; +cp = get(tr.ha,'CurrentPoint'); +xPos = cp(1,1); yPos = cp(1,2); +hp = find(tr.x<(xPos+xThres) & tr.x>(xPos-xThres) & ... + tr.y<(yPos+yThres) & tr.y>(yPos-yThres)) ; + +if numel(hp) + temp = tr.selected(hp(1)); + tr.selected(:) = false; + tr.selected(hp(1)) = ~temp; + tr.sel2root = path2root(tr,tr.selected); + set(gcbf,'userdata',tr) + updateTree(gcbf,[],[]) +end + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function collapseExpand(h,varargin) %#ok +% Callback function to Collapse/Expand a branch. +% Entry points: from 1) the dots context menu or 2) toggle node + +tr = get(gcbf,'userdata'); +if strcmp(get(h,'Type'),'uimenu') % come from a context menu + hp = find(tr.selected(tr.numLeaves+1:tr.numLabels)); +else + % set a virtual grid to get the point + xThres=diff(get(tr.ha,'Xlim'))/100; + yThres=diff(get(tr.ha,'Ylim'))/100; + cp = get(tr.ha,'CurrentPoint'); + xPos = cp(1,1); yPos = cp(1,2); + hp = find(tr.x<(xPos+xThres) & tr.x>(xPos-xThres) & ... + tr.y<(yPos+yThres) & tr.y>(yPos-yThres)) ; + hp=hp(hp>tr.numLeaves)-tr.numLeaves; + if numel(hp) + hp=hp(1); %just in case it picked two points + end +end + +if numel(hp) + for ind = 1:numel(hp) + tr.activeBranches(hp(ind))=~tr.activeBranches(hp(ind)); + activeBranches=find(tr.activeBranches)'; + % find active nodes by expanding active Branches + tr.activeNodes(:)=false; + tr.activeNodes(tr.numLabels,1)=true; + for k = activeBranches(end:-1:1) + tr.activeNodes(tr.tree(k,:))=tr.activeNodes(k+tr.numLeaves); + end + tr.selected(:) = false; + tr.selected(hp(ind)+tr.numLeaves) = true; + end + tr.sel2root = path2root(tr,tr.selected); + set(gcbf,'userdata',tr) + updateTree(gcbf,[],hp(end)+tr.numLeaves) +end + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function rotateBranch(h,varargin) %#ok +% Callback function to rotate a branch reordering the leaves. +% Entry points: from 1) the dots context menu or 2) toggle node +tr = get(gcbf,'userdata'); +if strcmp(get(h,'Type'),'uimenu') % come from a context menu + hp = find(tr.selected(tr.numLeaves+1:tr.numLabels)); +else + % set a virtual grid to get the point + xThres=diff(get(tr.ha,'Xlim'))/100; + yThres=diff(get(tr.ha,'Ylim'))/100; + cp = get(tr.ha,'CurrentPoint'); + xPos = cp(1,1); yPos = cp(1,2); + hp = find(tr.x<(xPos+xThres) & tr.x>(xPos-xThres) & ... + tr.y<(yPos+yThres) & tr.y>(yPos-yThres)) ; + hp=hp(hp>tr.numLeaves)-tr.numLeaves; + if numel(hp) + hp=hp(1); %just in case it picked two points + end +end +if numel(hp) + for ind = 1:numel(hp) + %find Leaves for every child + childrenA = false(1,tr.numLabels); + childrenA(tr.tree(hp(ind),1)) = true; + for k = tr.tree(hp(ind),1)-tr.numLeaves:-1:1 + if childrenA(k+tr.numLeaves) + childrenA(tr.tree(k,:))=true; + end + end + childrenB = false(1,tr.numLabels); + childrenB(tr.tree(hp(ind),2)) = true; + for k = tr.tree(hp(ind),2)-tr.numLeaves:-1:1 + if childrenB(k+tr.numLeaves) + childrenB(tr.tree(k,:))=true; + end + end + permuta = 1:tr.numLabels; + chA = find(childrenA(1:tr.numLeaves)); + chB = find(childrenB(1:tr.numLeaves)); + if chA(1)(xPos-xThres) & ... + tr.y<(yPos+yThres) & tr.y>(yPos-yThres)) ; +if numel(hp) + tr.previousMode = tr.editMode; tr.editMode = 'Renaming'; + xBoxPos = (.02+(xPos-Xlim(1))/diff(Xlim))*aPos(3)+aPos(1); + yBoxPos = (.02+(Ylim(2)-yPos)/diff(Ylim))*aPos(4)+aPos(2); + position=get(gcbf,'position'); + position=[position(3)*xBoxPos position(4)*yBoxPos 150 20]; + set(tr.editBox,'position',position); + set(tr.editBox,'Visible','on','string',tr.names{hp(1)},'Value',hp(1)) + disableAllUI(h) + disableMyContextMenus(h) + set(gcbf,'WindowButtonMotionFcn',[]); % disable windows mouse motion + set(gcbf,'userdata',tr) + end + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function doneRenaming(h,varargin) %#ok +% Output helper function to abandon the "Renaming" mode +tr = get(gcbf,'userdata'); +tr.editMode = tr.previousMode; +tr.names{get(tr.editBox,'Value')} = get(tr.editBox,'String'); +set(tr.editBox,'Visible','off') +set(gcbf,'userdata',tr) +updateTree(gcbf,[],[]); +enableAllUI(h) +enableMyContextMenus(h) +enableMyWindowButtonActions(h) + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function pruneTree(h,varargin) %#ok +% Callback function to prune the tree, this function is complex because not +% only the basic structure is updated but all the other handles are +% also updated to contain the new tree. + +% if changing to Prune, verify the warning has been accepted +propsForFigure = getappdata(gcbf,'propsForFigure'); +if isequal(propsForFigure.PruneWarning,'NotDone') + warndlg(['Pruning nodes cannot be undone. Before continuing,',... + ' you may want to export the current tree to a new tool.'],... + 'Warning','modal') + setacceptedwarningtoothertools + return % do not do this pruning +end + +tr = get(gcbf,'userdata'); +if strcmp(get(h,'Type'),'uimenu') % comes from a context menu + hp = find(tr.selected); +else + tr = get(gcbf,'userdata'); + Xlim=get(tr.ha,'Xlim');Ylim=get(tr.ha,'Ylim'); + % aPos=get(tr.ha,'Position'); + % set a virtual grid to get the point + xThres=diff(Xlim)/100; + yThres=diff(Ylim)/100; + cp = get(tr.ha,'CurrentPoint'); + xPos = cp(1,1); yPos = cp(1,2); + hp = find(tr.x<(xPos+xThres) & tr.x>(xPos-xThres) & ... + tr.y<(yPos+yThres) & tr.y>(yPos-yThres)); + hp=hp(1); %just in case it picked two points +end + +hp(hp==tr.numLabels)=[]; %cannot delete the root + +while numel(hp) + %find all nodes to purge (i.e. all descendants) + children = false(1,tr.numLabels); + children(hp(1)) = true; + for k = hp(1)-tr.numLeaves:-1:1 + if children(k+tr.numLeaves) + children(tr.tree(k,:))=true; + end + end + mypar = tr.par(hp(1)); % parent + if mypar < tr.numLabels % my parent is NOT the root + % connect brother to granparent + mygrpar = tr.par(mypar); % grandparent + myuncle = setxor(tr.tree(mygrpar-tr.numLeaves,:),mypar); % uncle + mybro = setxor(tr.tree(mypar-tr.numLeaves,:),hp(1)); % brother + tr.tree(mygrpar-tr.numLeaves,:) = [myuncle mybro]; + tr.dist(mybro) = tr.dist(mybro) + tr.dist(mypar); + temp = get(tr.hlines(mygrpar-tr.numLeaves),'Xdata'); + temp([1 4])=tr.x(tr.tree(mygrpar-tr.numLeaves,:)); + set(tr.hlines(mygrpar-tr.numLeaves),'Xdata',temp); + highlight = [mybro,mygrpar]; + else % if my parent is the root, now I am the new root + temp=cell2mat(get(tr.hlines,'Xdata'))-tr.dist(end); + for k = 1:tr.numBranches + set(tr.hlines(k),'Xdata',temp(k,:)); + end + highlight = setxor(tr.tree(mypar-tr.numLeaves,:),hp(1)); + end + children(mypar) = true; %also delete my par + % find indexes to change tree + permuta = 1:tr.numLabels; + permuta(children) = []; + ipermuta = zeros(1,tr.numLabels); + ipermuta(permuta) = 1:length(permuta); + permutaBranches = permuta(permuta>tr.numLeaves)-tr.numLeaves; + % update all tree structure fields + tr.names = tr.names(permuta); + tr.dist = tr.dist(permuta); + tr.tree = tr.tree(permutaBranches,:); + tr.tree = ipermuta(tr.tree); + if isempty(tr.tree) + return; + end % one leaf, no branches ! + tr = doBasicCalculations(tr); + hlines = tr.hlines; + tr.hlines = tr.hlines(permuta); + delete(setxor(hlines,tr.hlines)); + tr.activeNodes = tr.activeNodes(permuta); + tr.activeBranches = tr.activeBranches(permutaBranches); + tr.selected = tr.selected(permuta); + tr.selected(:) = false; + tr.selected(ipermuta(highlight)) = true; + tr.sel2root = path2root(tr,tr.selected); + % update the vector with nodes to prune (node index has changed) + hp=ipermuta(hp); + hp(1)=[]; + hp(hp==0)=[]; + set(gcbf,'userdata',tr) + updateTree(gcbf,[],[]) +end + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function colorDown(h,varargin) %#ok +% Color Down a branch. +% Entry points: from 1) the dots context menu or 2) toggle node +disp('Color Down a branch. Not implemented YET !!!') + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function findNode(h,varargin) %#ok +treefig = gcbf; +tr = get(treefig,'userdata'); +s = inputdlg('Regular Expression to match ?','Find Leaf/Branch',1); +if ~isempty(s) + hc=regexpi(regexprep(tr.names,'_',' '),s); + h = false(1,tr.numLabels); + for ind = 1:tr.numLabels + if ~isempty(hc{ind}) + h(ind)=true; + end + end + hf = find(h); + for ind = 1:length(hf) + while ~tr.activeNodes(hf(ind)) + hf(ind)=tr.par(hf(ind)); + end + end + tr.selected(:) = false; + tr.selected(hf) = true; + tr.sel2root = path2root(tr,tr.selected); % update path to root + set(treefig,'Userdata',tr); + updateTree(treefig,[],[]) + + % if selected are out of current view then fit the tree + if (any(min(ylim(tr.ha))>tr.y(tr.selected)) || ... + any(max(ylim(tr.ha))= Value; + set(tr.slidertx,'String',num2str(Value)) +end +set(gcbf,'Userdata',tr); +toshow = [tr.tocollapse(tr.par(1:tr.numLabels-1));0]&tr.tocollapse; +mask = (1:tr.numLabels)'>tr.numLeaves; +% update light lines +set(tr.hlines(~toshow),'color','k') +set(tr.hlines(toshow),'color',[.87 .87 .87]) +% update light dots +set(tr.hldots(1),'Ydata',tr.y(toshow&mask&tr.activeNodes),... + 'Xdata',tr.x(toshow&mask&tr.activeNodes)) +set(tr.hldots(2),'Ydata',tr.y(toshow&~mask&tr.activeNodes),... + 'Xdata',tr.x(toshow&~mask&tr.activeNodes)) + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function doThresholdCut(h,varargin) %#ok +% this helper function inactivates nodes based on the threshold cut selected +% with the slider. Entry point: the only way to get into this function is +% by the 'OK' uicontrol next to the slider. +tr = get(gcbf,'userdata'); +tr.activeBranches = tr.activeBranches & ~tr.tocollapse(tr.numLeaves+1:tr.numLabels); +% find active nodes by expanding active Branches +activeBranches=find(tr.activeBranches)'; +tr.activeNodes(:)=false; +tr.activeNodes(tr.numLabels,1)=true; +for ind = activeBranches(end:-1:1) + tr.activeNodes(tr.tree(ind,:))=tr.activeNodes(ind+tr.numLeaves); +end +tr.lastThresholdValue = get(tr.slider,'Value'); +set([tr.slider,tr.slidertx,tr.sliderok],'Visible','off') +tr.selected(:) = false; +tr.sel2root = path2root(tr,tr.selected); +set(gcbf,'userdata',tr) +changeEditMode(h); +updateTree(gcbf,[],[]); +autoFit(h) +enableAllUI(h) + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function cancelThresholdCut(h,varargin) %#ok +% this helper function cancels the threshold cut mode and returns to the +% 'select' mode. Entry point: the ways to get into this function is +% by the 'CANCEL' uicontrol next to the slider (does not exist yet) or by +% mouse click over the axes diring the slider mode. +tr = get(gcbf,'userdata'); +set([tr.slider,tr.slidertx,tr.sliderok],'Visible','off') +changeEditMode(h); +updateTree(gcbf,[],[]); +enableAllUI(h) + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function expandAll(h,varargin) %#ok +% Callback function to expand all hidden nodes +tr = get(gcbf,'userdata'); +x=[0 inf]; +[dump,anchor]=min(abs((mean(ylim)-tr.y))+x(1+tr.activeNodes)'); %#ok +tr.activeBranches(:) = true; +tr.activeNodes(:) = true; +set(gcbf,'Userdata',tr); +updateTree(gcbf,[],anchor) +autoFit(h) + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Callback function to save tree +function saveNewick(h,varargin) %#ok +tr = get(gcbf,'userdata'); +newtr.tree = tr.tree; +newtr.dist = tr.dist; +newtr.names = tr.names; +phytreewrite(phytree(newtr),'GUI',true); + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Callback function to restore the original tree +function restoreTree(h,varargin) %#ok +tr = getappdata(gcbf,'backupTree'); +view(phytree(tr)) + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Callback function to load tree +function loadNewick(h,varargin) %#ok +if strcmp(get(h,'Type'),'uimenu') % if caller is the uimenu then needs + figtoclose = gcbf; + tr = phytreeread; % to pick a file + +else % if not, caller is the callback from get workspace var + tr=[]; + pfig = get(h,'Parent'); + if strcmp(get(h,'string'),'Import') || ... + (strcmp(get(h,'style'),'listbox') && strcmp(get(gcbf,'SelectionType'),'open')) + hp = get(pfig,'Userdata'); + figtoclose = hp(4); + ops = get(hp(1),'string'); + if ~isempty(ops) + tr = evalin('base',ops(get(hp(1),'value'),:)); + end + close(pfig); + elseif strcmp(get(h,'string'),'Cancel') + close(pfig); + end +end +if ~isempty(tr) + propsForFigure = getappdata(figtoclose,'propsForFigure'); + view(tr,[],propsForFigure); + close(figtoclose) +end + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Callback function to close tree +function closeNewick(h,varargin) %#ok +close(gcbf) + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Callback function to copy the tree to a figure +function doPublishFigure(h,varargin,AllNodes,callerIsContextMenu) %#ok + +pfig = get(h,'Parent'); +if strcmp(get(h,'string'),'Cancel') + close(pfig); + return; +end + +% get options from the publishdlg window and close it +hp = get(pfig,'Userdata'); +vfig = hp(7); +va = get(hp(1:6),'Value'); +va = [va{:}]; +switch find(va(1:3)) + case 1; args = {'type','square'}; + case 2; args = {'type','angular'}; + case 3; args = {'type','radial'}; +end +args = {args{:},'bra',va(4),'lea',va(5),'ter',va(6)}; +close(pfig); + +% now select the branch to publish based on selected points +tr = get(vfig,'userdata'); +if ~callerIsContextMenu & ~tr.selected %#ok % if called from the uimenu and + tr.selected(end) = true; % nothing is selected pick the root +end +selected = find(tr.selected); +commonpath = true(tr.numLabels,1); +for ind = 1:numel(selected) + commonpath = commonpath & path2root(tr,selected(ind)); +end +branchtoexp = find(commonpath,1); +tr.selected(:) = false; +tr.selected(branchtoexp) = true; +set(vfig,'userdata',tr) +updateTree(vfig,[],[]); + +hp = branchtoexp; +%find all nodes to export (i.e. all descendants) +children = false(1,tr.numLabels); +children(hp) = true; +if AllNodes + for ind = hp-tr.numLeaves:-1:1 + if children(ind+tr.numLeaves) + children(tr.tree(ind,:))=true; + end + end + permuta = find(children); +else + for ind = hp-tr.numLeaves:-1:1 + if children(ind+tr.numLeaves) + children(tr.tree(ind,:))=tr.activeNodes(tr.tree(ind,:)); + end + end + braToLea = find(~tr.activeNodes(tr.tree(:,1)))+tr.numLeaves; + expBran = find(children(tr.numLeaves+1:end)) + tr.numLeaves; + permuta = [find(children(1:tr.numLeaves)) ... + intersect(expBran,braToLea)]; + [dump,hs] = sort(tr.y(permuta)); %#ok + permuta = [permuta(hs) setdiff(expBran,braToLea)]; +end + +if sum(children)>1 % enough leaves to export ? + newtr = phytree; + ipermuta(permuta) = 1:length(permuta); + numLeaves = (ipermuta(end) + 1)/2; + newtr.tree = ipermuta(tr.tree(permuta(numLeaves+1:end)-tr.numLeaves,:)); + newtr.dist = tr.dist(permuta); + newtr.names = tr.names(permuta); + plot(newtr,true(length(newtr.tree),1),args{:}) +end + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Callback function for autofit +function autoFit(h,varargin) %#ok +tr = get(gcbf,'userdata'); +set(tr.ha,'Ylim',[min(tr.y(tr.activeNodes))-1,max(tr.y(tr.activeNodes))+1]); + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Callback function to Reset view +function myResetView(h,varargin) %#ok +tr = get(gcbf,'userdata'); +set(tr.ha,'Ylim',tr.yLim); +set(tr.ha,'Xlim',tr.xLim); + + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Common export function +function exportSubtree(h,varargin,AllNodes,ToWS,callerIsContextMenu) %#ok + +tr = get(gcbf,'userdata'); +% if called from the uimenu and nothing is selected pick the root +if ~callerIsContextMenu & ~tr.selected %#ok + tr.selected(end) = true; +end + +selected = find(tr.selected); +commonpath = true(tr.numLabels,1); +for ind = 1:numel(selected) + commonpath = commonpath & path2root(tr,selected(ind)); +end +branchtoexp = find(commonpath,1); +tr.selected(:) = false; +tr.selected(branchtoexp) = true; + +set(gcbf,'userdata',tr) +updateTree(gcbf,[],[]); +doExport(branchtoexp,AllNodes,ToWS) + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Common export function after having selected a point +function doExport(hp,AllNodes,ToWS) + +tr = get(gcbf,'userdata'); + +%find all nodes to export (i.e. all descendants) +children = false(1,tr.numLabels); +children(hp) = true; +if AllNodes + for ind = hp-tr.numLeaves:-1:1 + if children(ind+tr.numLeaves) + children(tr.tree(ind,:))=true; + end + end + permuta = find(children); +else + for ind = hp-tr.numLeaves:-1:1 + if children(ind+tr.numLeaves) + children(tr.tree(ind,:))=tr.activeNodes(tr.tree(ind,:)); + end + end + braToLea = find(~tr.activeNodes(tr.tree(:,1)))+tr.numLeaves; + expBran = find(children(tr.numLeaves+1:end)) + tr.numLeaves; + permuta = [find(children(1:tr.numLeaves)) ... + intersect(expBran,braToLea)]; + [dump,hs] = sort(tr.y(permuta)); %#ok + permuta = [permuta(hs) setdiff(expBran,braToLea)]; +end +if sum(children)>1 % enough leaves to export ? + newtr=phytree; + ipermuta(permuta) = 1:length(permuta); + numLeaves = (ipermuta(end) + 1)/2; + newtr.tree = ipermuta(tr.tree(permuta(numLeaves+1:end)-tr.numLeaves,:)); + newtr.dist = tr.dist(permuta); + newtr.names = tr.names(permuta); + if ToWS % export to workspace ? + s = inputdlg('Workspace variable name ?','Export to Workspace',1); + while ~(isempty(s) || isvarname(s{1}) || isempty(s{1})) + s = inputdlg('Not a valid variable name, type a MATLAB variable name ?','Export to Workspace',1); + end + if ~(isempty(s) || isempty(s{1})) + assignin('base',s{1},newtr) + end + else % no, then export to other viewer + view(newtr); + end +end + + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function updateTree(h,highLight,anchor) %#ok +% Redraws the tree depending on the active Branches +% rather than erase and redraw, we only change specific fields in hlines +% and hdots. + +tr = get(h,'userdata'); +activeBranches=find(tr.activeBranches)'; +oldPos = tr.y(anchor); + +% propagate last leaf +lastleaf = 1:tr.numLabels; +for ind = tr.numBranches:-1:1 + if ~tr.activeNodes(tr.tree(ind,1)) + lastleaf(tr.tree(ind,:))=lastleaf(ind+tr.numLeaves); + end +end + +% find x coordinates of branches +tr.x = tr.dist; +for ind = tr.numBranches:-1:1 + tr.x(tr.tree(ind,:)) = tr.x(tr.tree(ind,:)) + tr.x(ind+tr.numLeaves); +end + +% find y coordinates of branches +dummy = lastleaf([true,diff(lastleaf(1:tr.numLeaves))~=0]); +tr.y=zeros(tr.numLabels,1); +tr.y(dummy)=1:length(dummy); +for ind = activeBranches + tr.y(ind+tr.numLeaves) = mean(tr.y(tr.tree(ind,:))); +end + +% update right labels +todis = tr.names(dummy); +set(tr.ha,'ytick',1:length(dummy),'yticklabel',todis) + +% show only active branches +set(tr.hlines,'Visible','off') +set(tr.hlines(tr.activeNodes),'Visible','on') + +% update coordinates in lines +for ind = 1:tr.numLabels-1 + set(tr.hlines(ind),'Ydata',tr.y([ind,ind,tr.par(ind)])) + set(tr.hlines(ind),'Xdata',tr.x([ind,tr.par([ind ind])])) +end +set(tr.hlines(tr.numLabels),'Ydata',tr.y(tr.numLabels)*[1 1 1]) +set(tr.hlines(tr.numLabels),'Xdata',tr.x(tr.numLabels)*[1 1 1]) + +% update dots +mask = false(tr.numLabels,1); mask(1:tr.numLeaves) = true; +set(tr.hdots(1),'Ydata',tr.y(tr.activeNodes&~mask),'Xdata',tr.x(tr.activeNodes&~mask)) +set(tr.hdots(2),'Ydata',tr.y(tr.activeNodes&mask),'Xdata',tr.x(tr.activeNodes&mask)) + +% update red dots +set(tr.hseldots(1),'Ydata',tr.y(tr.activeNodes&~mask&tr.selected),'Xdata',tr.x(tr.activeNodes&~mask&tr.selected)) +set(tr.hseldots(2),'Ydata',tr.y(tr.activeNodes&mask&tr.selected),'Xdata',tr.x(tr.activeNodes&mask&tr.selected)) + +% set the axis holders +set(tr.axhold,'Ydata',[0.5,max(tr.y(tr.activeNodes))+0.5]) +if numel(oldPos) + set(tr.ha,'ylim',get(tr.ha,'ylim')+tr.y(anchor)-oldPos); +else + set(tr.ha,'ylim',get(tr.ha,'ylim')) % just touch 'YLim' such that the listener is triggered +end + +% turn on indicative modes +tr.indicativeMode = false; +set([tr.datatip tr.hpathline],'visible','off') +set(tr.hlines,'color','black') +set(tr.hldots(1),'Xdata',[],'Ydata',[]) +set(tr.hldots(2),'Xdata',[],'Ydata',[]) + +% save figure data +set(h,'Userdata',tr) + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function path2r = path2root(tr,from) +% helper function, finds path to root +path2r = false(tr.numLabels,1); +if (numel(from)~=1 && sum(from)~=1) + return; +end +path2r(from) = true; +temp = find(path2r); +if numel(temp) + while temp~=tr.numLabels; + temp = tr.par(temp); + path2r(temp) = true; + end +end + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function [pathA,pathB] = path2sel(tr,from) +% helper function, finds path to selected node +path2rt = path2root(tr,from); +commonPath = tr.sel2root & path2rt; +commonPath(find(commonPath,1)) = false; +pathB = tr.sel2root & ~commonPath; +pathA = path2rt & ~commonPath; + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function tr = doBasicCalculations(tr) +% helper function to compute and find some features of the tree +tr = struct(tr); +tr.numBranches = size(tr.tree,1); +tr.numLeaves = tr.numBranches + 1; +tr.numLabels = tr.numBranches + tr.numLeaves; + +% obtain parents for every node +tr.par(tr.tree(:)) = tr.numLeaves + [1:tr.numBranches 1:tr.numBranches]; + +% calculate the distance to the closest leaf for every node +% needed for fast threshold cut +tr.dist2Leaf = zeros(tr.numLabels,1); +for ind = 1:tr.numBranches + tr.dist2Leaf(ind+tr.numLeaves) = ... + min(tr.dist2Leaf(tr.tree(ind,:))+tr.dist(tr.tree(ind,:))); +end + +% calculate drawing coordinates for the tree: x coordinated will never +% change, but y coordinates may change depending on the active branches and +% nodes. +tr.x = tr.dist; tr.y=[1:tr.numLeaves zeros(1,tr.numBranches)]'; +for ind = tr.numBranches:-1:1 + tr.x(tr.tree(ind,:)) = tr.x(tr.tree(ind,:)) + tr.x(ind+tr.numLeaves); +end +for ind =1:tr.numBranches + tr.y(ind+tr.numLeaves) = mean(tr.y(tr.tree(ind,:))); +end + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function correctFigureSize(fig,recommendedHeight) +% helper function to increase initial figure size depending on the screen & +% tree sizes +screenSize = diff(reshape(get(0,'ScreenSize'),2,2),[],2)-100; + % 100 gives extra space for the figure header and win toolbar +position = get(fig,'Position'); +if recommendedHeight > position(4) + if recommendedHeight < sum(position([2 4])) + position(2) = sum(position([2 4])) - recommendedHeight; + position(4) = recommendedHeight; + elseif recommendedHeight < screenSize(2) + position(2) = 30; + position(4) = recommendedHeight; + else + position(2) = 30; + position(4) = screenSize(2); + end + set(fig,'Position',position) +end + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function hToggleToolbar = makePhyTreeViewerToolbar(fig) +% helper function to set the toolbar +% +% hToggleToolbar contains handles to easy change the state on/off when +% changing modes + +oldSH = get(0,'ShowHiddenHandles'); +set(0,'ShowHiddenHandles','on') + +set(fig,'toolbar','figure') % needs to update because uicontrols turn it off + +% Fix toolbar options, we keep: ZoomIn,ZoomOut,Pan +hw = findall(fig,'type','uitoolbar'); +hf = get(hw,'Children'); +h1 = findall(hf,'Tag','Exploration.Pan'); +h2 = findall(hf,'Tag','Exploration.ZoomOut'); +h3 = findall(hf,'Tag','Exploration.ZoomIn'); +delete(setxor(hf,[h1,h2,h3])) +set([h1 h2 h3],'Separator','off','clickedCallback',@changeEditMode); + +% load icons +load(fullfile(matlabroot,'toolbox','bioinfo','bioinfo','@phytree','phytreeicons')) + +h4 = uitoggletool('ToolTip','Inspect Tool Mode','separator','on',... + 'Tag','Inspect', 'CData',icons(:,:,1:3)); %#ok +h5 = uitoggletool('ToolTip','Collapse/Expand Branch Mode',... + 'Tag','Collapse/Expand','CData',icons(:,:,4:6)); %#ok +h6 = uitoggletool('ToolTip','Rotate Branch Mode',... + 'Tag','Rotate Branch', 'CData',icons(:,:,7:9)); %#ok +h7 = uitoggletool('ToolTip','Rename Leaf/Branch Mode',... + 'Tag','Rename', 'CData',icons(:,:,10:12)); %#ok +h8 = uitoggletool('ToolTip','Prune (delete) Leaf/Branch Mode',... + 'Tag','Prune', 'CData',icons(:,:,13:15)); %#ok +set([h4 h5 h6 h7 h8],'clickedCallback',@changeEditMode,'state','off',... + 'Serializable','off','HandleVisibility','off'); +hToggleToolbar = [h4 h5 h6 h7 h8 h1 h2 h3]; +set(0,'ShowHiddenHandles',oldSH) + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function hToggleUIMenu = makePhyTreeViewerUIMenus(fig) +% helper function to set UI menus +% +% hToggleUIMenu contains handles to easy check on/off modes in the UI menu + +oldSH = get(0,'ShowHiddenHandles'); +set(0,'ShowHiddenHandles','on') + +% delete figure menus not used +h1 = findall(fig,'Type','uimenu', 'Label','&Edit'); +h2 = findall(fig,'Type','uimenu', 'Label','&View'); +h3 = findall(fig,'Type','uimenu', 'Label','&Insert'); +h4 = findall(fig,'Type','uimenu', 'Label','&Desktop'); +delete([h1,h2,h3,h4]) + +% Repair "File" menu +hw = findall(fig,'Type','uimenu', 'Label','&File'); +hf = get(hw,'children'); +h1 = findall(hw,'Label','Expo&rt Setup...'); +h3 = findall(hw,'Label','Print Pre&view...'); +h4 = findall(hw,'Label','&Print...'); +delete(setxor(hf,[h1,h3,h4])) +uimenu(hw,'Label','New Tool...', 'Position',1,'Callback','phytreetool') +uimenu(hw,'Label','Open...', 'Position',2,'Callback',@loadNewick) +uimenu(hw,'Label','Import from Workspace...','Position',3,'Callback',@importfromwsdlg) +uimenu(hw,'Label','Open Original in New Tool','Position',4,'Callback',@restoreTree) +uimenu(hw,'Label','Save As...', 'Position',5,'Callback',@saveNewick,'Separator','on') +item0 = uimenu(hw ,'Label','Print to Figure','Position',6); + uimenu(item0,'Label','With Hidden Nodes...','Callback',{@publishdlg,1,0}); + uimenu(item0,'Label','Only Displayed...', 'Callback',{@publishdlg,0,0}); +item1 = uimenu(hw,'Label','Export to New Tool','Position',7); + uimenu(item1,'Label','With Hidden Nodes...','Callback',{@exportSubtree,1,0,0}); + uimenu(item1,'Label','Only Displayed...', 'Callback',{@exportSubtree,0,0,0}); +item2 = uimenu(hw,'Label','Export to Workspace','Position',8); + uimenu(item2,'Label','With Hidden Nodes...','Callback',{@exportSubtree,1,1,0}); + uimenu(item2,'Label','Only Displayed...', 'Callback',{@exportSubtree,0,1,0}); +uimenu(hw,'Label','Exit','Separator','on','Position',12,'Callback',@closeNewick) +set(h1,'Separator','on') + +% Repair "Tools" menu +hw = findall(fig,'Type','uimenu','Label','&Tools'); +hf = get(hw,'children'); +h1 = findall(hw,'Tag','figMenuZoomIn'); set(h1,'Callback',{@changeEditMode,gcbo}); +h2 = findall(hw,'Tag','figMenuZoomOut'); set(h2,'Callback',{@changeEditMode,gcbo}); +h3 = findall(hw,'Tag','figMenuPan'); set(h3,'Callback',{@changeEditMode,gcbo}); +h4 = findall(hw,'Tag','figMenuResetView'); set(h4,'Callback',{@myResetView,gcbo}); +h5 = findall(hw,'Tag','figMenuOptions'); +set([h1,h4],'separator','off') +delete(setxor(hf,[h1,h2,h3,h4,h5])) +delete(findall(h5,'Tag','figMenuOptionsDatatip')) +delete(findall(h5,'Tag','figMenuOptionsDataBar')) +h6 = uimenu(hw,'Label','Inspect','Position',1,'Callback',@changeEditMode); +h7 = uimenu(hw,'Label','Collapse/Expand','Position',2,... + 'Callback',@changeEditMode); +h8 = uimenu(hw,'Label','Rotate Branch','Position',3,... + 'Callback',@changeEditMode); +h9 = uimenu(hw,'Label','Rename','Position',4, 'Callback',@changeEditMode); +h10 = uimenu(hw,'Label','Prune','Position',5, 'Callback',@changeEditMode); +item3 = uimenu(hw,'Label','Threshold Collapse','Position',9,'Separator','on'); + uimenu(item3,'Label','Distance to Leaves', 'Callback',@thresholdCut); + uimenu(item3,'Label','Distance to Root', 'Callback',@thresholdCut); + uimenu(hw,'Label','Expand All','Position',10,'Callback',@expandAll); +uimenu(hw,'Label','Find Leaf/Branch...','Position',11,'Callback',@findNode); +uimenu(hw,'Label','Fit to Window','Position',12,'Separator','on',... + 'Callback',@autoFit); +set(h1,'Separator','on') + +% Repair "Help" menu +hw = findall(fig,'Type','uimenu','Label','&Help'); +delete(get(hw,'children')); +uimenu(hw,'Label','Bioinformatics Toolbox Help','Position',1,'Callback',... + 'web([docroot ''/toolbox/bioinfo/bioinfo_product_page.html''])') +uimenu(hw,'Label','Phylogenetic Tree Tool Help','Position',2,'Callback',... + ['helpview(fullfile(docroot,''toolbox'',''bioinfo'', ''bioinfo.map'')' ... + ',''phytreetool_reference'')' ] ) +uimenu(hw,'Label','Demos','Position',3,'Separator','on',... + 'Callback','demo(''toolbox'',''bioinfo'')') +tlbx = ver('bioinfo'); +mailstr = ['web(''mailto:bioinfofeedback@mathworks.com?subject=',... + 'Feedback%20for%20Phytreetool%20in%20Bioinformatics',... + '%20Toolbox%20',tlbx(1).Version,''')']; +uimenu(hw,'Label','Send Feedback','Position',4,'Separator','on',... + 'Callback',mailstr); +set(0,'ShowHiddenHandles',oldSH) +hToggleUIMenu = [h6 h7 h8 h9 h10 h1 h2 h3]; + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function [hToggleContextMenu,hAxisContextMenu,hDotsContextMenu] = ... + makePhyTreeViewerContextMenus(fig) %#ok +% helper function to set context menus +% +% hToggleContextMenu contains handles to easy check on/off modes in the +% Context Menu +% hAxisContextMenu contains handle to the Axis Context Menu to used to +% reactivate it +% hDotsContextMenu contains handle to the Dots Context Menu to used to +% reactivate it + +% set up context menu for the axes (when mouse is not over a node) +hcm1 = uicontextmenu('Callback',@hideActiveIndicators); + +h1 = uimenu(hcm1,'Label','Inspect', 'Callback',@changeEditMode); +h2 = uimenu(hcm1,'Label','Collapse/Expand','Callback',@changeEditMode); +h3 = uimenu(hcm1,'Label','Rotate Branch', 'Callback',@changeEditMode); +h4 = uimenu(hcm1,'Label','Rename', 'Callback',@changeEditMode); +h5 = uimenu(hcm1,'Label','Prune', 'Callback',@changeEditMode); +item4 = uimenu(hcm1 , 'Label', 'Threshold Collapse','Separator','on'); + uimenu(item4,'Label','Distance to Leaves', 'Callback',@thresholdCut); + uimenu(item4,'Label','Distance to Root', 'Callback',@thresholdCut); + uimenu(hcm1,'Label','Expand All','Callback',@expandAll); +uimenu(hcm1 ,'Label','Find Leaf/Branch...', 'Callback',@findNode); +uimenu(hcm1 ,'Label','Fit to Window','Separator','on','Callback',@autoFit); +uimenu(hcm1 ,'Label','Reset to Original View', 'Callback',@myResetView); + +% context menu for dots (when mouse over a dot and right mouse button pressed) +hcm2 = uicontextmenu('Callback',@enableOptionsContextMenu); + uimenu(hcm2,'Label','Collapse/Expand', 'Callback',@collapseExpand); + uimenu(hcm2,'Label','Rotate Branch', 'Callback',@rotateBranch); + uimenu(hcm2,'Label','Rename', 'Callback',@renameNode); + uimenu(hcm2,'Label','Prune', 'Callback',@pruneTree); +item0 = uimenu(hcm2,'Label','Print to Figure','Separator','on'); + uimenu(item0,'Label','With Hidden Nodes...','Callback',{@publishdlg,1,1}); + uimenu(item0,'Label','Only Displayed...', 'Callback',{@publishdlg,0,1}); +item1 = uimenu(hcm2,'Label','Export to New Tool'); + uimenu(item1,'Label','With Hidden Nodes...','Callback',{@exportSubtree,1,0,1}); + uimenu(item1,'Label','Only Displayed...', 'Callback',{@exportSubtree,0,0,1}); +item2 = uimenu(hcm2,'Label','Export to Workspace'); + uimenu(item2,'Label','With Hidden Nodes...','Callback',{@exportSubtree,1,1,1}); + uimenu(item2,'Label','Only Displayed...', 'Callback',{@exportSubtree,0,1,1}); + %uimenu(hcm2,'Label','Color Down', 'Callback',@colorDown); + +% save context menus in my data structure to later restore them if desired +hToggleContextMenu = [h1 h2 h3 h4 h5]; +hAxisContextMenu = hcm1; +hDotsContextMenu = hcm2; + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function enableOptionsContextMenu(h,varargin) +hideActiveIndicators +selectNode(h,varargin) +tr = get(gcbf,'Userdata'); +hc = get(tr.hDotsContextMenu,'Children'); +set(hc,'Enable','on') +if sum(tr.selected)~=1 + set(hc(5),'Enable','off'); +end + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function setupYLabelsListeners +% helper function to setsup listeners for the ylables, so we can detect if +% we would need to change the fontsize +hgp = findpackage('hg'); +axesC = findclass(hgp,'axes'); +figureC = findclass(hgp,'figure'); +% listens when the Ylim of axes has changed +YLimListener = handle.listener(gca,axesC.findprop('YLim'),... + 'PropertyPostSet',{@ylabelsListener,gcf,gca}); +% listens when Position of Figure has changed +PositionListener = handle.listener(gcf,figureC.findprop('Position'),... + 'PropertyPostSet',{@ylabelsListener,gcf,gca}); +% store the listeners +setappdata(gcf,'PhyTreeListeners',[YLimListener, PositionListener]); + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function publishdlg(h,varargin,AllNodes,callerIsContextMenu) %#ok +% dialog window to select options for publishing +vfig = gcbf; +c = get(0,'ScreenSize')*[1 0;0 1;.5 0;0 .5]; +fig = figure('WindowStyle','modal','Color',[0.831373 0.815686 0.784314],... + 'Position',[c-[150 100] 300 200],'Resize','off','NumberTitle','off',... + 'Name','Print Phylogenetic Tree to Figure','IntegerHandle','off' ); +h1=uibuttongroup;h2=uibuttongroup; +set(h1,'Position',[.08 .35 .35 .55],'Title','Rendering Type','backgroundcolor',[0.831373 0.815686 0.784314]) +set(h2,'Position',[.52 .35 .39 .55],'Title','Display Labels','backgroundcolor',[0.831373 0.815686 0.784314]) +ui1=uicontrol(h1,'style','radiobutton','Position',[5 70 90 20],'string','Square','value',1,'backgroundcolor',[0.831373 0.815686 0.784314]); +ui2=uicontrol(h1,'style','radiobutton','Position',[5 40 90 20],'string','Angular','backgroundcolor',[0.831373 0.815686 0.784314]); +ui3=uicontrol(h1,'style','radiobutton','Position',[5 10 90 20],'string','Radial','backgroundcolor',[0.831373 0.815686 0.784314]); +ui4=uicontrol(h2,'style','checkbox','Position',[5 70 109 20],'string','Branch Nodes','backgroundcolor',[0.831373 0.815686 0.784314]); +ui5=uicontrol(h2,'style','checkbox','Position',[5 40 109 20],'string','Leaf Nodes','backgroundcolor',[0.831373 0.815686 0.784314]); +ui6=uicontrol(h2,'style','checkbox','Position',[5 10 109 20],'string','Terminal Nodes','value',1,'backgroundcolor',[0.831373 0.815686 0.784314]); +uicontrol(fig,'style','pushbutton','Position',[70 20 60 30],'string','Print','Callback',{@doPublishFigure,AllNodes,callerIsContextMenu}); +uicontrol(fig,'style','pushbutton','Position',[155 20 60 30],'string','Cancel','Callback',{@doPublishFigure,AllNodes,callerIsContextMenu}); +set(fig,'Userdata',[ui1 ui2 ui3 ui4 ui5 ui6 vfig]); +set(h1,'SelectionChangeFcn',{@toggleCheckBoxs,ui3,ui6}) + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function toggleCheckBoxs(h,event,h3,h6) %#ok +if get(h3,'value') + set(h6,'enable','off') +else + set(h6,'enable','on') +end + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function importfromwsdlg(h,varargin) %#ok +% dialog window to select variable from workspace +mvars = evalin('base','whos'); +mvars = mvars(strmatch('phytree',{mvars(:).class})); %#ok +vfig = gcbf; +c = get(0,'ScreenSize')*[1 0;0 1;.5 0;0 .5]; +fig = figure('WindowStyle','modal','Color',[0.831373 0.815686 0.784314],... + 'Position',[c-[80 100] 160 220],'Resize','off','NumberTitle','off',... + 'Name','Get Phytree Object','IntegerHandle','off' ); +ui1=uicontrol(fig,'style','list','Position',[22 70 120 120],'string',char(mvars(:).name),'backgroundcolor','w','Callback',@loadNewick); +ui2=uicontrol(fig,'style','pushbutton','Position',[15 20 60 30],'string','Import','Callback',@loadNewick); +ui3=uicontrol(fig,'style','pushbutton','Position',[90 20 60 30],'string','Cancel','Callback',@loadNewick); +uicontrol(fig,'style','text','Position',[20 190 140 20],'string','Select phytree object:','Horizontal','left','backgroundcolor',[0.831373 0.815686 0.784314]) +set(fig,'Userdata',[ui1 ui2 ui3 vfig]); + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function n = getphytreetoolnumber() +% Computes the index number for this particular tool + +% first, finds the used numbers so far +allFigs = findall(0,'tag','PhyTreeTool'); +usedNumbers = zeros(1,numel(allFigs)+1); +baseName = 'Phylogenetic Tree Tool '; +baseLen = length(baseName); +for i = 1:numel(allFigs) + str = get(allFigs(i),'Name'); + usedNumbers(i) = str2double(str(baseLen:end)); +end + +% This is how we find the next index. The rule is that we find the lowest +% integer value (non-zero and positive) not yet prescribed to a phytree +% tool, This is the same way MATLAB figures behave. +n = num2str(min(setdiff(1:(max(usedNumbers)+1),usedNumbers))); + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function n = getacceptedwarningfromothertools(cfig) %#ok +% Finds out if the pruning warning has been accepted +% first, finds the used numbers so far +allFigs = findall(0,'tag','PhyTreeTool'); +n = 'NotDone'; +for i = 1:numel(allFigs) + propsForFigure = getappdata(allFigs(i),'propsForFigure'); + if isequal(propsForFigure.PruneWarning,'Done') + n = 'Done'; + end +end + +%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function setacceptedwarningtoothertools() +% Set the pruning warning to all the other open tools as 'Done' +allFigs = findall(0,'tag','PhyTreeTool'); +for i = 1:numel(allFigs) + propsForFigure = getappdata(allFigs(i),'propsForFigure'); + propsForFigure.PruneWarning = 'Done'; + setappdata(allFigs(i),'propsForFigure',propsForFigure); +end + diff --git a/matlab/graph/@phyTree/Weights.m b/matlab/graph/@phyTree/Weights.m new file mode 100644 index 0000000..70c84b6 --- /dev/null +++ b/matlab/graph/@phyTree/Weights.m @@ -0,0 +1,54 @@ +function W = weights(tr) +%WEIGHTS Tree based sequence weights. +% +% W = WEIGHTS(T) Calculates branch proportional weights for every leaf in +% the tree using the Thompson-Higgins-Gibson method. The distance of every +% segment of the tree is adjusted by dividing it by the number of leaves +% it contains. The sequence weights are the result of normalizing to the +% unity the new patristic distances between every leaf and the root. +% +% Example: +% +% % Create an ultrametric tree with specified branch distances +% bd = [1 2 3]'; +% tr_1 = phytree([1 2;3 4;5 6],bd) +% view(tr_1) +% weights(tr_1) +% +% See also MULTIALIGN, PHYTREE, PROFALIGN, SEQLINKAGE. + +% References: +% J.D. Thompson, D.G. Higgins, and T.J. Gibson. Nucleic Acids Res. (1994) +% 22(22):4673-4680. +% S.Henikoff and J. G. Henikoff. JMB. (1994) 243(4):574--578. +% +% Copyright 2003-2005 The MathWorks, Inc. +% $Revision: 1.1.8.2 $Author: batserve $ $Date: 2005/06/17 20:19:24 $ + +if numel(tr)~=1 + error('Bioinfo:phytree:weights:NoMultielementArrays',... + 'Phylogenetic tree must be an 1-by-1 object.'); +end + +numBranches = size(tr.tree,1); +numLeaves = numBranches + 1; +numLabels = numBranches + numLeaves; + +% calculate the branch width +branchWidth = ones(numLabels,1); +for ind = 1:numBranches + branchWidth(numLeaves+ind) = sum(branchWidth(tr.tree(ind,:))); +end + +% adjust the distances +tr.dist = tr.dist ./ branchWidth; + +% calculate distance of every leave to root +cdist = tr.dist; +for ind = numBranches:-1:1 + cdist(tr.tree(ind,:)) = cdist(tr.tree(ind,:)) + cdist(ind+numLeaves); +end + +W = cdist(1:numLeaves); +W = W./max(W); + diff --git a/matlab/graph/@phyTree/getByName.m b/matlab/graph/@phyTree/getByName.m new file mode 100644 index 0000000..4b74557 --- /dev/null +++ b/matlab/graph/@phyTree/getByName.m @@ -0,0 +1,103 @@ +function sel = getbyname(tr,query,varargin) +%GETBYNAME Selects branches and leaves by name. +% +% S = GETBYNAME(T,EXPRESSION) returns a logical vector S of size +% [NUMNODES x 1] indicating the node names of the phylogenetic tree T +% that match the regular expression EXPRESSION regardless of case. +% +% Symbols than can be used in a matching regular expression are explained +% in help REGEXP. +% +% When EXPRESSION is a cell array of strings, GETBYNAME returns a matrix +% where every column corresponds to every query in EXPRESSION. +% +% S = GETBYNAME(T,STRING,'EXACT',true) looks for exact matches only +% (ignoring case). When STRING is a cell array of strings, GETBYNAME +% returns a vector with indices. +% +% Example: +% +% % Load a phylogenetic tree created from a protein family: +% tr = phytreeread('pf00002.tree'); +% +% % Select all the 'mouse' and 'human' proteins: +% sel = getbyname(tr,{'mouse','human'}); +% view(tr,any(sel,2)); +% +% See also PHYTREE, PHYTREE/PRUNE, PHYTREE/SELECT, PHYTREE/GET. + +% Copyright 2003-2005 The MathWorks, Inc. +% $Revision: 1.1.6.5 $ $Author: batserve $ $Date: 2005/06/09 21:55:55 $ + +if numel(tr)~=1 + error('Bioinfo:phytree:getbyname:NoMultielementArrays',... + 'Phylogenetic tree must be an 1-by-1 object.'); +end + +doExactMatch = false; + +if nargin > 2 + okargs = {'exact',''}; + for j=1:2:nargin-2 + pname = varargin{j}; + k = strmatch(lower(pname), okargs); %#ok + if isempty(k) + error('Bioinfo:phytree:getbyname:UnknownParameterName',... + 'Unknown parameter name: %s.',pname); + elseif length(k)>1 + error('Bioinfo:phytree:getbyname:AmbiguousParameterName',... + 'Ambiguous parameter name: %s.',pname); + else + switch(k) + case 1 + if nargin == 3 + doExactMatch = true; + else + doExactMatch = opttf(varargin{j+1}); + if isempty(doExactMatch) + error('Bioinfo:phytree:getbyname:InputOptionNotLogical',... + '%s must be a logical value, true or false.',... + upper(char(okargs(k)))); + end + end + end %switch + end %if + end %for +end %if + + +numLabels = numel(tr.names); +if iscell(query) + if doExactMatch + sel = zeros(numLabels,1); + else + sel = false(numLabels,numel(query)); + end + for ind = 1:numel(query) + if doExactMatch + sel(strcmpi(query{ind},tr.names)) = ind; + else + try + regexpiOutput = regexpi(tr(:).names,query{ind}); + catch + error('Bioinfo:phytree:getbyname:IncorrectRegularExpression',... + ['The query expression produced the following error in ' ... + 'REGEXPI: \n%s'],lasterr); + end + sel(:,ind) = ~cellfun('isempty',regexpiOutput); + end + end +else % must be a single string of chars + if doExactMatch + sel = strcmpi(query,tr.names); + else + try + regexpiOutput = regexpi(tr(:).names,query); + catch + error('Bioinfo:phytree:getbyname:IncorrectRegularExpression',... + ['The query expression produced the following error in ' ... + 'REGEXPI: \n%s'],lasterr); + end + sel = ~cellfun('isempty',regexpiOutput); + end +end diff --git a/matlab/graph/@phyTree/getCanonical.m b/matlab/graph/@phyTree/getCanonical.m new file mode 100644 index 0000000..6b0db81 --- /dev/null +++ b/matlab/graph/@phyTree/getCanonical.m @@ -0,0 +1,65 @@ +function [ptrs,dist,names] = getcanonical(tr) +%GETCANONICAL Calculates the canonical form of a phylogenetic tree. +% +% PTRS = GETCANONICAL(TREE) Returns the pointers of the canonical form of +% a phylogenetic tree. In a canonical tree the leaves are ordered +% alphabetically and the branches are ordered first by their width and +% then alphabetically by their first element. A canonical tree is +% isomorphic to all the trees with the same skeleton independently of the +% order of their leaves and branches. +% +% [PTRS,DIST,NAMES] = GETCANONICAL(TREE) Returns also the re-ordered +% distances and node names. +% +% Example: +% % create two trees with same skeleton but slightly different distances +% b = [1 2; 3 4; 5 6; 7 8;9 10]; +% tr_1 = phytree(b,[.1 .2 .3 .3 .4 ]'); +% tr_2 = phytree(b,[.2 .1 .2 .3 .4 ]'); +% plot(tr_1) +% plot(tr_2) +% +% % compare if the two trees are isomorphic +% isequal(getcanonical(tr_1),getcanonical(tr_2)) +% +% See also PHYTREE, PHYTREEREAD, PHYTREE/GETBYNAME, PHYTREE/SELECT, +% PHYTREE/SUBTREE. + +% Copyright 2003-2006 The MathWorks, Inc. +% $Revision: 1.1.8.2 $ $Author: batserve $ $Date: 2006/06/16 20:06:42 $ + +if numel(tr)~=1 + error('Bioinfo:phytree:getcanonical:NoMultielementArrays',... + 'Phylogenetic tree must be an 1-by-1 object.'); +end + +numBranches = size(tr.tree,1); +numLeaves = numBranches + 1; +numLabels = numBranches + numLeaves; + +[dummy,h] = sort(tr.names(1:numLeaves)); %#ok +h(h)=1:numLeaves; + +% compute the branch width and the first element for each one +branchWidth = ones(numLabels,1); +firstElement = [h;inf(numBranches,1)]; +for ind = 1:numBranches + branchWidth(numLeaves+ind) = sum(branchWidth(tr.tree(ind,:))); + firstElement(numLeaves+ind) = min(firstElement(tr.tree(ind,:))); +end + +% find out how to re-order +[dummy,ord]=sortrows([branchWidth firstElement]); %#ok +iord(ord) = 1:numLabels; + +% re-order pointers +ptrs = sort(iord(tr.tree(ord(numLeaves+1:numLabels)-numLeaves,:)),2); + +if nargout > 1 + dist = tr.dist(ord); +end +if nargout > 2 + names = tr.names(ord); +end + + diff --git a/matlab/graph/@phyTree/getMatrix.m b/matlab/graph/@phyTree/getMatrix.m new file mode 100644 index 0000000..39a02ec --- /dev/null +++ b/matlab/graph/@phyTree/getMatrix.m @@ -0,0 +1,62 @@ +function [cm lab dist] = getmatrix(tr,varargin) +%GETMATRIX converts a Phytree Object into a relationship matrix. +% +% [MATRIX, ID, DISTANCES] = GETMATRIX(T) converts the phylogenetic tree +% object T into a logical sparse matrix, where 1's indicate that a branch +% node (row index) is connected to its child (column index). The child +% can be either another branch node or a leaf node. ID is a list of the +% labels that correspond to the rows and columns of MATRIX, first the +% leaf nodes from 1 to NUMLEAVES, then the branch nodes from NUMLEAVES+1 +% to NUMLEAVES+NUMBRANCHES, being the root the last node. DISTANCES is +% a column vector with one entry for every nonzero entry in MATRIX +% traversed columnwise and representing the distance between the branch +% node and the child. +% +% Example: +% +% T = phytreeread('pf00002.tree') +% [MATRIX ID DIST] = getmatrix(T); +% +% See also PHYTREE, PHYTREE/GET, PHYTREE/PDIST, PHYTREE/PRUNE, PHYTREETOOL. + +% Copyright 2006 The MathWorks, Inc. +% $Revision: 1.1.6.1 $ $Date: 2006/06/16 20:06:43 $ + + + %%% check arguments +if nargin > 1 + if rem(nargin,2) == 0 + error('Bioinfo:phytree:getmatrix:IncorrectNumberOfArguments',... + 'Incorrect number of arguments to %s.',mfilename); + end + okargs = {'no_input_arguments'}; + for j=1:2:nargin-2 + pname = varargin{j}; + pval = varargin{j+1}; + if(isstruct(pval)) + error('Bioinfo:phytree:getmatrix:StructParamError',... + 'parameter cannot be a struct'); + end + k = find(strncmpi(pname, okargs,length(pname))); + if isempty(k) + error('Bioinfo:phytree:getmatrix:UnknownParameterName',... + 'Unknown parameter name: %s.',pname); + elseif length(k)>1 + error('Bioinfo:phytree:getmatrix:AmbiguousParameterName',... + 'Ambiguous parameter name: %s.',pname); + else + switch(k) + end + end + end +end + +numLeaves = size(tr.tree,1)+1; +numNodes = numLeaves + numLeaves -1; +cm = sparse(repmat(numLeaves+1:numNodes,1,2),tr.tree(:),true,numNodes,numNodes); +if nargout>1 + lab = tr.names; +end +if nargout>2 + dist = tr.dist; +end diff --git a/matlab/graph/@phyTree/getNewickStr.m b/matlab/graph/@phyTree/getNewickStr.m new file mode 100644 index 0000000..797f933 --- /dev/null +++ b/matlab/graph/@phyTree/getNewickStr.m @@ -0,0 +1,133 @@ +function str = getnewickstr(tr,varargin) +%GETNEWICKSTR creates a NEWICK formatted string. +% +% STR = GETNEWICKSTR(TREE) returns the NEWICK formatted string of the +% phylogenetic tree object TREE. +% +% GETNEWICKSTR(...,'DISTANCES',false) excludes the distances from the +% output. Default is true. +% +% GETNEWICKSTR(...,'BRANCHNAMES',true) includes the branch names into the +% output. Default is false. +% +% The NEWICK tree format is found at: +% http://evolution.genetics.washington.edu/phylip/newicktree.html +% +% Example: +% +% seqs = int2nt(ceil(rand(10)*4)); % some random sequences +% dist = seqpdist(seqs,'alpha','nt'); % pairwise distances +% tree = seqlinkage(dist); % construct phylogenetic tree +% str = getnewickstr(tree) % get the NEWICK string +% +% See also PHYTREE, PHYTREEREAD, PHYTREEWRITE, PHYTREETOOL, SEQLINKAGE, +% PHYTREE/GET, PHYTREE/GETBYNAME, PHYTREE/GETCANONICAL. + +% Undocumented: +% GETSTR(...,'MULTILINE',true) introduces 'new line' characters for a +% multi-line output. This option is used by PHYTREEWRITE. Default is +% false. + +% Copyright 2003-2005 The MathWorks, Inc. +% $Revision: 1.1.8.1 $ $Author: batserve $ $Date: 2005/06/09 21:55:57 $ + +if numel(tr)~=1 + error('Bioinfo:phytree:getstr:NoMultielementArrays',... + 'Phylogenetic tree must be an 1-by-1 object.'); +end + +numBranches = size(tr.tree,1); +numLeaves = numBranches + 1; +numLabels = numBranches + numLeaves; + +% set defaults +writeDistances = true; +writeBranchNames = false; +multiLine = false; + +nvarargin = numel(varargin); +if nvarargin + if rem(nvarargin,2) + error('Bioinfo:phytree:getstr:IncorrectNumberOfArguments',... + 'Incorrect number of arguments to %s.',mfilename); + end + okargs = {'multiline','distances','branchnames'}; + for j=1:2:nvarargin + pname = varargin{j}; + pval = varargin{j+1}; + k = find(strncmpi(pname,okargs,numel(pname))); + if isempty(k) + error('Bioinfo:phytree:getstr:UnknownParameterName',... + 'Unknown parameter name: %s.',pname); + elseif length(k)>1 + error('Bioinfo:AmbiguousParameterName',... + 'Ambiguous parameter name: %s.',pname); + else + switch(k) + case 1 % multi-lines + multiLine = opttf(pval); + if isempty(multiLine) + error('Bioinfo:phytree:getstr:multiLineOptionNotLogical',... + '%s must be a logical value, true or false.',... + upper(char(okargs(k)))); + end + case 2 % write distances + writeDistances = opttf(pval); + if isempty(writeDistances) + error('Bioinfo:phytree:getstr:writeDistancesOptionNotLogical',... + '%s must be a logical value, true or false.',... + upper(char(okargs(k)))); + end + case 3 % write branch names + writeBranchNames = opttf(pval); + if isempty(writeBranchNames) + error('Bioinfo:phytree:getstr:writeBranchNamesOptionNotLogical',... + '%s must be a logical value, true or false.',... + upper(char(okargs(k)))); + end + end + end + end +end + +for i=1:numLabels-1; + if (i<=numLeaves || writeBranchNames) + if writeDistances + namedist{i} = [tr.names{i} ':' num2str(tr.dist(i))]; %#ok + else + namedist{i} = tr.names{i}; + end + elseif writeDistances && ~writeBranchNames + namedist{i} = [':' num2str(tr.dist(i))]; %#ok + else + namedist{i} = ''; + end +end + +if writeBranchNames + namedist{numLabels} = [tr.names{numLabels} ';']; +else + namedist{numLabels} = ';'; +end + +for i=1:numBranches + if tr.tree(i,1) > numLeaves + t1 = branchstr{tr.tree(i,1)}; + else + t1 = namedist{tr.tree(i,1)}; + end + if tr.tree(i,2) > numLeaves + t2 = branchstr{tr.tree(i,2)}; + else + t2 = namedist{tr.tree(i,2)}; + end + branchstr{i+numLeaves} = ... + [ '(\n' t1 ',\n' t2 ')\n' , namedist{i+numLeaves} ]; %#ok +end + +str = sprintf(branchstr{numLabels}); + +if ~multiLine + str = strrep(str,sprintf('\n'),''); +end + diff --git a/matlab/graph/@phyTree/openVar.m b/matlab/graph/@phyTree/openVar.m new file mode 100644 index 0000000..47e48d6 --- /dev/null +++ b/matlab/graph/@phyTree/openVar.m @@ -0,0 +1,12 @@ +function openvar(name, tr) %#ok +%OPENVAR Opens a phylogenetic tree object for graphical editing. + +% Copyright 2003-2006 The MathWorks, Inc. +% $Revision: 1.1.6.3 $ $Author: batserve $ $Date: 2006/06/16 20:06:44 $ + +try + view(tr); +catch + % rethrows the error into a dlg window + errordlg(lasterr, 'Inspection error', 'modal'); +end diff --git a/matlab/graph/@phyTree/pDist.m b/matlab/graph/@phyTree/pDist.m new file mode 100644 index 0000000..e543872 --- /dev/null +++ b/matlab/graph/@phyTree/pDist.m @@ -0,0 +1,153 @@ +function [dist,comm] = pdist(tr,varargin) +%PDIST computes the pairwise patristic distance. +% +% D = PDIST(TREE) returns a vector D containing the patristic +% distances between every possible pair of leaf nodes in the phylogenic +% tree object TREE. The distance is computed following the path through +% the branches of the tree. +% +% The output vector D is arranged in the order of ((2,1),(3,1),..., +% (M,1),(3,2),...(M,3),.....(M,M-1)), i.e. the lower left triangle of the +% full M-by-M distance matrix. To get the distance between the Ith and +% Jth nodes (I > J) use the formula D((J-1)*(M-J/2)+I-J). M is the +% number of leaves) +% +% D = PDIST(...,'NODES',N) indicates the nodes to be included in the +% computation. N can be 'leaves' (default) or 'all'. In the former +% case the output will be order as before, but M is the total number of +% nodes in the tree, i.e. NUMLEAVES+NUMBRANCHES. +% +% D = PDIST(...,'SQUAREFORM',true) coverts the output into a square +% format, so that D(I,J) denotes the distance between the Ith and the Jth +% node. The output matrix is symmetric and has a zero diagonal. +% +% D = PDIST(...,'CRITERIA',C) changes the criteria used to relate pairs. +% C can be 'distance' (default) or 'levels'. +% +% [D,C] = PDIST(TREE) returns in C the index of the closest common parent +% nodes for every possible pair of query nodes. +% +% Example: +% +% % get the tree distances between every leaf +% tr = phytreeread('pf00002.tree') +% dist = pdist(tr,'nodes','leaves','squareform',true) +% +% See also SEQPDIST, SEQLINKAGE, PHYTREE, PHYTREETOOL. + +% Copyright 2003-2005 The MathWorks, Inc. +% $Revision: 1.1.6.6 $ $Author: batserve $ $Date: 2005/06/09 21:55:59 $ + +if numel(tr)~=1 + error('Bioinfo:phytree:pdist:NoMultielementArrays',... + 'Phylogenetic tree must be an 1-by-1 object.'); +end + +% set default +squaredOutput = false; +CriteriaIsLevels = false; +outNodes = 'leaves'; + +numBranches = size(tr.tree,1); +numLeaves = numBranches + 1; +numLabels = numBranches + numLeaves; + +% process input arguments +if nargin > 1 + if rem(nargin,2) == 0 + error('Bioinfo:phytree:pdist:IncorrectNumberOfArguments',... + 'Incorrect number of arguments to %s.',mfilename); + end + okargs = {'nodes','squareform','criteria'}; + for ind = 2 : 2: nargin + pname = varargin{ind-1}; + pval = varargin{ind}; + k = find(strncmpi(pname,okargs,numel(pname))); + if isempty(k) + error('Bioinfo:phytree:pdist:UnknownParameterName',... + 'Unknown parameter name: %s.',pname); + elseif length(k)>1 + error('Bioinfo:phytree:pdist:AmbiguousParameterName',... + 'Ambiguous parameter name: %s.',pname); + else + switch(k) + case 1 % nodes + okNodes = {'leaves','all','branches'}; + h = strmatch(lower(pval),okNodes ); %#ok + if isempty(h) + error('Bioinfo:phytree:pdist:IncorrectReferenceNode',... + 'Incorrect node selection') + else + outNodes = okNodes{h}; + end + case 2 % squareform + squaredOutput = (pval == true); + case 3 % criteria + h = strmatch(lower(pval),{'levels','distance'}); %#ok + if numel(h) + CriteriaIsLevels = (h == 1); + else error('Bioinfo:phytree:pdist:InvalidCriteria',... + 'Invalid string for criteria.'); + end + end + end + end +end + + +% create indexes to work only the lower leff triangle +m = numLabels*(numLabels-1)/2; +p = cumsum([1 (numLabels-1):-1:2]); +I = ones(m,1); I(p) = [2 3-numLabels:0]; +J = zeros(m,1); J(p) = 1; +H = I; H(p) = 2:numLabels; +I = cumsum(I); J = cumsum(J); H = cumsum(H); + +switch outNodes + case 'leaves' + outSelection = (I <= numLeaves) & (J <= numLeaves); + case 'all' + outSelection = (I>0); + case 'branches' + outSelection = (I > numLeaves) & (J > numLeaves); +end +% find closest common branch for every pair of nodes +% diagonal is invalid ! but not needed + +% initializing full matrix +commf = zeros(numLabels,'int16'); +children = false(1,numLabels); +for ind = numBranches:-1:1 + children(:) = false; + children(ind+numLeaves) = true; + for ind2 = ind:-1:1 + if children(ind2+numLeaves) + children(tr.tree(ind2,:))=true; + end + end + commf(children,children)=int16(ind); +end + +% output vector with the lower leff triangle closest common branches +comm = double(commf(H)); + +% compute the distance to root for every node +cdist = tr.dist; +if CriteriaIsLevels % set to count levels instead + cdist(:) = 1; + cdist(end) = 0; +end +for ind = numBranches:-1:1 + cdist(tr.tree(ind,:)) = cdist(tr.tree(ind,:)) + cdist(ind+numLeaves); +end + +% compute pairwise distance +dist = cdist(I)+cdist(J)-2*cdist(comm+numLeaves); + +dist = dist(outSelection); +comm = comm(outSelection); + +if squaredOutput + dist = squareform(dist); + comm = squareform(comm); +end diff --git a/matlab/graph/@phyTree/phyTree.m b/matlab/graph/@phyTree/phyTree.m new file mode 100644 index 0000000..bf7f9bc --- /dev/null +++ b/matlab/graph/@phyTree/phyTree.m @@ -0,0 +1,235 @@ +function tr = phyTree(varargin) +%PHYTREE Phylogenetic tree object. +% +% TREE = PHYTREE(B) creates an ultrametric phylogenetic tree object. B is +% a numeric array of size [NUMBRANCHES X 2] in which every row represents +% a branch of the tree and it contains two pointers to the branches +% or leaves nodes which are its children. Leaf nodes are numbered from 1 +% to NUMLEAVES and branch nodes are numbered from NUMLEAVES + 1 to +% NUMLEAVES + NUMBRANCHES. Note that since only binary trees are allowed, +% then NUMLEAVES = NUMBRANCHES + 1. Branches are defined in chronological +% order, i.e. B(i,:) > NUMLEAVES + i. As a consequence, the first row can +% only have pointers to leaves and the last row must represent the 'root' +% branch. Parent-child distances are set to the unit or by the ultrametric +% condition if child is a leaf. +% +% TREE = PHYTREE(B,D) creates an additive phylogenetic tree object with +% branch distances defined by D. D is a numeric array of size [NUMNODES X +% 1] with the distances of every child node (leaf or branch) to its parent +% branch. NUMNODES = NUMLEAVES + NUMBRANCHES. D(end), the distance +% associated to the root node, is meaningless. +% +% TREE = PHYTREE(B,C) creates an ultrametric phylogenetic tree object with +% branch distances defined by C. C is a numeric array of size [NUMBRANCHES +% X 1] with the coordinates of every branch node. In ultrametric tress all +% the leaves are at the same location (i.e. same distance to the root). +% +% TREE = PHYTREE(BC) creates an ultrametric phylogenetic binary tree +% object with branch pointers in BC(:,[1 2]) and branch coordinates in +% BC(:,3). Same as PHYTREE(B,C). +% +% TREE = PHYTREE(...,N) specifies the names for the leaves and/or the +% branches. N is a cell of strings. If NUMEL(N)==NUMLEAVES then the names +% are assigned chronologically to the leaves. If NUMEL(N)==NUMBRANCHES the +% names are assigned to the branch nodes. If NUMEL(N)==NUMLEAVES + +% NUMBRANCHES all the nodes are named. Unassigned names default to 'Leaf +% #' and/or 'Branch #' as required. +% +% TREE = PHYTREE creates an empty phylogenectic tree object +% +% Example: +% +% % create an ultrametric tree +% b = [1 2; 3 4; 5 6; 7 8;9 10]; +% t = phytree(b); +% view(t) +% +% % create an ultrametric tree with specified branch distances +% bd = [.1 .2 .3 .3 .4 ]'; +% b = [1 2; 3 4; 5 6; 7 8;9 10]; +% t = phytree(b,bd); +% view(t) +% +% See also PHYTREE/GET, PHYTREE/SELECT, PHYTREEREAD, PHYTREETOOL, +% PHYTREEWRITE, SEQLINKAGE, SEQNEIGHJOIN, SEQPDIST. + + +% Copyright 2003-2006 The MathWorks, Inc. +% $Revision: 1.1.6.11.2.1 $ $Author: batserve $ $Date: 2006/07/27 21:37:49 $ + +justVerifyValidity = false; + +switch nargin + case 0 + tr.tree = zeros(0,2); + tr.dist = zeros(0,1); + tr.names = {}; + case 1 + B = varargin{1}; + case 2 + B = varargin{1}; + if iscell(varargin{2}) + N = varargin{2}; + else + D = varargin{2}; + end + case 3 + B = varargin{1}; + D = varargin{2}; + N = varargin{3}; + otherwise + error('Bioinfo:phytree:IncorrectNumberOfArguments',... + 'Incorrect number of arguments to %s.',mfilename); +end + +if nargin==1 && isstruct(B) && isfield(B,'tree') && isfield(B,'names') && isfield(B,'dist') + N = B.names; + D = B.dist; + B = B.tree; + tr.tree = B; + tr.dist = D; + tr.names = N; + justVerifyValidity = true; +end + +if nargin + if isnumeric(B) + switch size(B,2) + case 2 + % ok + case 3 + D = B(:,3); + B(:,3)=[]; + otherwise + error('Bioinfo:phytree:IncorrectSize','Incorrect size for B or BC') + end + else + error('Bioinfo:phytree:IncorrectType','Incorrect type for B or BC') + end + + % test B + if sum(diff(sort(B(:)))~=1) || (min(B(:))~=1) + error('Bioinfo:phytree:IncompleteTree','Branch architecture is not complete') + end + numBranches = size(B,1); + numLeaves = numBranches + 1; + numLabels = numBranches + numLeaves; + h=all(B'>=repmat(numLeaves+1:numLabels,2,1)); + if any(h) + error('Bioinfo:phytree:NonChronologicalTree',... + ['Branch(es) not in chronological order: [' num2str(find(h)) ']']) + end + + if exist('D','var') + if ~isnumeric(D) || any(D(:)<0) || ~all(isreal(D)) || size(D,2)~=1 + error('Bioinfo:phytree:DistancesNotValid',... + 'Distances must be a column vector of real positive numbers') + end + switch size(D,1) + case numBranches + D = [zeros(numLeaves,1); D]; % add ultrametric distances of leaves + D(B) = D((numLeaves+(1:numBranches))'*[1 1])-D(B); %dist of edges + D(end) = 0; % set root at zero + case numLabels + % ok + otherwise + error('Bioinfo:phytree:DistancesNotValid',... + 'Distances must agree either with the number of branches (C) or total nodes (D)') + end + else % set defaut D + % look for parents + P = zeros(numLabels,1); + P(B) = repmat((1:numBranches)',1,2); + P(end) = numBranches; + % look at which level is every branch + L = zeros(numLabels,1); + for ind = 1:numBranches + L(ind+numLeaves) = max(L(B(ind,:))+1); + end + D = L(P+numLeaves)-L; + end + + % set default names + for ind = 1:numLeaves + names{ind}=['Leaf ' num2str(ind)]; %#ok + end + for ind = 1:numBranches + names{ind+numLeaves}=['Branch ' num2str(ind)]; + end + + if exist('N','var') + if ~iscell(N) + error('Bioinfo:phytree:NamesNotValid',... + 'Names must be supplied with a cell of strings') + end + switch numel(N) + case numLabels + h = 1:numLabels; + case numLeaves + h = 1:numLeaves; + case numBranches + h = numLeaves+1:numLabels; + otherwise + error('Bioinfo:phytree:NamesNotValid',... + 'Names must agree either with the number of branches, number of leaves, or total nodes') + end + for ind = 1:length(h); + str = N{ind}; + if ~ischar(str) + error('Bioinfo:phytree:NamesNotValid',... + 'Names must be valid strings') + end + names{h(ind)}=str; + end + % check that none of the names is empty + for ind = 1:numLabels + if isempty(names{ind}) + if ind > numLeaves + names{ind} = ['Branch ' num2str(ind-numLeaves)]; + else + names{ind} = ['Leaf ' num2str(ind)]; + end + end + end + if numel(unique(names))~=numLabels + error('Bioinfo:phytree:NamesNotUnique',... + 'Names for leaves and branches must be unique') + end + end + + % check and corrects a non-monotonic tree + monotonicWarning = false; + for ind = 1:numBranches + if any(D(B(ind,:))<0) + monotonicWarning = true; + tmp = min(D(B(ind,:))); + D(B(ind,:)) = D(B(ind,:)) - tmp; + D(numLeaves+ind) = D(numLeaves+ind) + tmp; + end + end + if monotonicWarning + warning('Bioinfo:phytree:NonMonotonicTree',... + 'Non consistent branch distances; \n Incremented branch lengths to hold a Monotonic Phylogenetic Tree') + end + + if justVerifyValidity + tr = class(tr,'phytree'); + return + end + + tr.tree = B; + tr.dist = D; + tr.names = names(:); + + % reorder such that there will be no crossings in the displayed tree + tr = prettyOrder(tr); + +end %if nargin + +% for trees of only one branch correct dimensions +% if size(tr.tree,2) <2 tr.tree = tr.tree'; end + +% Makes the tree a class +tr = class(tr,'phyTree'); + + diff --git a/matlab/graph/@phyTree/private/Opttf.m b/matlab/graph/@phyTree/private/Opttf.m new file mode 100644 index 0000000..07be462 --- /dev/null +++ b/matlab/graph/@phyTree/private/Opttf.m @@ -0,0 +1,31 @@ +function tf = opttf(pval) +%OPTTF determines whether input options are true or false + +% Copyright 2003-2004 The MathWorks, Inc. +% $Revision: 1.3.4.2 $ $Date: 2004/12/24 20:42:39 $ + + +if islogical(pval) + tf = all(pval); + return +end +if isnumeric(pval) + tf = all(pval~=0); + return +end +if ischar(pval) + truevals = {'true','yes','on','t'}; + k = any(strcmpi(pval,truevals)); + if k + tf = true; + return + end + falsevals = {'false','no','off','f'}; + k = any(strcmpi(pval,falsevals)); + if k + tf = false; + return + end +end +% return empty if unknown value +tf = logical([]); \ No newline at end of file diff --git a/matlab/graph/@phyTree/private/prettyOrder.m b/matlab/graph/@phyTree/private/prettyOrder.m new file mode 100644 index 0000000..1f06509 --- /dev/null +++ b/matlab/graph/@phyTree/private/prettyOrder.m @@ -0,0 +1,39 @@ +function tr = prettyOrder(tr) +%PRETTYORDER Reorders the leaf nodes to avoid branch crossings. +% +% T2 = PRETTYORDER(T1) Reorders the leaf nodes in the phylogenetic tree +% T1 such that the layout of the tree does not contain branch crossings. + +% Copyright 2003-2005 The MathWorks, Inc. +% $Revision: 1.1.8.1 $ $Author: batserve $ $Date: 2005/06/09 21:56:11 $ + +numBranches = size(tr.tree,1); +numLeaves = numBranches + 1; +numLabels = numBranches + numLeaves; + +L = [ones(numLeaves,1); zeros(numBranches,1)]; +for ind = 1 : numBranches + L(ind+numLeaves) = sum(L(tr.tree(ind,:))); +end +X = zeros(numLabels,1); +for ind = numBranches:-1:1 + X(tr.tree(ind,:)) = tr.dist(tr.tree(ind,:))+X(ind+numLeaves); +end +Li = zeros(1,numLabels); Ls = Li; +Ls(numLabels) = numLeaves; +for ind = numBranches:-1:1 + Ls(tr.tree(ind,:)) = Ls(ind+numLeaves); + Li(tr.tree(ind,:)) = Li(ind+numLeaves); + if diff(X(tr.tree(ind,:)))>=0 + Ls(tr.tree(ind,1)) = Li(tr.tree(ind,1)) + L(tr.tree(ind,1)); + Li(tr.tree(ind,2)) = Ls(tr.tree(ind,2)) - L(tr.tree(ind,2)); + else + Ls(tr.tree(ind,2)) = Li(tr.tree(ind,2)) + L(tr.tree(ind,2)); + Li(tr.tree(ind,1)) = Ls(tr.tree(ind,1)) - L(tr.tree(ind,1)); + end +end + +tr.names(Ls(1:numLeaves))=tr.names(1:numLeaves); +tr.dist(Ls(1:numLeaves))=tr.dist(1:numLeaves); +Ls(numLeaves+1:numLabels)=numLeaves+1:numLabels; +tr.tree = Ls(tr.tree); diff --git a/matlab/graph/@phyTree/reRoot.m b/matlab/graph/@phyTree/reRoot.m new file mode 100644 index 0000000..e1f43f4 --- /dev/null +++ b/matlab/graph/@phyTree/reRoot.m @@ -0,0 +1,206 @@ +function tr = reRoot(tr,node,distance) +%REROOT changes the root of a phylogenetic tree. +% +% T2 = REROOT(T1) changes the root of the phylogenetic tree T1 using the +% mid-point method. The mid-point is the location where the means of +% the branch lengths of either side of the tree are equalized. The +% original root is deleted. +% +% T2 = REROOT(T1,NODE) changes the root to the branch indexed by NODE. +% The new root is placed at half the distance between NODE and its +% parent. +% +% T2 = REROOT(T1,NODE,DISTANCE) re-roots T1 by placing the new root at a +% given DISTANCE from the reference NODE towards the root of the tree. +% +% Note: The new branch in T2 representing the root is labeled as 'Root'. +% +% Example: +% +% % Create an ultrametric tree +% tr_1 = phytree([5 7;8 9;6 11; 1 2;3 4;10 12;14 16;15 17;13 18]) +% plot(tr_1,'branchlabels',true) +% +% % Place the new root at 'Branch 7' +% sel = getbyname(tr_1,'Branch 7'); +% tr_2 = reroot(tr_1,sel) +% plot(tr_2,'branchlabels',true) +% +% % The mid-point of the original tree was the root, since it was an +% % ultrametric tree +% tr_3 = reroot(tr_2) +% plot(tr_3,'branchlabels',true) +% +% See also PHYTREE, PHYTREE/GET, PHYTREE/GETBYNAME, PHYTREE/PRUNE, +% PHYTREE/SELECT, SEQNEIGHJOIN. + +% Copyright 2003-2006 The MathWorks, Inc. +% $Revision: 1.1.8.3 $ $Author: batserve $ $Date: 2006/06/16 20:06:46 $ + +if numel(tr)~=1 + error('Bioinfo:phytree:reroot:NoMultielementArrays',... + 'Phylogenetic tree must be an 1-by-1 object.'); +end +numBranches = size(tr.tree,1); +numLeaves = numBranches + 1; +numLabels = numBranches + numLeaves; + +if nargin == 1; + [node,distance] = midpoint(tr); +else + % validate node + if islogical(node) + if any(numel(node) == [numLabels numLeaves]) + node = find(node); + elseif numel(node) == numBranches + node = find(node) + numLeaves; + else + error('Bioinfo:phytree:reroot:IncorrectSizeInputVector',... + 'Logical vector must have the same number of elements as nodes in the Phylogenetic Tree.'); + end + end + if ~isscalar(node) || node<1 || node>numLabels + error('Bioinfo:phytree:reroot:InvalidInputNode',... + 'Invalid value for NODE.'); + end + node = round(node); + % when no distance is given put the root at half the branch + if nargin<3 + distance = tr.dist(node)/2; + end +end + +% find parents for every tree node +parent = zeros(numLabels,1); +parent(tr.tree) = repmat(numLeaves+1:numLabels,1,2); + +% validate distance, if necessary shift the origin node +if ~isscalar(distance) || distance<0 + error('Bioinfo:phytree:reroot:InvalidInputDistance',... + 'Invalid value for DISTANCE.'); +end + +% validate distance, if necessary shift the origin node +while (tr.dist(node)<=distance) && (node ~= numLabels) + distance = distance - tr.dist(node); + node = parent(node); +end + +if node == numLabels + if distance>0 + warning('Bioinfo:phytree:reroot:BeyondRoot',... + 'Distance goes beyond the root, tree unchanged.') + else + tr.names{end} = 'Root'; + end + return +end + +% check if we just need to move the branches of current root +if parent(node) == numLabels + tr.dist(setxor(tr.tree(end,:),node)) = ... + sum(tr.dist(tr.tree(end,:))) - distance; + tr.dist(node) = distance; + tr.names{end} = 'Root'; + return +end + +% path to root from node and bros for every point in the path +path2r = false(numBranches,1); +pathBros = []; +me = node; +par = parent(node); +while par + pathBros = [pathBros;setxor(tr.tree(par-numLeaves,:),me)]; + path2r(par-numLeaves) = true; + me = par; + par = parent(par); +end +path2rInd=find(path2r)+numLeaves; + +% new tree pointers +tr.tree = [tr.tree(~path2r,:);... + [[pathBros(end-1:-1:1);node],[pathBros(end);path2rInd(end-1:-1:1)]]]; + +% swapping distances in the nodes that belong to the path2r +tr.dist(pathBros(end)) = tr.dist(pathBros(end)) + tr.dist(path2rInd(end-1)); +tr.dist(path2rInd(2:end-1)) = tr.dist(path2rInd(1:end-2)); +tr.dist(parent(node)) = tr.dist(node) - distance; +tr.dist(node) = distance; + +% some branches changed positions, need to tree +permuta = [(1:numLeaves)';find(~path2r)+numLeaves;... + path2rInd(end-1:-1:1);numLabels]; +ipermuta(permuta)=1:numLabels; +tr.tree = ipermuta(tr.tree); +tr.dist = tr.dist(permuta); +tr.names = tr.names(permuta); +tr.names{end} = 'Root'; + +% re-order leaves for no branch crossings +tr = prettyorder(tr); + +%-% ---------------------------------------------------------------- +% Selects the point where the mean of the branch length is equalized +function [node,distance] = midpoint(tr) + +numBranches = size(tr.tree,1); +numLeaves = numBranches + 1; +numLabels = numBranches + numLeaves; + + +branchWidth = ones(numLabels,1); +downDist = zeros(numLabels,1); +upDist = zeros(numLabels,1); +% cumulative distance and width downwards the tree +for ind = 1:numBranches + branchWidth(numLeaves+ind) = sum(branchWidth(tr.tree(ind,:))); + downDist(numLeaves+ind) = sum(downDist(tr.tree(ind,:)) + ... + tr.dist(tr.tree(ind,:)).*branchWidth(tr.tree(ind,:))); +end +% backpropagate distances +for ind = numBranches:-1:1 + upDist(tr.tree(ind,:)) = downDist(tr.tree(ind,[2 1])) +... + upDist(ind+numLeaves) + ... + tr.dist(ind+numLeaves).*(numLeaves-branchWidth(ind+numLeaves)) + ... + tr.dist(tr.tree(ind,[2 1])).*(branchWidth(tr.tree(ind,[2 1]))); +end + +% find all possible midpoints, solve this eq for every edge +% ud/Nu + (x)*e = dd/Nd + (1-x)*e +% ud = cumulative upwards distances +% dd = cumulative downwards distances +% Nu = number of leaves in the upper braches +% Nd = number of leaves in the lower braches +% e = distance of current edge + +h = tr.dist~=0; % root can not be in an edge which length is zero +h(numLabels) = false; % the route can not be segmented +x = inf(numLabels,1); +x(h) = (upDist(h)./branchWidth(h) - ... + downDist(h)./(numLeaves-branchWidth(h)))./tr.dist(h)/2 + 1/2; + +x(h) = (upDist(h)./(numLeaves-branchWidth(h)) - ... + downDist(h)./(branchWidth(h)))./tr.dist(h)/2 + 1/2; + +% find all possible roots +h = find(x>=0 & x<=1); +if isempty(h) + [dummy,h] = min(abs(x-1/2)); %#ok +end +% pick the most balanced one +[d,g]=min(abs(branchWidth(h)*2-numLeaves)); %#ok + +node = h(g); +ratio = min(max(x(h(g)),0),1); + +% if ratio is 1 then better pick the parent +if ratio == 1 + [node,dummy] = find(tr.tree==node); %#ok + node = node + numLeaves; + ratio = 0; +end + +% change the ratio (x) to the distance to the selected node +distance = ratio .* tr.dist(node); + diff --git a/matlab/graph/allfreqsnew2.m b/matlab/graph/allfreqsnew2.m new file mode 100644 index 0000000..6126e8a --- /dev/null +++ b/matlab/graph/allfreqsnew2.m @@ -0,0 +1,90 @@ +function [counts,noalle_est,prior,adjprior,rawalleles] = allfreqsnew2(rawdata, noalle_est) +% Filename: allfreqsnew2.m +% [counts,noalle,prior,adjprior,rawalleles] = allfreqsnew(rawdata) +% +% Description: +% rawdata has n rows (2 x #individuals) and n(l) first +% colums are the loci. The last column is the subpopindex +% prior is a created matrix of positive Dirichlet hyperparameters +% missing data is filtered out +% !!!NEW!!!zeros are accepted as allele codes and any negative numbers as missing data. + +% Modified by: Jing Tang +SCALE = 1; +dime=size(rawdata); +noalle=zeros(dime(2)-1,1); +rawalleles=cell(1,dime(2)-1); +for i=1:dime(2)-1 + noalle(i)=length(unique(rawdata(:,i))); +end +for i=1:dime(2)-1 + if length(find(rawdata(:,i)<=0))>0 + noalle(i)=noalle(i)-1; + end +end + +% Fomulate the raw data such that the value i in a entry denotes the ith +% alleles. +for i=1:dime(2)-1 + rawalles=unique(rawdata(:,i)); + % rawalles = [1:noalle(i)]'; + if rawalles(1)<=0 + rawalles(1)=-999; + end + rawalleles{i} = rawalles; %rawalleles!!! + if rawalles(1)<0 + for j=2:noalle(i)+1 + %rawdata(find(rawdata(:,i)==rawalles(j)),i)=ones(length(find(rawdata(:,i)==rawalles(j))),1)*(j-1); + rawdata(logical(rawdata(:,i)==rawalles(j)),i)=ones(length(find(rawdata(:,i)==rawalles(j))),1)*(j-1); + end + else + for j=1:noalle(i) + % rawdata(find(rawdata(:,i)==rawalles(j)),i)=ones(length(find(rawdata(:,i)==rawalles(j))),1)*j; + rawdata(logical(rawdata(:,i)==rawalles(j)),i)=ones(length(find(rawdata(:,i)==rawalles(j))),1)*j; + end + end +end + +% ALLOWED_MEMORY = 50; % in unit of megabyte. +% n1 = max(noalle_est); +% n2 = dime(2)-1; +% n3 = double(max(rawdata(:,dime(2)))); +% ncells = n1*n2*n3; +% memory_used = ncells/(1024*1024); % using uint8 format. +% if memory_used < ALLOWED_MEMORY +% counts=zeros(n1,n2,n3,'uint8'); +% else +% nbatches = ceil(memory_used/ALLOWED_MEMORY); +% n3_in_batch = ceil(n3/nbatches); +% counts = cell(nbatches,1); +% for i=1:nbatches-1 +% % counts = cat(3,counts,uint16(zeros(n1,n2,n3_in_batch))); +% counts{i} = zeros(n1,n2,n3_in_batch,'uint8'); +% end +% % counts = cat(3, counts, uint16(zeros(n1,n2,n3-n3_in_batch*(nbatches-1)))); +% counts{i} = zeros(n1,n2,n3-n3_in_batch*(nbatches-1),'uint8'); +% end + + + +counts = zeros(max(noalle_est),dime(2)-1,max(rawdata(:,dime(2))),'uint8'); +for i=1:dime(1) + for j=1:dime(2)-1 + if rawdata(i,j)>0 + counts(rawdata(i,j),j,rawdata(i,dime(2)))=... + counts(rawdata(i,j),j,rawdata(i,dime(2)))+1; + end + end +end + +maxnoalle = max(noalle_est); +% prior = []; +prior=zeros(maxnoalle,dime(2)-1); +for i=1:dime(2)-1 + prior(:,i) = [SCALE*ones(noalle_est(i),1)/noalle_est(i);zeros(maxnoalle-noalle_est(i),1)]; +end + +adjprior=prior; +for i=1:dime(2)-1 + adjprior(:,i)=adjprior(:,i)+[zeros(noalle_est(i),1);ones(maxnoalle-noalle_est(i),1)]; +end diff --git a/matlab/graph/chooseDistance.m b/matlab/graph/chooseDistance.m new file mode 100644 index 0000000..7de4105 --- /dev/null +++ b/matlab/graph/chooseDistance.m @@ -0,0 +1,242 @@ +function [D]=chooseDistance(aln) +% CHOOSEDISTANCE Display a quest dialog for choosing a genetic distance +% input: a structure containing part of variables from the mixture result. +% ouput: a symmetric distance matrix + +if (isempty(aln)) + error('Need input.'); +end + +if aln.npops == 1 + disp('*** ERROR: no population structure is found.'); + return +end + +ButtonName=questdlg('Choose a type of genetic distance?', ... + 'Select genetic distance', ... + 'KL','Nei','Hamming','KL'); + +switch ButtonName, + case 'KL' + disp('Using KL distance.'); + D = dn_kl(aln); + case 'Nei' + disp('Using Nei distance.'); + D = dn_nei(aln); + case 'Hamming' + disp('Using Hamming distance. Please wait...'); + D = dn_hamming(aln); +% case 'LogDet' +% disp('Using LogDet distance'); +% D= dn_logdet(aln); + otherwise + D = []; +end + +D = D + D'; % make it symmetric +%-------------------------------------------------------------------------- +% SUBFUNCTIONS +%-------------------------------------------------------------------------- +function [dist_mat] = dn_kl(aln) +npops = aln.npops; +COUNTS = aln.COUNTS; +adjprior = aln.adjprior; +% data = noIndex(aln.data, aln.noalle); +% partition = aln.partition; + +dist_mat = zeros(npops, npops); +maxnoalle = size(COUNTS,1); +nloci = size(COUNTS,2); +d = zeros(maxnoalle, nloci, npops); + +prior = adjprior; +prior(find(prior==1))=0; +nollia = find(all(prior==0)); %Lokukset, joissa oli havaittu vain yhtä alleelia. +prior(1,nollia)=1; +for pop1 = 1:npops + d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))+prior) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))+prior),maxnoalle,1); + %dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1); +end + +for pop1 = 1:npops + % rivi = [blanks(2-floor(log10(pop1))) num2str(pop1) ' ']; + for pop2 = 1:pop1-1 + dist1 = d(:,:,pop1); dist2 = d(:,:,pop2); + div12 = sum(sum(dist1.*log2((dist1+10^-10) ./ (dist2+10^-10))))/nloci; + div21 = sum(sum(dist2.*log2((dist2+10^-10) ./ (dist1+10^-10))))/nloci; + div = (div12+div21)/2; + dist_mat(pop1,pop2) = div; + end +end + +%-------------------------------------------------------------------------- +function [dist_mat] = dn_nei(aln) + +npops = aln.npops; +COUNTS = aln.COUNTS; +% adjprior = aln.adjprior; +% data = aln.data; +% partition = aln.partition; + +dist_mat = zeros(npops, npops); +maxnoalle = size(COUNTS,1); +nloci = size(COUNTS,2); +d = zeros(maxnoalle, nloci, npops); + +for pop1 = 1:npops + d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))),maxnoalle,1); + %dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1); +end +for pop1 = 1:npops + % rivi = [blanks(2-floor(log10(pop1))) num2str(pop1) ' ']; + for pop2 = 1:pop1-1 + dist1 = d(:,:,pop1); dist2 = d(:,:,pop2); + div1 = sum(sum(dist1.*dist2)); + div2 = sqrt(sum(sum(dist1.^2)))*sqrt(sum(sum(dist2.^2))); + div = -log(div1/div2); + dist_mat(pop1,pop2) = div; + end +end + +%-------------------------------------------------------------------------- +function [dist_mat] = dn_hamming(aln) + +npops = aln.npops; +data = noIndex(aln.data, aln.noalle); +partition = aln.partition; +dist_mat = zeros(npops, npops); +for pop1 = 1:npops + for pop2 = 1:pop1-1 + dist_mat(pop1,pop2) = hamming_dist(data(logical(partition==pop1),[1:end-1]),... + data(logical(partition==pop2),[1:end-1])); + end +end + +function dist = hamming_dist(data1,data2) +[length1,nloci] = size(data1); +length2 = size(data2,1); +dist1 = 0; +for i = 1:length1 + dist2 = 0; + for j = 1:length2 + dist2 = dist2 + sum(data1(i,:)~=data2(j,:))/nloci; + end + dist1 = dist1 + dist2/length2; +end +dist = dist1/length1; + +%-------------------------------------------------------------------------- +function [D]=dn_logdet(aln) +%DN_LOGDET - Log-det (paralinear) distance +%The LogDet model computes the distance from the determinant of the matrix of +%co-occurrence of nucleotides in the two species, according to the formula +% +% D = - 1/4(loge(|F|) - 1/2loge(fA1 fC1 fG1 fT1 fA2 fC2 fG2 fT2)) +% +%Where F is a matrix whose (i,j) element is the fraction of sites at which base +%i occurs in one species and base j occurs in the other. fji is the fraction of +%sites at which species i has base j. The LogDet distance cannot cope with +%ambiguity codes. It must have completely defined sequences. One limitation of +%the LogDet distance is that it may be infinite sometimes, if there are too many +%changes between certain pairs of nucleotides. This can be particularly +%noticeable with distances computed from bootstrapped sequences. +% +% Syntax: [D]=dn_logdet(aln) +% +% Inputs: +% aln - Alignment structure +% +% Outputs: +% D - Distance matrix +% +% See also: + +% Molecular Biology & Evolution Toolbox, (C) 2006 +% Author: James J. Cai +% Email: jamescai@hku.hk +% Website: http://bioinformatics.org/mbetoolbox/ +% Last revision: 5/3/2006 + +if (isstruct(aln)), + S=aln.data; +else + S=aln; +end + +[n,m] = size(S); +D = zeros(n,n); + +for i=1:n-1 + for j=i+1:n + D(i,j) = d_logdet(S(i,:), S(j,:)); + D(j,i) = D(i,j); + end +end + +%-------------------------------------------------------------------------- +function d=d_logdet(seq1, seq2) +[S,gap] = countntchange(seq1, seq2); +% S=count_ntchanges(seq1, seq2); +if (det(S)<=0) + d=inf; +else + f1=sum(S); + f2=sum(S'); + d=(-1/4)*(log(det(S))-(1/2)*log( prod(f1)*prod(f2))); +end + +%-------------------------------------------------------------------------- +function [D,gap]=countntchange(s1,s2) +%COUNTNTCHANGE - Count nucleotide changes in two DNA sequences +%D is a 4x4 array, with bases in seq1 along top, seq2 along side, +%in order A,C,G,T. +% +% Syntax: [D,gap]=countntchange(s1,s2) +% +% Inputs: +% s1 - Sequence 1 vector +% s2 - Sequence 2 vector +% +% Outputs: +% D - Codon Adaptation Index value +% gap - Codon Adaptation Index value +% +% +% See also: COUNTAACHANGE COUNTCDCHANGE + +% Molecular Biology & Evolution Toolbox, (C) 2006 +% Author: James J. Cai +% Email: jamescai@hku.hk +% Website: http://bioinformatics.org/mbetoolbox/ +% Last revision: 3/28/2006 + +if (nargout>1) + [D,gap]=countchange(s1,s2,4); +else + [D]=countchange(s1,s2,4); +end + +% ------------------------------------------------------------------------- +function [D,gap] = countchange(s1,s2,nword) + +if ~(ismember(nword, [4 20 61])), + error('Wrong NWORD') +end +if (length(s1)~=length(s2)), + error('Sequences are not of same length.') +end + +D=zeros(nword); +WORD=1:nword; + +for j=1:nword + s1sites=(s1==WORD(j)); + for i=1:nword + D(i,j)=sum(s1sites & (s2==WORD(i))); + end +end + +if (nargout>1), + gap = length(s1)-sum(D(:)); % gaps +end + diff --git a/matlab/graph/double2labels.m b/matlab/graph/double2labels.m new file mode 100644 index 0000000..eeba8c4 --- /dev/null +++ b/matlab/graph/double2labels.m @@ -0,0 +1,15 @@ +function labels = double2labels(B) +% DOUBLE2LABELS Transforms a double matrix into a cell array of strings. + +[n m] = size(B); +labels = cell(n, m); + +for i = 1:n + for j = 1:m + if B(i, j) == 0 + labels{i, j} = ''; + else + labels{i, j} = sprintf('%.2g', B(i, j)); + end + end +end diff --git a/matlab/graph/giveColors.m b/matlab/graph/giveColors.m new file mode 100644 index 0000000..44b165d --- /dev/null +++ b/matlab/graph/giveColors.m @@ -0,0 +1,14 @@ +function [colors] = giveColors(n) +if n > 36 + error('Maximum number of colors 36'); +end +colors = [1 0 0; 0 1 0; 0 0 1; 1 1 0 ; 1 0 1; 0 1 1; ... + 0.4 0 0; 0 0.4 0; 0 0 0.4; 0.4 0.4 0; 0.4 0 0.4; 0 0.4 0.4; ... + 0.2 0 0; 0 0.2 0; 0 0 0.2; 0.2 0.2 0; 0.2 0 0.2; 0 0.2 0.2; ... + 0.8 0 0; 0 0.8 0; 0 0 0.8; 0.8 0.8 0; 0.8 0 0.8; 0 0.8 0.8; ... + 0.6 0 0; 0 0.6 0; 0 0 0.6; 0.6 0.6 0; 0.6 0 0.6; 0 0.6 0.6; ... + 0.6 0.2 0.4; 0.2 0.4 0.8; 0.8 0.4 0.2; 0 0.6 0.2; 0.2 0.8 0.6; 0.5 0.2 0.1; ... + 0.6 0.3 0.1]; +colors = colors(1:n,:); +% red; green; blue; yellow +% RGB format: [red green blue] diff --git a/matlab/graph/graph2dot.m b/matlab/graph/graph2dot.m new file mode 100644 index 0000000..b7a9551 --- /dev/null +++ b/matlab/graph/graph2dot.m @@ -0,0 +1,139 @@ +function graph2dot(adj, filename, varargin) +% GRAPH2DOT Writes a dot file representing an adjacency matrix. +% +% Syntax: graph2dot(adj, filename, ...) +% +% Required input arguments: +% adj - the adjacency matrix of the graph. +% filename - name for the dot file. +% +% Optional input arguments in name, value pairs [default]: +% 'nodelabels' - nodelabels{i} is a string attached to the node i ['i'] +% 'nodeshapes' - cell array of node shapes ['ellipse'] +% 'nodestyles' - cell array of node styles ['solid'] +% 'arclabels' - arclabels{i,j} is a string attached to the i-j arc [''] +% 'arcstyles' - cell array of arc styles ['solid'] +% 'width' - maximum width in inches [10] +% 'height' - maximum height in inches [10] +% 'leftright' - true for left to right layout, false for top-down [false] +% + +% This funtion is adapted from 'graph_to_dot' in GraphViz Matlab +% interface by Kevin Murphy, available from +% http://www.cs.ubc.ca/~murphyk/Software/GraphViz/graphviz.html +% +% First version written by Kevin Murphy 2002. +% Modified by Leon Peshkin, Jan 2004. +% Bugfix by Tom Minka, Mar 2004. +% Modified by Antti Kerminen, Aug 2005. +% - added handling of arc labels +% - added possibility to use bold, dashed, and red arc styles +% - added support for Octave +% - changed filename from optional to required argument +% - removed all code related to undirected arcs +% - some changes in documentation and code layout +% Modified by Antti Kerminen, Nov 2005. +% - added new input arguments: nodeshapes, nodestyles, arcstyles, +% pos +% - minor code refactoring +% Modified by Antti Kerminen, Mar 2006 +% - removed input arguments: boldarcs, dashedarcs, redarcs, pos + +% Check if we are running Octave +isoctave = exist('OCTAVE_VERSION'); + +nnodes = size(adj, 1); + +% Set default args +nodelabels = cell(nnodes, 1); +for i = 1:nnodes + nodelabels{i} = num2str(i); +end +nodeshapes = cell(nnodes, 1); +nodeshapes(:) = {'ellipse'}; +nodestyles = cell(nnodes, 1); +nodestyles(:) = {'solid'}; +arclabels = []; +arcstyles = cell(nnodes); +arccolors = cell(nnodes); +arccolors(:) = {'black'}; +width = 4*nnodes; +height = 8*nnodes; +leftright = 0; + +% Get optional args +for i = 1:2:(nargin - 2) + switch varargin{i} + case 'nodelabels', nodelabels = varargin{i + 1}; + case 'nodeshapes', nodeshapes = varargin{i + 1}; + case 'nodestyles', nodestyles = varargin{i + 1}; + case 'arclabels', arclabels = varargin{i + 1}; + case 'arcstyles', arcstyles = varargin{i + 1}; + case 'arccolors', arccolors = varargin{i + 1}; + case 'width', width = varargin{i + 1}; + case 'height', height = varargin{i + 1}; + case 'leftright', leftright = varargin{i + 1}; + case 'graphvizpath', graphvizpath = varargin{i+1}; + case 'nodecolors', nodecolors = varargin{i+1}; + end +end + +w = cd; +cd(graphvizpath); + +% Construct a format string for nodes +nodeformat = ' %d [label = "%s", shape = %s, style = %s, color = "%s", fontcolor="white"];\n'; + +% Construct a format string for edges +if isempty(arclabels) + attributes = 'style = %s, color = "%s"'; +else + attributes = 'style = %s, color = "%s", label = "%s"'; +end + +if isoctave % no need/support for brackets + edgeformat = strcat(' %d -> %d [', attributes, '];\n'); +else + edgeformat = strcat([' %d -> %d [', attributes, '];\n']); +end + +% Write the file +fid = fopen(filename, 'w'); + +fprintf(fid, 'digraph G {\n'); +fprintf(fid, ' center = 1;\n'); +fprintf(fid, ' size = \"%d, %d\";\n', width, height); +% fprintf(fid, ' ratio=\"compress\";\n'); +% fprintf(fid, ' raio = fill;\n'); +fprintf(fid, ' node[fontsize=24]'); + +if leftright + fprintf(fid, ' rankdir = LR;\n'); +end + +% Write nodes +for node = 1:nnodes + fprintf(fid, nodeformat, node, nodelabels{node}, nodeshapes{node}, ... + nodestyles{node}, nodecolors{node}); +end + +% Write edges +for node1 = 1:nnodes + arcs = find(adj(node1, :)); + for node2 = arcs + style = arcstyles{node1, node2}; + color = arccolors{node1, node2}; + if isempty(arclabels) + fprintf(fid, edgeformat, node1, node2, style, color); + else + fprintf(fid, edgeformat, node1, node2, style, color, ... + arclabels{node1, node2}); + end + end +end + +fprintf(fid, '}'); +fclose(fid); +fclose('all'); % enfore closing the file anyway + +cd(w); diff --git a/matlab/graph/graphvis2.m b/matlab/graph/graphvis2.m new file mode 100644 index 0000000..fbda391 --- /dev/null +++ b/matlab/graph/graphvis2.m @@ -0,0 +1,108 @@ +function graphvis2(adj_mat, linkage_file_name, inliers, graphviz_path, npops) +% Filename: graphvis2.m +% graphvis2(adj_mat, linkage_file_name) +% +% Description: +% Graph visualization given the adjacency matrix. + +% Author: Jing Tang +% Modified date: 01/06/2005 + +% Input: +% adj_mat: The adjacency matrix. +% linkage_file_name: + +% Output: +% + +% Ensure adj_mat is symmetric and square. + +[n,m] = size(adj_mat); +if n ~= m, error ('Adjacency matrix must be square'), end; +%if ~all(diag(adj_mat)), error('The diagonal must be nonzero'), end; + +npops_in = length(inliers); +groupnames = cell(1,npops_in); +nodecolors = cell(npops_in,1); +allnode_color = giveColors(npops); +arccolors = cell(npops_in); +adjmat = adj_mat ~= 0; +for i=1:npops_in + groupnames{i} = sprintf('Cluster %d',inliers(i)); + nodecolors{i} = num2str(rgb2hsv(allnode_color(inliers(i),:))); + arccolors(i,adjmat(i,:)) = {nodecolors{i}}; +end +nodestyles = cell(npops_in, 1); +nodestyles(:) = {'filled'}; +arcstyles = cell(npops_in); +arcstyles(adjmat) = {'filled'}; + +handle = plotmodel(adj_mat',[1:npops_in],'graphvizpath', graphviz_path, ... + 'nodecolors', nodecolors, ... + 'nodestyles', nodestyles, ... + 'arccolors',arccolors, ... + 'arcstyles',arcstyles, ... + 'varnames', groupnames); + +% set(handle,'menubar','none','numbertitle','off','toolbar','figure'); +set(handle,'numbertitle','off','toolbar','figure'); +m = findall(gcf,'type','uimenu'); +set(m([1:7]),'Visible', 'off'); + +h1 = uimenu('Parent',handle, ... + 'Label','Attributes', ... + 'Tag','attr_menu'); +h2 = uimenu('Parent',h1, ... + 'Label','Rename clusters', ... + 'callback', 'plotflow rename', ... + 'Tag','clustername_menu'); +h3 = uimenu('Parent',h1, ... + 'Label','Prune edges', ... + 'callback','plotflow prune', ... + 'Tag','edge_menu'); +h4 = uimenu('Parent',handle, ... + 'Label','Help', ... + 'callback', 'plotflow help', ... + 'Tag','help_menu'); +% h5 = uimenu('Parent',h1, ... +% 'Callback','baps4cbf about', ... +% 'Enable','on', ... +% 'Label','About', ... +% 'Tag','about_menu'); +set(handle,'Name',[' Gene flow - ' linkage_file_name ]); + +% save the parameters +g.handle = handle; +g.adjmat = adj_mat'; +g.adjmat2 = g.adjmat; +g.k = [1:npops_in]; +g.graphvizpath = graphviz_path; +g.nodecolors = nodecolors; +g.nodestyles = nodestyles; +g.arccolors = arccolors; +g.arcstyles = arcstyles; +g.varnames = groupnames; +g.type = 'GENEFLOW'; +set(h1,'Userdata',g); % store in the attribute menu + + + +% Old version + +% Decide the coordinate of each node. +% [p,p,r] = dmperm(adj_mat); +% nnodes = length(adj_mat); +% nblocks = length(r)-1; +% [B,ix]=sort(p,2); +% Coordinates=[[ix'],zeros(nnodes,1)]; +% +% % Plot the graph structure +% clf reset +% set(gcf, 'color', 'white', 'menubar', 'none', 'numbertitle','off','name', 'Graphical model') +% gplot(adj_mat, Coordinates, '-ob'); +% h=findobj(gca, 'type','line'); +% set(h,'markersize',10) +% xlim([0 nnodes+1]); +% axis off +% text(Coordinates(:,1),Coordinates(:,2)-0.1, int2str(B(:))); +%drawnow diff --git a/matlab/graph/ksdensity_myown.m b/matlab/graph/ksdensity_myown.m new file mode 100644 index 0000000..49cb3ac --- /dev/null +++ b/matlab/graph/ksdensity_myown.m @@ -0,0 +1,623 @@ +function [fout0,xout,u]=ksdensity(yData,varargin) +%KSDENSITY Compute kernel density or distribution estimate +% [F,XI]=KSDENSITY(X) computes a probability density estimate of the sample +% in the vector X. KSDENSITY evaluates the density estimate at 100 points +% covering the range of the data. F is the vector of density values and XI +% is the set of 100 points. The estimate is based on a normal kernel +% function, using a window parameter (bandwidth) that is a function of the +% number of points in X. +% +% F=KSDENSITY(X,XI) specifies the vector XI of values where the density +% estimate is to be evaluated. +% +% [F,XI,U]=KSDENSITY(...) also returns the bandwidth of the kernel smoothing +% window. +% +% KSDENSITY(...) without output arguments produces a plot of the results. +% +% KSDENSITY(AX,...) plots into axes AX instead of GCA. +% +% [...]=KSDENSITY(...,'PARAM1',val1,'PARAM2',val2,...) specifies parameter +% name/value pairs to control the density estimation. Valid parameters +% are the following: +% +% Parameter Value +% 'censoring' A logical vector of the same length of X, indicating which +% entries are censoring times (default is no censoring). +% 'kernel' The type of kernel smoother to use, chosen from among +% 'normal' (default), 'box', 'triangle', and +% 'epanechnikov'. +% 'npoints' The number of equally-spaced points in XI. +% 'support' Either 'unbounded' (default) if the density can extend +% over the whole real line, or 'positive' to restrict it to +% positive values, or a two-element vector giving finite +% lower and upper limits for the support of the density. +% 'weights' Vector of the same length as X, giving the weight to +% assign to each X value (default is equal weights). +% 'width' The bandwidth of the kernel smoothing window. The default +% is optimal for estimating normal densities, but you +% may want to choose a smaller value to reveal features +% such as multiple modes. +% 'function' The function type to estimate, chosen from among 'pdf', +% 'cdf', 'icdf', 'survivor', or 'cumhazard' for the density, +% cumulative probability, inverse cumulative probability, +% survivor, or cumulative hazard functions, respectively. +% +% In place of the kernel functions listed above, you can specify another +% kernel function by using @ (such as @normpdf) or quotes (such as 'normpdf'). +% The function must take a single argument that is an array of distances +% between data values and places where the density is evaluated, and +% return an array of the same size containing corresponding values of +% the kernel function. When the 'function' parameter value is 'pdf', +% this kernel function should return density values, otherwise it should +% return cumulative probability values. Specifying a custom kernel when the +% 'function' parameter value is 'icdf' is an error. +% +% If the 'support' parameter is 'positive', KSDENSITY transforms X using +% a log function, estimates the density of the transformed values, and +% transforms back to the original scale. If 'support' is a vector [L U], +% KSDENSITY uses the transformation log((X-L)/(U-X)). The 'width' parameter +% and U outputs are on the scale of the transformed values. +% +% Example: +% x = [randn(30,1); 5+randn(30,1)]; +% [f,xi] = ksdensity(x); +% plot(xi,f); +% This example generates a mixture of two normal distributions, and +% plots the estimated density. +% +% See also HIST, @. + +% If there is any censoring, we would like to estimate the density up +% to the last non-censored observation. Say this is XMAX. Without +% censoring, the density estimate near XMAX would consist of contributions +% from kernels centered above and below XMAX. We can't compute the +% contributions above XMAX, though, because we have no data. Using only +% the kernels centered below XMAX makes the density estimate biased. +% +% In an attempt to reduce bias, we will compute the contributions +% from kernels centered below XMAX, and fold their values around XMAX. +% The result should be good if the density is nearly flat in this area. +% If the density is increasing then the estimate will still be biased +% downward, and if the density is decreasing it will still be biased +% upward, but the bias will be reduced. + +% Reference: +% A.W. Bowman and A. Azzalini (1997), "Applied Smoothing +% Techniques for Data Analysis," Oxford University Press. + +% Copyright 1993-2006 The MathWorks, Inc. +% $Revision: 1.9.6.8.2.1 $ $Date: 2006/07/13 16:53:59 $ + +if (nargin > 0) && isscalar(yData) && ishandle(yData) ... + && isequal(get(yData,'type'),'axes') + axarg = {yData}; + if nargin>1 + yData = varargin{1}; + varargin(1) = []; + else + yData = []; % error to be dealt with below + end +else + axarg = {}; +end + +% Get y vector and its dimensions +if ~isvector(yData) + error('stats:ksdensity:VectorRequired','X must be a vector.'); +end +yData = yData(:); +yData(isnan(yData)) = []; +n = length(yData); +ymin = min(yData); +ymax = max(yData); + +% Maybe xi was specified, or maybe not +xispecified = false; +if ~isempty(varargin) + if ~ischar(varargin{1}) + xi = varargin{1}; + varargin(1) = []; + xispecified = true; + end +end + +% Process additional name/value pair arguments +okargs={'width' 'npoints' 'kernel' 'support' ... + 'weights' 'censoring' 'cutoff' 'function' }; +defaults = {[] [] 'normal' 'unbounded' ... + 1/n false(n,1) [] 'pdf'}; +[eid,emsg,u,m,kernelname,support,weight,cens,cutoff,ftype] = ... + statgetargs(okargs, defaults, varargin{:}); +if ~isempty(eid) + error(sprintf('stats:ksdensity:%s',eid),emsg); +end + +if isnumeric(support) + if numel(support)~=2 + error('stats:ksdensity:BadSupport',... + 'Value of ''support'' parameter must have two elements.'); + end + if support(1)>=ymin || support(2)<=ymax + error('stats:ksdensity:BadSupport',... + 'Data values must be between lower and upper ''support'' values.'); + end + L = support(1); + U = support(2); +elseif ischar(support) && length(support)>0 + okvals = {'unbounded' 'positive'}; + rownum = strmatch(support,okvals); + if isempty(rownum) + error('stats:ksdensity:BadSupport',... + 'Invalid value of ''support'' parameter.') + end + support = okvals{rownum}; + if isequal(support,'unbounded') + L = -Inf; + U = Inf; + else + L = 0; + U = Inf; + end + if isequal(support,'positive') && ymin<=0 + error('stats:ksdensity:BadSupport',... + 'Cannot set support to ''positive'' with non-positive data.') + end +else + error('stats:ksdensity:BadSupport',... + 'Invalid value of ''support'' parameter.') +end +if isempty(weight) + weight = ones(1,n); +elseif numel(weight)==1 + weight = repmat(weight,1,n); +elseif numel(weight)~=n || numel(weight)>length(weight) + error('stats:ksdensity:InputSizeMismatch',... + 'Value of ''weight'' must be a vector of the same length as X.'); +else + weight = weight(:)'; +end +weight = weight / sum(weight); +if isempty(cens) + cens = false(1,n); +elseif ~all(ismember(cens(:),0:1)) + error('stats:ksdensity:BadCensoring',... + 'Value of ''censoring'' must be a logical vector.'); +elseif numel(cens)~=n || numel(cens)>length(cens) + error('stats:ksdensity:InputSizeMismatch',... + 'Value of ''censoring'' must be a vector of the same length as X.'); +end + +% Kernel can be the name of a function local to here, or an external function +% Kernel numbers are used below: +% 1 2 3 4 5 +kernelnames = {'normal' 'epanechinikov' 'epanechnikov' 'box' 'triangle'}; +kernelhndls = {@normal @epanechnikov @epanechnikov @box @triangle}; +cdfhndls = {@cdf_nl @cdf_ep @cdf_ep @cdf_bx @cdf_tr}; +kernelcuts = [4 sqrt(5) sqrt(5) sqrt(3) sqrt(6)]; +kernelcutoff = Inf; + +% Check function type +okvals = {'pdf' 'cdf' 'survivor' 'cumhazard' 'icdf'}; +if ischar(ftype) + ftype = lower(ftype); +end +rownum = strmatch(ftype,okvals); +if isempty(rownum) + error('stats:ksdensity:BadFunction',... + 'Invalid value of ''function'' parameter.') +elseif length(rownum)>1 + error('stats:ksdensity:BadFunction',... + 'Ambiguous value of ''function'' parameter.') +end +ftype = okvals{rownum}; + +% Set a flag indicating we are to compute the cdf; later on +% we may transform to another function that is a transformation +% of the cdf +iscdf = isequal(ftype,'cdf') | isequal(ftype,'survivor') ... + | isequal(ftype,'cumhazard'); +kernel = kernelname; +if isempty(kernelname) + if iscdf + kernel = cdfhndls{1}; + else + kernel = kernelhndls{1}; + end + kernelname = kernelnames{1}; + kernelcutoff = kernelcuts(1); +elseif ~(isa(kernelname,'function_handle') || isa(kernelname,'inline')) + if ~ischar(kernelname) + error('stats:ksdensity:BadKernel',... + 'Smoothing kernel must be a function.'); + end + + % If this is an abbreviation of our own methods, expand the name now. + % If the string matches the start of both variants of the Epanechnikov + % spelling, that is not an error so pretend it matches just one. + knum = strmatch(lower(kernelname), kernelnames); + if all(ismember(2:3,knum)) % kernel number used here + knum(knum==3) = []; + end + if (length(knum) == 1) + if iscdf + kernel = cdfhndls{knum}; + else + kernel = kernelhndls{knum}; + end + kernelcutoff = kernelcuts(knum); + elseif isequal(ftype,'icdf') + error('stats:ksdensity:IcdfNotAllowed',... + 'Cannot compute inverse cdf for a custom kernel.'); + end +elseif isequal(ftype,'icdf') + error('stats:ksdensity:IcdfNotAllowed',... + 'Cannot compute inverse cdf for a custom kernel.'); +end +if ~isempty(cutoff) + kernelcutoff = cutoff; +end + +if isequal(ftype,'icdf') + % Inverse cdf is special, so deal with it here + if xispecified + p = xi; + else + p = (1:99)/100; + end + + % To start, evaluate the cdf at a range of values + sy = sort(yData); + xi = linspace(sy(1), sy(end), 100); + [yi,xi,u] = ksdensity(yData,xi, 'censoring',cens, 'kernel',kernelname, ... + 'support',support, 'weights',weight,... + 'width',u, 'function','cdf'); + + % If there's a gap of about 0 density, we have to adjust things + gap = find(diff(sy) > 4*u); + if ~isempty(gap) + sy = sy(:)'; + xi = sort([xi, sy(gap)+2*u, sy(gap+1)-2*u]); + [yi,xi,u] = ksdensity(yData,xi, 'censoring',cens, 'kernel',kernelname, ... + 'support',support, 'weights',weight,... + 'width',u, 'function','cdf'); + end + + % Perturb duplicates a small amount to allow interpolation + t = 1 + find(diff(yi)==0); + bigger = 1+eps(class(yi)); + for j=1:length(t) + yi(t(j)) = bigger*yi(t(j)-1); + end + + % Get some starting values + x1 = interp1(yi,xi,p); % interpolate for p in a good range + x1(isnan(x1) & p0 + x1(isnan(x1) & p>max(yi)) = max(xi); % use highest x if p too high, but <1 + x1(p<=0) = L; % use lower bound if p<=0 + x1(p>=1) = U; % and upper bound if p>=1 + notdone = find(p>0 & p<1); % refine the ones with 0 1e-6*abs(x0) & ... + abs(p(notdone)-f0) > 1e-8); + end + + % Plot or return these values + if nargout==0 + plot(axarg{:},x1,p); + else + fout0 = x1; + xout = p; + end + return +end + +% Compute transformed values of data +if isequal(support,'unbounded') + ty = yData; +elseif isequal(support,'positive') + ty = log(yData); +else + ty = log(yData-L) - log(U-yData); % same as log((y-L)./(U-y)) +end + +% Deal with censoring +iscensored = any(cens); +if iscensored + % Compute empirical cdf and create an equivalent weighted sample + [F,XF] = ecdf(ty, 'censoring',cens, 'frequency',weight); + weight = diff(F(:)'); + ty = XF(2:end); + n = length(ty); + N = sum(~cens); + issubdist = (F(end)<1); % sub-distribution, integrates to less than 1 + ymax = max(yData(~cens)); +else + N = n; + issubdist = false; +end + +% Get bandwidth if not already specified +if (isempty(u)), + if ~iscensored + % Get a robust estimate of sigma + med = median(ty); + sig = median(abs(ty-med)) / 0.6745; + else + % Estimate sigma using quantiles from the empirical cdf + Xquant = interp1(F,XF,[.25 .5 .75]); + if ~any(isnan(Xquant)) + % Use interquartile range to estimate sigma + sig = (Xquant(3) - Xquant(1)) / (2*0.6745); + elseif ~isnan(Xquant(2)) + % Use lower half only, if upper half is not available + sig = (Xquant(2) - Xquant(1)) / 0.6745; + else + % Can't easily estimate sigma, just get some indication of spread + sig = ty(end) - ty(1); + end + end + if sig<=0, sig = max(ty)-min(ty); end + if sig>0 + % Default window parameter is optimal for normal distribution + u = sig * (4/(3*N))^(1/5); + else + u = 1; + end +end + +% Get XI values at which to evaluate the density +foldwidth = min(kernelcutoff,3); +if ~xispecified + % Compute untransformed values of lower and upper evaluation points + ximin = min(ty) - foldwidth*u; + if issubdist + ximax = max(ty); + else + ximax = max(ty) + foldwidth*u; + end + + if isequal(support,'positive') + ximin = exp(ximin); + ximax = exp(ximax); + elseif ~isequal(support,'unbounded') + ximin = (U*exp(ximin)+L) / (exp(ximin)+1); + ximax = (U*exp(ximax)+L) / (exp(ximax)+1); + end + + if isempty(m) + m=100; + end + + xi = linspace(ximin, ximax, m); + +elseif (numel(xi) > length(xi)) + error('stats:ksdensity:VectorRequired','XI must be a vector'); +end + +% Compute transformed values of evaluation points that are in bounds +xisize = size(xi); +fout = zeros(xisize); +if iscdf && isfinite(U) + fout(xi>=U) = 1; +end +xout = xi; +xi = xi(:); +if isequal(support,'unbounded') + inbounds = true(size(xi)); + txi = xi; + foldpoint = ymax; +elseif isequal(support,'positive') + inbounds = (xi>0); + xi = xi(inbounds); + txi = log(xi); + foldpoint = log(ymax); +else + inbounds = (xi>L) & (xi= foldpoint - foldwidth*u); + nkeep = length(txi); + nfold = sum(needfold); + txifold = (2*foldpoint) - txi(needfold); + txi(end+1:end+nfold) = txifold; + m = length(txi); +else + nkeep = length(txi); + nfold = 0; +end + +% Now compute density estimate at selected points +blocksize = 3e4; +if n*m<=blocksize && ~iscdf + % For small problems, compute kernel density estimate in one operation + z = (repmat(txi',n,1)-repmat(ty,1,m))/u; + f = weight * feval(kernel, z); +else + % For large problems, try more selective looping + + % First sort y and carry along weights + [ty,idx] = sort(ty); + weight = weight(idx); + + % Loop over evaluation points + f = zeros(1,m); + + if isinf(kernelcutoff) + for k=1:m + % Sum contributions from all + z = (txi(k)-ty)/u; + f(k) = weight * feval(kernel,z); + end + else + % Sort evaluation points and remember their indices + [stxi,idx] = sort(txi); + + jstart = 1; % lowest nearby point + jend = 1; % highest nearby point + for k=1:m + % Find nearby data points for current evaluation point + halfwidth = kernelcutoff*u; + lo = stxi(k) - halfwidth; + while(ty(jstart)0 + % Fold back over the censoring point to give a crisp upper limit + ffold = f(nkeep+1:end); + f = f(1:nkeep); + f(needfold) = f(needfold) + ffold; + txi = txi(1:nkeep); + f(txi>foldpoint) = 0; + + % Include a vertical line at the end + if ~xispecified + xi(end+1) = xi(end); + f(end+1) = 0; + inbounds(end+1) = true; + end +end + + +if iscdf + % Guard against roundoff. Lower boundary of 0 should be no problem. + f = min(1,f); +else + % Apply reverse transformation and create return value of proper size + f = f(:) ./ u; + if isequal(support,'positive') + f = f ./ xi; + elseif isnumeric(support) + f = f * (U-L) ./ ((xi-L) .* (U-xi)); + end +end +fout(inbounds) = f; +xout(inbounds) = xi; + + +% If another function based on the cdf, compute it now +if isequal(ftype,'survivor') + fout = 1-fout; +elseif isequal(ftype,'cumhazard') + fout = 1-fout; + t = (fout>0); + fout(~t) = NaN; + fout(t) = -log(fout(t)); +end + +% Plot the results if they are not requested as return values +if nargout==0 + plot(axarg{:},xout,fout) +else + fout0 = fout; +end + +% ----------------------------- +% The following are functions that define smoothing kernels k(z). +% Each function takes a single input Z and returns the value of +% the smoothing kernel. These sample kernels are designed to +% produce outputs that are somewhat comparable (differences due +% to shape rather than scale), so they are all probability +% density functions with unit variance. +% +% The density estimate has the form +% f(x;k,h) = mean over i=1:n of k((x-y(i))/h) / h + +function f = normal(z) +%NORMAL Normal density kernel. +%f = normpdf(z); +f = exp(-0.5 * z .^2) ./ sqrt(2*pi); + +function f = epanechnikov(z) +%EPANECHNIKOV Epanechnikov's asymptotically optimal kernel. +a = sqrt(5); +z = max(-a, min(z,a)); +f = max(0,.75 * (1 - .2*z.^2) / a); + +function f = box(z) +%BOX Box-shaped kernel +a = sqrt(3); +f = (abs(z)<=a) ./ (2 * a); + +function f = triangle(z) +%TRIANGLE Triangular kernel. +a = sqrt(6); +z = abs(z); +f = (z<=a) .* (1 - z/a) / a; + +% ----------------------------- +% The following are functions that define cdfs for smoothing kernels. + +function f = cdf_nl(z) +%CDF_NL Normal kernel, cdf version +f = normcdf(z); + +function f = cdf_ep(z) +%CDF_EP Epanechnikov's asymptotically optimal kernel, cdf version +a = sqrt(5); +z = max(-a, min(z,a)); +f = ((z+a) - (z.^3+a.^3)/15) * 3 / (4*a); + +function f = cdf_bx(z) +%CDF_BX Box-shaped kernel, cdf version +a = sqrt(3); +f = max(0, min(1,(z+a)/(2*a))); + +function f = cdf_tr(z) +%CDF_TR Triangular kernel, cdf version +a = sqrt(6); +denom = 12; % 2*a^2 +f = zeros(size(z)); % -Inf < z < -a +t = (z>-a & z<0); +f(t) = (a + z(t)).^2 / denom; % -a < z < 0 +t = (z>=0 & za); +f(t) = 1; % a < z < Inf diff --git a/matlab/graph/linkage.m b/matlab/graph/linkage.m new file mode 100644 index 0000000..89e9a10 --- /dev/null +++ b/matlab/graph/linkage.m @@ -0,0 +1,117 @@ +function Z = linkage(Y, method) +%LINKAGE Create hierarchical cluster tree. +% Z = LINKAGE(Y) creates a hierarchical cluster tree, using the single +% linkage algorithm. The input Y is a distance matrix such as is +% generated by PDIST. Y may also be a more general dissimilarity +% matrix conforming to the output format of PDIST. +% +% Z = LINKAGE(Y, method) creates a hierarchical cluster tree using +% the specified algorithm. The available methods are: +% +% 'single' --- nearest distance +% 'complete' --- furthest distance +% 'average' --- average distance +% 'centroid' --- center of mass distance (the output Z is meaningful +% only if Y contains Euclidean distances) +% 'ward' --- inner squared distance +% +% Cluster information will be returned in the matrix Z with size m-1 +% by 3, where m is the number of observations in the original data. +% Column 1 and 2 of Z contain cluster indices linked in pairs +% to form a binary tree. The leaf nodes are numbered from 1 to +% m. They are the singleton clusters from which all higher clusters +% are built. Each newly-formed cluster, corresponding to Z(i,:), is +% assigned the index m+i, where m is the total number of initial +% leaves. Z(i,1:2) contains the indices of the two component +% clusters which form cluster m+i. There are n-1 higher clusters +% which correspond to the interior nodes of the output clustering +% tree. Z(i,3) contains the corresponding linkage distances between +% the two clusters which are merged in Z(i,:), e.g. if there are +% total of 30 initial nodes, and at step 12, cluster 5 and cluster 7 +% are combined and their distance at this time is 1.5, then row 12 +% of Z will be (5,7,1.5). The newly formed cluster will have an +% index 12+30=42. If cluster 42 shows up in a latter row, that means +% this newly formed cluster is being combined again into some bigger +% cluster. +% +% The centroid method can produce a cluster tree that is not monotonic. +% This occurs when the distance from the union of two clusters to a third +% cluster is less than the distance from either individual cluster to +% that third cluster. In such a case, sections of the dendrogram change +% direction. This is an indication that another method should be used. +% +% See also PDIST, INCONSISTENT, COPHENET, DENDROGRAM, CLUSTER, CLUSTERDATA, +% KMEANS, SILHOUETTE. + +% Copyright 1993-2002 The MathWorks, Inc. +% $Revision: 1.16 $ + +[k, n] = size(Y); + + +m = (1+sqrt(1+8*n))/2; +if k ~= 1 | m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end + +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end + +method = lower(method(1:2)); % simplify the switch string. + +% a flag for non-monotonic distances in tree. this can only happen with +% the centroid method +monotonic = 1; + +Z = zeros(m-1,3); % allocate the output matrix. + +% during updating clusters, cluster index is constantly changing, R is +% a index vector mapping the original index to the current (row, column) +% index in Y. N denotes how many points are contained in each cluster. + +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; + +for s = 1:(n-1) + X = Y; + + [v, k] = min(X); + + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + + % Update Y. In order to vectorize the computation, we need to compute all + % the indices corresponding to cluster i and j in Y, denoted by I and J. + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end \ No newline at end of file diff --git a/matlab/graph/mutationPlot.m b/matlab/graph/mutationPlot.m new file mode 100644 index 0000000..2a9c15b --- /dev/null +++ b/matlab/graph/mutationPlot.m @@ -0,0 +1,330 @@ +function mutationPlot(tietue) + +global COUNTS; global PARTITION; global SUMCOUNTS; +clearGlobalVars; + +h0 = findobj('Tag','load_menu'); +c = get(h0,'UserData'); +h0 = findobj('Tag','filename1_text'); +filename1 = get(h0,'String'); +disp('---------------------------------------------------'); +disp('Viewing the mutation plot.'); +disp(['Load the mixture result from: ',[filename1],'...']); + +if (~isstruct(tietue)) +% [filename, pathname] = uigetfile('*.mat', 'Load mixture result file'); +% if (filename==0 & pathname==0) return; +% else +% %display('---------------------------------------------------'); +% %display(['Reading mixture result from: ',[pathname filename],'...']); +% end +% pause(0.0001); +% h0 = findobj('Tag','filename1_text'); +% set(h0,'String',filename); clear h0; +% +% struct_array = load([pathname filename]); +% if isfield(struct_array,'c') %Matlab versio +% c = struct_array.c; +% if ~isfield(c,'PARTITION') | ~isfield(c,'rowsFromInd') +% disp('Incorrect file format'); +% return +% end +% else +% disp('Incorrect file format'); +% return; +% end + PARTITION = c.PARTITION; COUNTS = c.COUNTS; SUMCOUNTS = c.SUMCOUNTS; + alleleCodes = c.alleleCodes; adjprior = c.adjprior; popnames = c.popnames; + rowsFromInd = c.rowsFromInd; data = c.data; npops = c.npops; noalle = c.noalle; +else + PARTITION = tietue.PARTITION; + COUNTS = tietue.COUNTS; + SUMCOUNTS = tietue.SUMCOUNTS; + alleleCodes = tietue.alleleCodes; + adjprior = tietue.adjprior; + popnames = tietue.popnames; + rowsFromInd = tietue.rowsFromInd; + data = double(tietue.data); + npops = tietue.npops; + noalle = tietue.noalle; +end + +nloci = size(COUNTS,2); +ninds = size(data,1)/rowsFromInd; + +if isfield(c,'CQ_COUNTS') + % Linked data + if isfield(c,'gene_lengths') + gene_lengths = c.gene_lengths; + else + [filename, pathname] = uigetfile('*.txt', 'Load file with gene lengths.'); + if (filename==0 & pathname==0) + return + end + gene_lengths = load([pathname filename]); + end + + component_mat = zeros(length(gene_lengths), max(gene_lengths)); + cum_length = cumsum(gene_lengths); + component_mat(1,1:gene_lengths(1))=1:gene_lengths(1); + for i = 2:length(gene_lengths) + component_mat(i,1:gene_lengths(i)) = cum_length(i-1)+1:cum_length(i); + end +else + component_mat = 1:nloci; + gene_lengths = nloci; +end + +if isfield(c,'CQ_COUNTS') + answers = inputdlg({'Index of the individual of interest',... + 'BF limit (log)',... + 'Genes to analyze'},... + 'INPUT',[1; 1; 1],... + {' ','2.3',['1:' num2str(length(gene_lengths))]}); + ind = str2num(answers{1}); + BF = str2num(answers{2}); + genes = str2num(answers{3}); + %ind = input('Input individual: '); + %BF = input('Input BF: '); + %genes = input('Input genes to analyze: '); + all_right = check_inputs(ind,BF,genes, ninds, gene_lengths); + if all_right==0 + return + end + n_genes = length(genes); + nameText = ['ind: ' num2str(ind) ', genes: ' num2str(genes)]; +else + answers = inputdlg({'Index of the individual of interest',... + 'BF limit (log)'},'INPUT',[1; 1],{' ','2.3'}); + ind = str2num(answers{1}); + BF = str2num(answers{2}); + %ind = input('Input individual: '); + %BF = input('Input BF: '); + all_right = check_inputs(ind,BF,1, ninds, gene_lengths); + if all_right==0 + return + end + genes = 1; + n_genes = 1; + nameText = ['ind: ' num2str(ind)]; +end + +origin = PARTITION(ind); +varit = giveColors(npops); + +figure('NumberTitle','off','Name',nameText); + +mutationList = repmat(... + struct('gene',[],'site',[],... + 'row',[],'pops',[],'bf_vals',[]), [10 1]); +n_mutations = 0; + +for i = 1:n_genes + % Loop over different genes + gene = genes(i); + + for j = 1:rowsFromInd + % Loop over different "haplotypes" + left = 0.05; + bottom = 1 - (1/n_genes)*i + (1/n_genes)/(rowsFromInd*2+1)*(2*j-1); + width = 0.9; + height = 1/n_genes/(rowsFromInd*2+1); + axes('Position',[left bottom width height]); + set(gca, 'Xlim', [-.5 , gene_lengths(gene)+.5], 'YLim', [0,1], ... + 'XTick', [], 'XTickLabel', [], 'YTick', [], 'YTickLabel', []); + + for k=1:gene_lengths(gene) + % Loop over different sites + site = component_mat(gene,k); + allele = data((ind-1)*rowsFromInd+j, site); + %disp(num2str((ind-1)*rowsFromInd+j)); + counts = squeeze(COUNTS(:,site,:)); + + logml = zeros(npops,1); + if allele>0 + for m=1:npops + % Calculate the predictive probabilities of different + % origins + counts(allele,origin) = counts(allele,origin)-1; + counts(allele,m) = counts(allele,m)+1; + logml(m) = calculateLogml(counts, noalle(site)); + counts(allele,m) = counts(allele,m)-1; + counts(allele,origin) = counts(allele,origin)+1; + end + end + + logml = logml-max(logml); + probs = exp(logml) ./ sum(exp(logml)); + + cumprobs = cumsum(probs); + + if log(max(probs)/probs(origin)) >= BF + h0=patch([k-1 k k k-1], [0 0 cumprobs(1) cumprobs(1)], varit(1,:)); + set(h0,'EdgeColor','none'); + for m=2:npops + h0=patch([k-1 k k k-1], [cumprobs(m-1) cumprobs(m-1) cumprobs(m) cumprobs(m)], varit(m,:)); + set(h0,'EdgeColor','none'); + end + n_mutations = n_mutations+1; + if n_mutations>length(mutationList) + mutationList = [mutationList; repmat(... + struct('gene',[],'site',[],... + 'row',[],'pops',[],'bf_vals',[]), [length(mutationList) 1])]; + end + mutationList(n_mutations).gene = gene; + mutationList(n_mutations).site = k; % The location in gene i! + mutationList(n_mutations).row = j; + aux = log(probs ./ probs(origin)); + mutationList(n_mutations).pops = find(aux>=BF); + mutationList(n_mutations).bf_vals = aux(find(aux>=BF)); + end + end + end +end +mlst_data = isfield(c,'CQ_COUNTS'); +writeResults(mutationList, n_mutations, ind, origin, mlst_data); + + +%-------------------------------------------------------------------- + + +function all_right = check_inputs(ind,BF,genes, ninds, gene_lengths) + +all_right = 0; +if length(ind)~=1 + disp('ERROR: index of one individual must be given.'); + return; +end +if ind<=0 | ind >ninds + disp('ERROR: Index of the given individual is out of range.'); + return +end +if length(BF)~=1 + disp('ERROR: one BF value must be given.'); + return +end +if BF<0 + disp('ERROR: BF must be positive.'); + return +end +if length(genes)<1 | length(genes)>length(gene_lengths) + disp('ERROR: input for the genes was incorrect.'); + return +end +if any(genes<1) | any(genes>length(gene_lengths)) + disp('ERROR: input for the genes was incorrect.'); + return +end +all_right = 1; + +%------------------------------------------------------------------ + +function writeResults(mutationList, n_mutations, ind, home, mlst_data) + +h0 = findobj('Tag','filename2_text'); +outf = get(h0,'String'); clear h0; + +if length(outf)>0 + fid = fopen(outf,'a'); +else + fid = -1; +end + +disp(' '); +disp('--------------------------'); +disp(['Individual: ' num2str(ind) '.']); +disp(['Origin of the individual ' num2str(home) '.']); + +if fid ~= -1 + fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['--------------------------------------------']); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['Individual: ' num2str(ind) '.']); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['Origin of the individual ' num2str(home) '.']); fprintf(fid, '\n'); +end + + +if n_mutations==0 + disp('No putative locations for mutations detected'); + if fid ~= -1 + fprintf(fid,'%s \n', 'No putative locations for mutations detected'); fprintf(fid, '\n'); + end + return +end + +disp(' '); +if fid ~= -1 + fprintf(fid,'%s \n', [' ']); fprintf(fid, '\n'); +end + +if mlst_data==1 + disp('Putative locations for mutations:'); + disp('gene, site, possible origins'); + if fid ~= -1 + fprintf(fid,'%s \n', 'Putative locations for mutations:'); fprintf(fid, '\n'); + fprintf(fid,'%s \n', 'gene, site, possible origins'); fprintf(fid, '\n'); + end + for i=1:n_mutations + gene = mutationList(i).gene; + site = mutationList(i).site; + text_line = [num2str(gene) blanks(6-floor(log10(gene))) num2str(site) blanks(7-floor(log10(site)))]; + pops = mutationList(i).pops; + bf_vals = mutationList(i).bf_vals; + for j=1:length(pops) + text_line = [text_line num2str(pops(j)) '(' num2str(bf_vals(j)) ') ']; + end + disp(text_line); + if fid ~= -1 + fprintf(fid,'%s \n', [text_line]); fprintf(fid, '\n'); + end + end +else + disp('Putative locations for mutations:'); + disp('locus, haplotype, possible origins'); + if fid ~= -1 + fprintf(fid,'%s \n', 'Putative locations for mutations:'); fprintf(fid, '\n'); + fprintf(fid,'%s \n', 'locus, haplotype, possible origins'); fprintf(fid, '\n'); + end + for i=1:n_mutations + locus = mutationList(i).site; + row = mutationList(i).row; + text_line = [num2str(locus) blanks(7-floor(log10(locus))) num2str(row) blanks(10-floor(log10(row)))]; + pops = mutationList(i).pops; + bf_vals = mutationList(i).bf_vals; + for j=1:length(pops) + text_line = [text_line num2str(pops(j)) '(' num2str(bf_vals(j)) ') ']; + end + disp(text_line); + if fid ~= -1 + fprintf(fid,'%s \n', [text_line]); fprintf(fid, '\n'); + end + end +end + +if fid ~= -1 + fclose(fid); +end + +%------------------------------------------------------------------ + +function val = calculateLogml(counts, noalle) +% counts corresponds to counts of a SINGLE locus. It is two-dimensional +% with as many columns as there are populations. + +npops = size(counts,2); + +prior = ones(noalle,npops)./noalle; +counts = counts(1:noalle,:); + +val = sum(gammaln(sum(prior,1)),2) ... + - sum(gammaln(sum(counts+prior,1)),2) ... + + sum(sum(gammaln(counts+prior))) ... + - sum(sum(gammaln(prior))); + + +%--------------------------------------------------------------- + +function clearGlobalVars + +global COUNTS; COUNTS = []; +global SUMCOUNTS; SUMCOUNTS = []; +global PARTITION; PARTITION = []; \ No newline at end of file diff --git a/matlab/graph/phyTreeRead.m b/matlab/graph/phyTreeRead.m new file mode 100644 index 0000000..228e9ba --- /dev/null +++ b/matlab/graph/phyTreeRead.m @@ -0,0 +1,197 @@ +function tr = phyTreeRead(filename) +%PHYTREEREAD reads a NEWICK tree formatted file. +% +% TREE = PHYTREEREAD(FILENAME) reads a NEWICK tree formatted file +% FILENAME, returning the data in the file as a PHYTREE object. FILENAME +% can also be a URL or MATLAB character array that contains the text of a +% NEWICK format file. +% +% The NEWICK tree format is found at: +% http://evolution.genetics.washington.edu/phylip/newicktree.html +% +% Note: This implementation only allows binary trees, non-binary trees +% will be translated into a binary tree with extra branches of length 0. +% +% Example: +% +% tr = phytreeread('pf00002.tree') +% +% See also GETHMMTREE, PHYTREE, PHYTREETOOL, PHYTREEWRITE. + +% Copyright 2003-2004 The MathWorks, Inc. +% $Revision: 1.1.6.10 $ $Author: batserve $ $Date: 2004/04/14 23:57:16 $ + + +if nargin==0 + [filename, pathname] = uigetfile({'*.tree';'*.dnd'},'Select Phylogenetic Tree File'); + if ~filename + disp('Canceled, file not read.'); + tr=[]; + return; + end + filename = [pathname, filename]; +end + +% check input is char +% in a future version we may accept also cells +if ~ischar(filename) + error('Bioinfo:InvalidInput','Input must be a character array') +end + +if size(filename,1)>1 % is padded string ? + strin = cellstr(filename); + strin = [strin{:}]; +elseif (strfind(filename(1:min(10,end)), '://')) % is an url ? + if (~usejava('jvm')) + error('Bioinfo:NoJava','Reading from a URL requires Java.') + end + try + strin = urlread(filename); + catch + error('Bioinfo:CannotReadURL','Cannot read URL "%s".', filename); + end + strin = strread(strin,'%c','delimiter','\n')'; +elseif (exist(filename,'file') || ... + exist(fullfile(cd,filename),'file') ) % is a valid filename ? + strin = textread(filename,'%c','delimiter','\n')'; +else % must be single a string with '\n' + strin = strread(filename,'%c','delimiter','\n')'; +end + +% characterizing the string +numBranches = sum(strin==','); +numLeaves = numBranches + 1; +numLabels = numBranches + numLeaves; + +if (numBranches == 0) + error('Bioinfo:NoCommaInInputString', ... + ['There is not any comma in the data,\ninput string may not '... + 'be in Newick style or is not a valid filename.']) +end + +% find the string features: open and close parentheses and leaves +leafPositions = regexp(strin,'[(,][^(,)]')+1; +parenthesisPositions = regexp(strin,'[()]'); +strFeatures = strin(sort([leafPositions parenthesisPositions])); + +% some consistency checking on the parenthesis +temp = cumsum((strFeatures=='(') - (strFeatures==')')); +if any(temp(1:end-1)<1) || (temp(end)~=0) + error('Bioinfo:InconsistentParentheses','The parentheses structure is inconsistent,\ninput string may not be in Newick style or is not a valid filename.') +end + +dist = zeros(numLabels,1); % allocating space for distances +tree = zeros(numBranches,2); % allocating space for tree pointers +names = cell(numLabels,1); % allocating space for tree labels + +try + +% extract label information for the leaves +leafData = regexp(strin,'[(,][^(,);\[\]]+','match'); +for j=1:numel(leafData) + coi = find(leafData{j}==':',1,'last'); + if isempty(coi) % if no colon no length, the whole label is the name + dist(j) = 0; + names{j} = leafData{j}(2:end); + else % if there is colon, get name and length + dist(j) = strread(leafData{j}(coi+1:end),'%f'); + names{j} = leafData{j}(2:coi-1); + end +end +% uniformizing empty cells, value inside the brackets can never be empty +% because branch names will always be empty +[names{cellfun('isempty',names)}] = deal(''); + +% extract label information for the parenthesis +parenthesisData = regexp(strin,')[^(,);\[\]]*','match'); +parenthesisDist = zeros(numel(parenthesisData),1); +for j=1:numel(parenthesisData) + coi = find(parenthesisData{j}==':',1,'last'); + if isempty(coi) % if no colon no length, the whole label is the name + parenthesisDist(j) = 0; + parenthesisData{j} = parenthesisData{j}(2:end); + else % if there is colon, get name and length + parenthesisDist(j) = strread(parenthesisData{j}(coi+1:end),'%f'); + parenthesisData{j} = parenthesisData{j}(2:coi-1); + end +end +% uniformizing empty cells, value inside brackes may be empty +if any(cellfun('isempty',parenthesisData)) + [parenthesisData{cellfun('isempty',parenthesisData)}] = deal(''); +end + +li = 1; bi = 1; pi = 1; % indexes for leaf, branch and parentheses +queue = zeros(1,2*numLeaves); qp = 0; % setting the queue (worst case size) + +j = 1; + +while j <= numel(strFeatures) + switch strFeatures(j) + case ')' % close parenthesis, pull values from the queue to create + % a new branch and push the new branch # into the queue + lastOpenPar = find(queue(1:qp)==0,1,'last'); + numElemInPar = min(3,qp-lastOpenPar); + switch numElemInPar + case 2 % 99% of the cases, two elements in the parenthesis + bp = bi + numLeaves; + names{bp} = parenthesisData{pi}; % set name + dist(bp) = parenthesisDist(pi); % set length + tree(bi,:) = queue(qp-1:qp); + qp = qp - 2; % writes over the open par mark + queue(qp) = bp; + bi = bi + 1; + pi = pi + 1; + case 3 % find in non-binary trees, create a phantom branch + bp = bi + numLeaves; + names{bp} = ''; % set name + dist(bp) = 0; % set length + tree(bi,:) = queue(qp-1:qp); + qp = qp - 1; % writes over the left element + queue(qp) = bp; + bi = bi + 1; + j = j - 1; %repeat this closing branch to get the rest + case 1 % parenthesis with no meaning (holds one element) + qp = qp - 1; + queue(qp) = queue(qp+1); + pi = pi + 1; + case 0 % an empty parenthesis pair + error('Bioinfo:ParenthesisPairWithNoData', ... + ['Found parenthesis pair with no data,\n', ... + 'input string may not be in Newick style or',... + 'is not a valid filename.']) + end % switch numElemInPar + + case '(' % an open parenthesis marker (0) pushed into the queue + qp = qp + 1; + queue(qp) = 0; + + otherwise % a new leaf pushed into the queue + qp = qp + 1; + queue(qp) = li; + li = li + 1; + end % switch strFeatures + j = j + 1; +end % while j ... + +catch + le = lasterror; + if strcmp(le.identifier,'Bioinfo:ParenthesisPairWithNoData') + rethrow(le) + else + error('Bioinfo:IncorrectString',... + ['An error occurred while trying to interpret the data,\n'... + 'input string may not be in Newick style or is not a '... + 'valid filename.']) + end +end + +% make sure all dists are greater than 0 +dist = max(0,dist); + +if sum(dist) == 0 % there was no distance information so force to an unitary ultrametric tree + tr = phyTree(tree,names); +elseif sum(dist(1:numLeaves)) == 0 % no dist infor for leaves, so force an ultrametric tree + tr = phyTree(tree,dist(numLeaves+1:end),names); +else % put all info into output object + tr = phyTree(tree,dist,names); +end diff --git a/matlab/graph/plotPhytree.m b/matlab/graph/plotPhytree.m new file mode 100644 index 0000000..0fe2192 --- /dev/null +++ b/matlab/graph/plotPhytree.m @@ -0,0 +1,687 @@ +function plotPhytree(tr,varargin) +%PLOTPHYTREE renders a phylogenetic tree. +% +% PLOTPHYTREE(TREE) renders a phylogenetic tree object into a MATLAB figure as a +% phylogram. +% +% plotphytree(...,'ROTATION',value) will orient the phylogenetic tree within +% the figure window. Positive angles cause counterclockwise rotation, +% otherwise clockwise rotation. +% +% plotphytree(...,'FONTSIZE',value) will set the lable font size.A value +% specifying the font size to use for text in units determined by the FontUnits +% property (1 point = 1/72 inch). The default is calculated from the data +% according to the number of nodes. +% +% plotphytree(...,'LINESTYLE',value) will set the color of the tree. The +% default color is blue. +% b blue . point - solid +% g green o circle : dotted +% r red x x-mark -. dashdot +% c cyan + plus -- dashed +% m magenta * star (none) no line +% y yellow s square +% k black d diamond +% v triangle (down) +% ^ triangle (up) +% < triangle (left) +% > triangle (right) +% p pentagram +% h hexagram +% +% plotphytree(...,'FONTCOLOR',value) will set the color of lable. It is a +% vector includes [R, G, B] components. Each with the range of 0 to 1. +% The default setting is [.2 .2 .2]. +% +% +% Example: +% +% tr = phytreeread('pf00002.tree'); +% plotphytree(tr,'ROTATION',-pi/2, 'FONTSIZE', 8, 'LINESTYLE', 'b', 'FONTCOLOR', [0.2 0.2 0.2]); +% +% + +if numel(tr)~=1 + error('plotphytree:NoMultielementArrays',... + 'Phylogenetic tree must be an 1-by-1 object.'); +end + +% set defaults +rotation = 0; +fontsize = 1; +fontsizeset = false; +linestyle = '-b'; +fontcolor = [.2 .2 .2]; + + +if nargin>1 && islogical(varargin{1}) + activeBranches = varargin{1}; + argStart = 2; +else + argStart = 1; +end + +if nargin - argStart > 0 + if rem(nargin - argStart,2) == 1 + error('plotphytree:IncorrectNumberOfArguments',... + 'Incorrect number of arguments to %s.',mfilename); + end + okargs = {'rotation', 'fontsize', 'linestyle', 'fontcolor'}; + + for j = argStart:2:nargin-argStart + pname = varargin{j}; + pval = varargin{j+1}; + k = find(strncmpi(pname,okargs,numel(pname))); + if isempty(k) + error('plotphytree:UnknownParameterName',... + 'Unknown parameter name: %s.',pname); + elseif length(k)>1 + error('plotphytree:AmbiguousParameterName',... + 'Ambiguous parameter name: %s.',pname); + else + switch(k) + case 1 % rotation + if isreal(pval(1)) + rotation = double(pval(1)); + else + error('plotphytree:NotValidType',... + 'ROTATION must be numeric and real'); + end + case 2 % fontsize + if isreal(pval(1)) + fontsize = uint8(pval(1)); + fontsizeset = true; + else + error('plotphytree:NotValidType',... + 'fontsize must be numeric'); + end + case 3 % linecolor + linestyle = pval(1); + case 4 % Fontcolor + fontcolor = [pval(1) pval(2) pval(3)]; + end + end + end +end + +orgtr=tr; +treeinfo = GET(tr); + +%Get tree structure +tr = struct(tr); +tr.numLeaves = treeinfo.NumLeaves; +tr.numNodes = treeinfo.NumNodes; +tr.numBranches = treeinfo.NumBranches; +tr.LeafNames = treeinfo.LeafNames; +tr.BranchNames = treeinfo.BranchNames; +tr.numTree = size(tr.tree, 1); + +% obtain parents for every node +tr.par(tr.tree(:)) = tr.numLeaves + [1:tr.numBranches 1:tr.numBranches]; + +% obtain numbers of leaves for each nodes +tr.numChildrenofNode = ChildCount(tr.par, tr.numLeaves, tr.numNodes); +tr.numChildrenofNode(tr.numChildrenofNode == 0) = 1; + +% find angle of each node +unitDegree = 2 * pi / tr.numLeaves; + +tr.NodeSector = tr.numChildrenofNode * unitDegree; +tr.NodeSector(tr.NodeSector == 0) = unitDegree; +tr.NodeSector(tr.numNodes) = 0; + +tr.NodeAngle = zeros(tr.numNodes, 1); +tr.NodeAngle(tr.tree(tr.numTree, 2)) = tr.NodeSector(tr.tree(tr.numTree, 2)) / 2; +tr.NodeAngle(tr.tree(tr.numTree, 1)) = tr.NodeSector(tr.tree(tr.numTree, 2)) + tr.NodeSector(tr.tree(tr.numTree, 1)) / 2; +for i = tr.numTree - 1 : -1 :1 + tr.NodeAngle(tr.tree(i, 2)) = tr.NodeAngle(tr.par(tr.tree(i, 2))) - tr.NodeSector(tr.par(tr.tree(i, 2))) / 2 + tr.NodeSector(tr.tree(i, 2)) / 2; + tr.NodeAngle(tr.tree(i, 1)) = tr.NodeAngle(tr.par(tr.tree(i, 1))) + tr.NodeSector(tr.par(tr.tree(i, 1))) / 2 - tr.NodeSector(tr.tree(i, 1)) / 2; +end + +% Rotation +tr.NodeAngle = tr.NodeAngle + rotation; + +% find (x, y) coordinates of nodes +tr.d = tr.dist; +tr.x = tr.d .* cos(tr.NodeAngle); +tr.y = tr.d .* sin(tr.NodeAngle); + +for i = tr.numNodes - 1: -1 : 1 + tr.x(i) = tr.x(i) + tr.x(tr.par(i)); + tr.y(i) = tr.y(i) + tr.y(tr.par(i)); +end + +nodeIndex = 1 : tr.numNodes; +X = tr.x([nodeIndex;[tr.par(1:tr.numNodes-1) tr.numNodes]]); +Y = tr.y([nodeIndex;[tr.par(1:tr.numNodes-1) tr.numNodes]]); +tr.txtAngle = cal_textAngle(tr); + +fig = gcf; +% fig = figure('Renderer','ZBuffer'); +h.fig = fig; +h.axes = axes; hold on; +sepUnit = max(tr.x)*[-1/20 21/20]; + +set(h.axes,'XTick',[],'YTick',[]); +set(h.axes,'Position',[.05 .05 .9 .9]) +dispTerminalLabels = false; +axis equal + +h.BranchLines = plot(X,Y,linestyle); + +% resize figure if needed +temp = 10/pi*tr.numLeaves; +correctFigureSize(fig,temp,temp); +fontRatio = max(get(fig,'Position').*[0 0 1 0])/tr.numLeaves; +set(h.axes, 'Fontsize', fontsize); + +% set leaf nodes labels +leafIndex = 1 : tr.numLeaves; +X = tr.x(leafIndex); +Y = tr.y(leafIndex); + +txtangle = tr.txtAngle; + +% % Rotate Label +h.leafNodeLabels = zeros(1, tr.numLeaves); +for i = 1:tr.numLeaves + h.leafNodeLabels(i) = text(X(i),Y(i),tr.names(i), 'Rotation', txtangle(i)); +end + +set(h.leafNodeLabels,'color',fontcolor,'clipping','on') +if fontsizeset + set(h.leafNodeLabels, 'Fontsize', fontsize); +else + set(h.leafNodeLabels,'Fontsize',min(8,ceil(fontRatio*1.2))); +end + +textHeight = mean(cell2mat(get(h.leafNodeLabels,'Extent')))*[0 0 0 1]'; +for ind = 1:numel(h.leafNodeLabels) + if X(ind) - tr.x(tr.par(ind)) < 0 + if txtangle(ind) < 90.0 - 0.001 || txtangle(ind) > 90.0 + 0.001 + set(h.leafNodeLabels(ind),'horizontal','right') + set(h.leafNodeLabels(ind),'Position',get(h.leafNodeLabels(ind),'Position')+[sepUnit(1)*cos(txtangle(ind) * 2 * pi/360) sepUnit(1)*sin(txtangle(ind) * 2 * pi/360) 0]) + else + if Y(ind) > 0 + set(h.leafNodeLabels(ind),'Position',get(h.leafNodeLabels(ind),'Position')-[0 sepUnit(1) 0]) + else + set(h.leafNodeLabels(ind),'Position',get(h.leafNodeLabels(ind),'Position')+[0 sepUnit(1) 0]) + end + end + else + if txtangle(ind) < 90.0 - 0.001 || txtangle(ind) > 90.0 + 0.001 + set(h.leafNodeLabels(ind),'horizontal','left') + set(h.leafNodeLabels(ind),'Position',get(h.leafNodeLabels(ind),'Position')-[sepUnit(1)*cos(txtangle(ind) * 2 * pi/360) sepUnit(1)*sin(txtangle(ind) * 2 * pi/360) 0]) + else + if Y(ind) > 0 + set(h.leafNodeLabels(ind),'Position',get(h.leafNodeLabels(ind),'Position')-[0 sepUnit(1) 0]) + else + set(h.leafNodeLabels(ind),'Position',get(h.leafNodeLabels(ind),'Position')+[0 sepUnit(1) 0]) + end + end + end +end + +dispLeafLabels = 1; +% correct axis limits given the extent of labels +if dispLeafLabels + E = cell2mat(get(h.leafNodeLabels,'Extent')); + if strcmp(get(gca,'XDir'),'reverse') + E(:,1) = E(:,1) - E(:,3); + end + if strcmp(get(gca,'YDir'),'reverse') + E(:,2) = E(:,2) - E(:,4); + end + E=[E;[xlim*[1;0] ylim*[1;0] diff(xlim) diff(ylim)]]; + mins = min(E(:,[1 2])); + maxs = max([sum(E(:,[1 3]),2) sum(E(:,[2 4]),2)]); + axis([mins(1) maxs(1) mins(2) maxs(2)]) +end + +if dispTerminalLabels + set(h.terminalNodeLabels,'Fontsize',min(9,ceil(fontRatio/1.5))); +end + +box off +hold off + +% store handles +set(fig,'UserData',h) +if nargout + handles = h; +end + + +%************************************************************** +% Count number of children for each node +function numChildren = ChildCount(par, numleafs, numnodes) +n = length(par); +numChildren = zeros(1, numnodes); +NoBranchStart = numleafs + 1; + +for i = 1 : numleafs + numChildren(par(i)) = numChildren(par(i)) + 1; +end + +for i = NoBranchStart : n + numChildren(par(i)) = numChildren(par(i)) + numChildren(i); +end +%************************************************************** + +%********************************************************************* +function correctFigureSize(fig,recommendedHeight,recommendedWidth) +% helper function to increase initial figure size depending on the screen & +% tree sizes +screenSize = diff(reshape(get(0,'ScreenSize'),2,2),[],2)-[0;100]; + % 100 gives extra space for the figure header and win toolbar +position = get(fig,'Position'); +if recommendedHeight > position(4) + if recommendedHeight < sum(position([2 4])) + position(2) = sum(position([2 4])) - recommendedHeight; + position(4) = recommendedHeight; + elseif recommendedHeight < screenSize(2) + position(2) = 30; + position(4) = recommendedHeight; + else + position(2) = 30; + position(4) = screenSize(2); + end +end +if recommendedWidth > position(3) + if recommendedWidth < sum(position([1 3])) + position(1) = sum(position([1 3])) - recommendedWidth; + position(3) = recommendedWidth; + elseif recommendedWidth < screenSize(1) + position(1) = 0; + position(3) = recommendedHeight; + else + position(1) = 0; + position(3) = screenSize(1); + end +end +set(fig,'Position',position) +%********************************************************************* + +%********************************************************************* +function [lefttree, righttree, thirdtree] = findsubtreeofnode(tr) + +lefttree = zeros(tr.numBranches, tr.numNodes); +righttree = zeros(tr.numBranches, tr.numNodes); +thirdtree = zeros(tr.numBranches, tr.numNodes); + +% thirdtree_left = zeros(tr.numBranches, tr.numNodes); +% thirdtree_right = zeros(tr.numBranches, tr.numNodes); + +lefttree(1, 1) = tr.tree(1, 1); +righttree(1,1) = tr.tree(1, 2); +lefttree(1, tr.numNodes) = 1; +righttree(1, tr.numNodes) = 1; + +for i = 2 : tr.numBranches + j = 0; + tmp = tr.tree(i, 1); + lefttree(i, 1) = tmp; + lefttree(i, tr.numNodes) = 1; + if tmp > tr.numLeaves + lefttrlen = lefttree(tmp - tr.numLeaves, tr.numNodes); + sub = lefttree(tmp - tr.numLeaves, 1 : lefttrlen); + lefttree(i, 2 : lefttrlen + 2 - 1) = sub; + + righttrlen = righttree(tmp - tr.numLeaves, tr.numNodes); + sub = righttree(tmp - tr.numLeaves, 1 : righttrlen); + lefttree(i, lefttrlen + 3 - 1: lefttrlen + 3 - 1 + righttrlen - 1) = sub; + lefttree(i, tr.numNodes) = lefttrlen + righttrlen + lefttree(i, tr.numNodes); + end + + j = 0; + tmp = tr.tree(i, 2); + righttree(i, 1) = tmp; + righttree(i, tr.numNodes) = 1; + if tmp > tr.numLeaves + lefttrlen = lefttree(tmp - tr.numLeaves, tr.numNodes); + sub = lefttree(tmp - tr.numLeaves, 1 : lefttrlen); + righttree(i, 2 : lefttrlen + 2 - 1) = sub; + + righttrlen = righttree(tmp - tr.numLeaves, tr.numNodes); + sub = righttree(tmp - tr.numLeaves, 1 : righttrlen); + righttree(i, lefttrlen + 3 - 1: lefttrlen + 3 - 1 + righttrlen - 1) = sub; + righttree(i, tr.numNodes) = lefttrlen + righttrlen + righttree(i, tr.numNodes); + end +end + +% Third tree +node = 1 : tr.numLeaves; +for i = 1 : tr.numBranches + tmp = [lefttree(i, 1:lefttree(i, tr.numNodes)), righttree(i, 1:righttree(i, tr.numNodes))]; + tf = ismember(node, tmp); + tmp = node(find(tf == 0)); + tmp = tmp(find(tmp <= tr.numLeaves)); + nlength = length(tmp); + thirdtree(i, 1 : nlength) = tmp; + thirdtree(i, tr.numNodes) = nlength; +end + +%********************************************************************* +function [treeangle, leftnode, rightnode] = caltreeangle(treenode, root, tr) +tmp = find(treenode <= tr.numLeaves); +leftnode = treenode(tmp(1)); +rightnode = treenode(tmp(length(tmp))); + +x0 = tr.x(root); +y0 = tr.y(root); +x1 = tr.x(leftnode); +y1 = tr.y(leftnode); +x2 = tr.x(rightnode); +y2 = tr.y(rightnode); + +treeangle = cal_angle(x0, y0, x1, y1, x2, y2); + +%********************************************************************* +function [subtree1, subtree2] = findsubtree(node, tr) +subtree1 = []; +subtree2 = []; + +nodeindex = 1 : tr.numNodes; + +i1 = 0; +i2 = 0; + +flag1 = 0; +flag2 = 0; + +while ~flag1 || ~flag2 + nodetmp = find(tr.par == node); + + if nodetmp(1) <= tr.numLeaves + subtree1 = [subtree1, nodetmp(1)]; + i1 = i1 + 1; + flag1 = 1; + else + if i1 == 0 + subtree1 = [subtree1, nodetmp(1)]; + end + [subtr1, subtr2] = findsubtree(nodetmp(1), tr); + flag1 = 1; + subtree1 = [subtree1, subtr1, subtr2]; + end + + if nodetmp(2) <= tr.numLeaves + subtree2 = [subtree2, nodetmp(2)]; + i2 = i2 + 1; + flag2 = 1; + else + if i2 == 0 + subtree2 = [subtree2, nodetmp(2)]; + end + [subtr1, subtr2] = findsubtree(nodetmp(2), tr); + flag2 = 1; + subtree2 = [subtree2, subtr1, subtr2]; + end +end + +%********************************************************************* +function leavenodes = findanothersubtree(node, tr) +nextnode = node; +cutpoint = tr.par(length(tr.par)); +subtree = []; +tmpnode = nextnode; +nextnode = tr.par(nextnode); + +while nextnode <= cutpoint + [subtree1, subtree2] = findsubtree(nextnode, tr); + if sum(find(subtree1 == node)) == 0 + subtree = [subtree, subtree1]; + else + subtree = [subtree, subtree2]; + end + tmpnode = nextnode; + if nextnode < cutpoint + nextnode = tr.par(nextnode); + else + break; + end +end +leavenodes = subtree(find(subtree <= tr.numLeaves)); + +%********************************************************************* +function slope = cal_slope(x1, y1, x2, y2) +if x1 == x2 + slope = tan(pi/2); +else + slope = (y2 - y1)/(x2 - x1); +end + +%********************************************************************* +function xangle = cal_angle(x0, y0, x1, y1, x2, y2) +k1 = cal_slope(x0, y0, x1, y1); +k2 = cal_slope(x0, y0, x2, y2); +xangle = atan(abs((k2 - k1)/(1 + k1 * k2))); + +%********************************************************************* +function [leftnode, rightnode, xangle] = cal_thr_angle(trnode, node, tr) +leftnode = 0; +rightnode = 0; +xangle = 0; + +n = length(trnode); +if n == 1 + leftnode = trnode(1); + rightnode = trnode(1); + xangle = 0; + return; +end + +x0 = tr.x(node); +y0 = tr.y(node); + +for i = 1 : n - 1 + x1 = tr.x(trnode(i)); + y1 = tr.y(trnode(i)); + k1 = cal_slope(x0, y0, x1, y1); + for j = i + 1 : n + x2 = tr.x(trnode(j)); + y2 = tr.y(trnode(j)); + k2 = cal_slope(x0, y0, x2, y2); + theta = atan(abs((k2 - k1)/(1 + k1 * k2))); + if xangle < theta + xangle = theta; + leftnode = trnode(i); + rightnode = trnode(j); + end + end +end + +%********************************************************************* +function [left, right] = leftnrightnode(subtree, tr) +lefttree = tr.tree(:, 1); +righttree = tr.tree(:, 2); + +left = 0; +right = 0; +numindex = length(lefttree); +i = 1; +for i = 1 : numindex + [tf, index] = ismember(lefttree(i), subtree); + if tf == 1 + left = lefttree(i); + break; + end +end + +for i = 1 : numindex + [tf, index] = ismember(righttree(i), subtree); + if tf == 1 + right = righttree(i); + break; + end +end + +if left == 0 + left = right; +end +if right == 0 + right = left; +end + +%********************************************************************* +function nodeangle = cal_nodeangle(y, x) +nodeangle = atan2(y, x); +if nodeangle < 0 + nodeangle = 2*pi + nodeangle; +end + +%********************************************************************* +function tree = adjustsubtreeangle(subtr1, subtr2, subtr3, orgnode, tr) + +tr1node = subtr1(find(subtr1 <= tr.numLeaves)); +tr2node = subtr2(find(subtr2 <= tr.numLeaves)); + +tr.x(subtr1) = tr.x(subtr1) - tr.x(orgnode); +tr.y(subtr1) = tr.y(subtr1) - tr.y(orgnode); + +tr.x(subtr2) = tr.x(subtr2) - tr.x(orgnode); +tr.y(subtr2) = tr.y(subtr2) - tr.y(orgnode); + +tr.x(subtr3) = tr.x(subtr3) - tr.x(orgnode); +tr.y(subtr3) = tr.y(subtr3) - tr.y(orgnode); + +[tree1left, tree1right] = leftnrightnode(tr1node, tr); +[tree2left, tree2right] = leftnrightnode(tr2node, tr); +tr3node = subtr3;%leftnrightnode(subtr3, tr); +tree3left = tr3node(1); +tree3right = tr3node(2); + +a1left = cal_nodeangle(tr.y(tree1left), tr.x(tree1left)); +a1right = cal_nodeangle(tr.y(tree1right), tr.x(tree1right)); + +a2left = cal_nodeangle(tr.y(tree2left), tr.x(tree2left)); +a2right = cal_nodeangle(tr.y(tree2right), tr.x(tree2right)); + +a3left = cal_nodeangle(tr.y(tree3left), tr.x(tree3left)); +a3right = cal_nodeangle(tr.y(tree3right), tr.x(tree3right)); + +daylight = [a1right - a3left, a2right - a1left, a3right - a2left]; +equaldaylight = sum(abs(daylight))/3; + +adjust1n3 = equaldaylight - daylight(1); +adjust2n3 = daylight(3) - equaldaylight; + +% Rotation --- subtr1 +a = tr.x(subtr1); +b = tr.y(subtr1); +if adjust1n3 ~= 0 + tr.x(subtr1) = a * cos(adjust1n3) - b * sin(adjust1n3); + tr.y(subtr1) = a * sin(adjust1n3) + b * cos(adjust1n3); +end + +% Rotation --- subtr2 +a = tr.x(subtr2); +b = tr.y(subtr2); +if adjust2n3 ~= 0 + tr.x(subtr2) = a * cos(adjust2n3) - b * sin(adjust2n3); + tr.y(subtr2) = a * sin(adjust2n3) + b * cos(adjust2n3); +end + +tr.x(subtr1) = tr.x(subtr1) + tr.x(orgnode); +tr.y(subtr1) = tr.y(subtr1) + tr.y(orgnode); + +tr.x(subtr2) = tr.x(subtr2) + tr.x(orgnode); +tr.y(subtr2) = tr.y(subtr2) + tr.y(orgnode); + +tr.x(subtr3) = tr.x(subtr3) + tr.x(orgnode); +tr.y(subtr3) = tr.y(subtr3) + tr.y(orgnode); + +tree = tr; + +%********************************************************************* +function tree = adjusttree(lefttree, righttree, thirdtree, node, tr) + +tr1 = lefttree(node - tr.numLeaves, 1 : lefttree(node - tr.numLeaves, tr.numNodes)); +tr2 = righttree(node - tr.numLeaves, 1 : righttree(node - tr.numLeaves, tr.numNodes)); +tr3 = thirdtree(node - tr.numLeaves, 1 : thirdtree(node - tr.numLeaves, tr.numNodes)); +[lefttreeangle, l_leftnode, l_rightnode] = caltreeangle(tr1, node, tr); +[righttreeangle, r_leftnode, r_rightnode] = caltreeangle(tr2, node, tr); +[thr_leftnode, thr_rightnode, xangle] = cal_thr_angle(tr3, node, tr); + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +d1 = sqrt((tr.x(l_rightnode) - tr.x(thr_leftnode)) * (tr.x(l_rightnode) - tr.x(thr_leftnode)) + (tr.y(l_rightnode) -tr.y(thr_leftnode)) * (tr.y(l_rightnode) -tr.y(thr_leftnode))); +d2 = sqrt((tr.x(l_rightnode) - tr.x(thr_rightnode)) * (tr.x(l_rightnode) - tr.x(thr_rightnode)) + (tr.y(l_rightnode) -tr.y(thr_rightnode)) * (tr.y(l_rightnode) -tr.y(thr_rightnode))); + +if d2 > d1 + tmp = thr_leftnode; + thr_leftnode = thr_rightnode; + thr_rightnode = tmp; +end + +% daylight = 2 * pi - lefttreeangle - righttreeangle - xangle; +% equaldaylight = daylight / 3; + +[angle1to3, angle2to3, angle1to2] = cal_daylight(l_leftnode, l_rightnode, r_leftnode, r_rightnode, thr_leftnode, thr_rightnode, node, tr); + +daylight = angle1to3 + angle2to3 + angle1to2; +equaldaylight = daylight / 3; + +adjust1n3 = equaldaylight - angle1to3; +adjust2n3 = angle2to3 - equaldaylight; + +tr.x(tr1) = tr.x(tr1) - tr.x(node); +tr.y(tr1) = tr.y(tr1) - tr.y(node); + +tr.x(tr2) = tr.x(tr2) - tr.x(node); +tr.y(tr2) = tr.y(tr2) - tr.y(node); + +% Rotation --- subtr1 +a = tr.x(tr1); +b = tr.y(tr1); +if adjust1n3 ~= 0 + tr.x(tr1) = a * cos(adjust1n3) - b * sin(adjust1n3); + tr.y(tr1) = a * sin(adjust1n3) + b * cos(adjust1n3); +end + +% Rotation --- subtr2 +a = tr.x(tr2); +b = tr.y(tr2); +if adjust2n3 ~= 0 + tr.x(tr2) = a * cos(adjust2n3) - b * sin(adjust2n3); + tr.y(tr2) = a * sin(adjust2n3) + b * cos(adjust2n3); +end + +tr.x(tr1) = tr.x(tr1) + tr.x(node); +tr.y(tr1) = tr.y(tr1) + tr.y(node); + +tr.x(tr2) = tr.x(tr2) + tr.x(node); +tr.y(tr2) = tr.y(tr2) + tr.y(node); + +tree = tr; + +%********************************************************************* +function [angle1to3, angle2to3, angle1to2] = cal_daylight(l_leftnode, l_rightnode, r_leftnode, r_rightnode, thr_leftnode, thr_rightnode, node, tr) + +x0 = tr.x(node); +y0 = tr.y(node); + +llx = tr.x(l_leftnode); +lly = tr.y(l_leftnode); +lrx = tr.x(l_rightnode); +lry = tr.y(l_rightnode); +rlx = tr.x(r_leftnode); +rly = tr.y(r_leftnode); +rrx = tr.x(r_rightnode); +rry = tr.y(r_rightnode); + +tlx = tr.x(thr_leftnode); +tly = tr.y(thr_leftnode); +trx = tr.x(thr_rightnode); +tryy = tr.y(thr_rightnode); + +angle1to3 = cal_angle(x0, y0, llx, lly, trx, tryy); +angle2to3 = cal_angle(x0, y0, rrx, rry, tlx, tly); +angle1to2 = cal_angle(x0, y0, lrx, lry, rlx, rly); + +%********************************************************************* +function txtAngle = cal_textAngle(tr) +nodeIndex = 1 : tr.numLeaves; +X = tr.x(nodeIndex) - tr.x(tr.par(nodeIndex)); +Y = tr.y(nodeIndex) - tr.y(tr.par(nodeIndex)); + +txtAngle = atan(Y./X) * 360/(2*pi); \ No newline at end of file diff --git a/matlab/graph/plotVarmuus.m b/matlab/graph/plotVarmuus.m new file mode 100644 index 0000000..072bcb5 --- /dev/null +++ b/matlab/graph/plotVarmuus.m @@ -0,0 +1,102 @@ +function plotVarmuus(V, C, pointers, varmuus, coordinates, partition, tekstit) + +if nargin < 7 + tekstit = pointers; +end + +notEmptyPops = length(unique(partition)); +if notEmptyPops>30 + disp(['Number of populations: ' num2str(notEmptyPops)]); + disp(' '); + disp('Figure can be drawn only if the number of populations'); + disp('is less or equal to 30.'); + disp(' '); + return; +end + + +h1 = figure; +hold on + +colors=giveColors(notEmptyPops); + +[I, J] = find(coordinates>0 | coordinates<0); +I=unique(I); +xmin = min(coordinates(I,1)); +xmax = max(coordinates(I,1)); +xdiff = (xmax-xmin); +xmean = xmin + xdiff/2; + +ymin = min(coordinates(I,2)); +ymax = max(coordinates(I,2)); +ydiff = (ymax-ymin); +ymean = ymin + ydiff/2; + +pituus = max(ydiff,xdiff)*1.1/2; + +zmax = 0.8*max(varmuus) + 0.2; + +axis([xmean-pituus xmean+pituus ymean-pituus ymean+pituus 0 zmax]); +grid(gca); +d = [1 2 3 1]; + + + +for i=1:length(C) + koko = length(C{i}); + soluPisteet = V(C{i},:); + center = [mean(soluPisteet(:,1)) mean(soluPisteet(:,2))]; + center = repmat(center, [koko 1]); + soluPisteet = soluPisteet + (center - soluPisteet)./1000; + + + apu = zeros(2*koko, 3); + apu(1:koko, 1:2) = soluPisteet; + apu(koko+1:end, 1:2) = soluPisteet; + apu(koko+1:end, 3) = varmuus(i); + + taulu = pointers{i}; + if length(taulu)>0 + color = colors(partition(taulu(1)),:); + pisteet =[1:koko 1]; + patch('XData', apu(pisteet,1), 'YData', apu(pisteet,2), ... + 'ZData', apu(pisteet,3), 'FaceColor',color, 'Clipping', ... + 'on', 'EdgeColor','k', 'LineWidth', 1); + + pisteet = pisteet+koko; + patch('XData', apu(pisteet,1), 'YData', apu(pisteet,2), ... + 'ZData', apu(pisteet,3), 'FaceColor',color, 'Clipping', ... + 'on', 'EdgeColor','k', 'LineWidth', 1); + + for j = 1:koko-1 + pisteet = [j j+1 j+koko+1 j+koko j]; + patch('XData', apu(pisteet,1), 'YData', apu(pisteet,2), ... + 'ZData', apu(pisteet,3), 'FaceColor',color, 'Clipping', ... + 'on', 'EdgeColor','k', 'LineWidth', 1); + end + + pisteet = [koko 1 koko+1 2*koko koko]; + patch('XData', apu(pisteet,1), 'YData', apu(pisteet,2), ... + 'ZData', apu(pisteet,3), 'FaceColor',color, 'Clipping', ... + 'on', 'EdgeColor','k', 'LineWidth', 1); + end +end + +if ~isequal(tekstit, -1) + for i=1:length(pointers) + taulu = pointers{i}; + teksti = tekstit{i}; + if isnumeric(teksti) + teksti = num2str(teksti); + end + if length(taulu)>0 + text(coordinates(taulu(1),1),coordinates(taulu(1),2), ... + varmuus(i) + zmax/100, teksti, 'FontSize', 10); + end + end +end + +view(3); +hold off + + diff --git a/matlab/graph/plotflow.m b/matlab/graph/plotflow.m new file mode 100644 index 0000000..119b653 --- /dev/null +++ b/matlab/graph/plotflow.m @@ -0,0 +1,282 @@ +function plotflow(action) +switch action + case 'rename' + rename; + case 'prune' + prune; + case 'edit_pop_name' + changePopNumber(0); + case 'cancel_pop_name' + closereq; + case 'load_pop_names' + loadPopNames; + case 'ok_pop_name' + okPopName; + case 'five_back' + changePopNumber(-5); + case 'one_back' + changePopNumber(-1); + case 'one_ahead' + changePopNumber(1); + case 'five_ahead' + changePopNumber(5); + case 'help' + openHelp; +end +return + +% ------------------------------------------------------------------------- +function rename +h0 = gcf; +h1 = findobj(h0, 'Tag','attr_menu'); +g = get(h1,'Userdata'); +varnames = g.varnames; +openInputPopNamesFigure(varnames); % rename clusters + +% ------------------------------------------------------------------------- +function prune +h0 = gcf; +h1 = findobj(h0, 'Tag','attr_menu'); +g = get(h1,'Userdata'); +adjmat = g.adjmat; +if isfield(g,'adjmat2') + adjmat2 = g.adjmat2; + min_strength = min(adjmat2(logical(adjmat2>0))); +else + min_strength = min(adjmat(logical(adjmat>0))); +end + +%waitALittle; +answer = inputdlg( ['Specify the minimal strength of the gene flow graph such that'... + ' edges with lower weight will be pruned'],... + 'Prune the graph',1, {num2str(min_strength)}); +if isempty(answer) % cancel has been pressed + return +else + min_strength = str2num(answer{1}); + fprintf('Minimal flow strength: %4.5f\n', min_strength); + adjmat(adjmat num_of_pops + disable('five_ahead_button'); +else enable('five_ahead_button'); +end; +if (num_of_curr + 1) > num_of_pops + disable('one_ahead_button'); +else enable('one_ahead_button'); +end; + + +function loadPopNames +%Loads the population names to UserData of 'population_ +%names_figure'. Gets the names from a file specified by +%the user. + +[filename,pathname] = uigetfile('*.txt','Load Population Names'); +if (filename == 0) & (pathname == 0) + %Cancel was pushed. + return, +end; +input_file = [pathname filename]; +%Read population names from the file to a variable 'names': +fid = fopen(input_file); +if fid == -1 + %File didn't exist + msgbox('Loading of the population names was unsuccessful', ... + 'Error', 'error'); + return; +end; +line = fgetl(fid); +counter = 1; +while (line ~= -1) && ~isempty(line) + names{counter} = line; + line = fgetl(fid); + counter = counter + 1; +end; +fclose(fid); +%Check that the number of lines is the same as npops: +h0 = findobj('Tag','data_info_button'); +tiedot = get(h0,'UserData'); +counts = tiedot.countsUD; +sizc = size(counts); npops = sizc(3); +length_names = length(names); +if npops ~= length_names + msgbox(['Loading of names was unsuccessful.' ... + 'The number of lines in a file that contains the names ' ... + 'must be same as the number of observed ' ... + 'sampling units in the data.'] ,'Error', ... + 'error'); + return; +end; +%Save names to the UserData of the 'population_names_figure': +h0 = findobj('Tag','population_names_figure'); +tiedot = get(h0,'UserData'); +tiedot.tempnamesUD = names; +set(h0,'UserData',tiedot); +%Update text-field to initial state: +h0 = findobj('Tag','input_pop_name_text'); +set(h0,'String','Name of Cluster 1:'); +%Set the name of the first population to the screen: +h0 = findobj('Tag','pop_name_edit'); +set(h0,'String',names{1}); + + +function okPopName +%Saves the names of populations and closes +%'population_names_figure'. + +h0 = findobj('Tag','population_names_figure'); +tiedot = get(h0,'UserData'); +names = tiedot.tempnamesUD; +%Close the figure: +% closereq; +close(h0); + +handle = gcf; +h0 = findobj(handle,'Tag','attr_menu'); +g = get(h0,'Userdata'); +g.varnames = names; +set(h0,'UserData',g); + +% redraw the figure +switch g.type + case 'GENEFLOW' + graphvizpath = g.graphvizpath; + nodecolors = g.nodecolors; + nodestyles = g.nodestyles; + arccolors = g.arccolors; + arcstyles = g.arcstyles; + groupnames = g.varnames; + + d = cd; + plotmodel(g.adjmat2,g.k,'graphvizpath', graphvizpath, ... + 'nodecolors', nodecolors, ... + 'nodestyles', nodestyles, ... + 'arccolors',arccolors, ... + 'arcstyles',arcstyles, ... + 'varnames', groupnames, ... + 'target', ['matlab:',num2str(handle)]); + cd(d); + case {'nj', 'upgma'} + switch g.visualtype + case {'square', 'angular'} + viewDendrogram(g.visualtype) + case {'radial', 'phylogram'} + viewUnrooted(g.visualtype) + end + otherwise + return +end + +function enable(obj_tag) +h0 = findobj('Tag',obj_tag); +set(h0,'Enable','on'); + + +function disable(obj_tag) +h0 = findobj('Tag',obj_tag); +set(h0,'Enable','off'); + + +function openHelp +info{1}='Gene Flow Between Populations'; +info{2}=''; +info{3}='Produced using GraphViz'; +info{4}='Source: gene flow matrix produced by BAPS'; +info{5}=''; +info{6}='Author: Jing Tang'; +info{7}=''; +helpdlg(info,'Help'); diff --git a/matlab/graph/plotmodel.m b/matlab/graph/plotmodel.m new file mode 100644 index 0000000..c76b0dc --- /dev/null +++ b/matlab/graph/plotmodel.m @@ -0,0 +1,342 @@ +function handle = plotmodel(B, k, varargin) +% PLOTMODEL Plot a causal model. +% +% Syntax: plotmodel(B, k, ...) +% +% Plots a causal model as a directed acyclic graph. The graph +% layout is done with Graphviz, see www.graphviz.org for details. +% You need either Graphviz installed in your system or a Java +% interpreter with internet connection. +% +% Intended usage is to first call LiNGAM to do the causal discovery +% and then to call 'plotmodel' to visualize the estimated model. +% +% Example: +% [B stde ci k] = lingam(data); +% plotmodel(B, k); +% +% Also latent models can be plotted with dashed nodes and incoming/ +% outgoing arcs. Use 'latent' argument with logical column array +% indicating the latent variables (in causal order). +% +% Example: +% latent = logical([0 1 0 0]'); +% plotmodel(B, k, 'latent', latent, 'target', 'psviewer'); +% +% To plot on a specific Matlab figure, use the 'target' argument +% with value 'matlab:[handle]', e.g. 'matlab:3' to plot on figure +% no. 3. +% +% Required input arguments: +% B - the weight matrix. +% k - the causal order of variables. +% +% Optional input arguments in name, value pairs [default]: +% 'target' - target of output ['matlab' (Octave: 'psviewer')]: +% 'matlab' - plot graph on Matlab figure window +% 'java' - plot graph with Grappa. Uses a remote layout server +% if 'dot' can't be found. In that case, needs an +% internet connection. +% 'psviewer' - plot graph with a PS viewer. Default for Octave. +% Edit file 'settings.m' to set your PS viewer. +% '[filename]' - plot graph on eps-file '[filename]' +% 'varnames' - names of variables in cell array of strings [{'x1' 'x2' ...}] +% 'plotarcs' - plot labels for arcs (boolean) [true] +% 'layout' - layout of the graph ['td']: +% 'td' - plot nodes in hierarhial layers, top-down +% 'lr' - plot nodes in hierarhial layers, left to right +% 'circle' - plot nodes in cirle (not possible with +% 'target', 'java') +% 'nodeshapes' - cell array of node shapes ['ellipse'] +% 'nodestyles' - cell array of node styles ['solid'] +% 'arcstyles' - cell array of arc styles ['solid'] +% 'arccolors' - cell array of arc colors ['black'] +% 'arclabels' - cell array of arc labels ['B(j, i)'] +% 'latent' - a logical column vector of latent variables [[]] +% + +% Check if we are running Octave +isoctave = exist('OCTAVE_VERSION'); + +% Check number of input arguments +if nargin < 2 || rem(nargin, 2) ~= 0 + error('number of input arguments must be >= 2 and even'); +end + +% Initialize variables +dims = size(B, 1); +adj = B(k, k)'; + +if isoctave + target = 'psviewer'; +else + target = 'matlab'; +end +handle = []; % Matlab figure handle, [] = create a new figure + +varnames = cell([dims 1]); +for i = 1:dims + varnames{i} = strcat('x', int2str(k(i))); +end + +plotarcs = true; +layout = 'td'; +nodeshapes = cell(dims, 1); +nodeshapes(:) = {'ellipse'}; +nodestyles = cell(dims, 1); +nodestyles(:) = {'solid'}; +adjmat = adj ~= 0; +arcstyles = cell(dims); +arcstyles(adjmat) = {'solid'}; +arccolors = cell(dims); +arccolors(adjmat) = {'black'}; +latent = []; + +% Handle optional input arguments +for i = 1:2:(nargin - 2) + switch varargin{i} + case 'target', [target handle] = parsetarget(varargin{i + 1}); + case 'plotarcs', plotarcs = varargin{i + 1}; + case 'layout', layout = varargin{i + 1}; + case 'nodeshapes', nodeshapes = varargin{i + 1}; + case 'nodestyles', nodestyles = varargin{i + 1}; + case 'nodecolors', nodecolors = varargin{i + 1}; + case 'arcstyles', arcstyles = varargin{i + 1}; + case 'arccolors', arccolors = varargin{i + 1}; + case 'arclabels', arclabels = varargin{i + 1}; + case 'latent', latent = varargin{i + 1}; + case 'varnames' + tempnames = varargin{i + 1}; + for i = 1:dims + varnames{i} = tempnames{k(i)}; + end + case 'graphvizpath', graphvizpath = varargin{i+1}; + otherwise, warning('unknown input argument: %s\n', varargin{i}); + end +end + +% If the latent variables are given, force the drawing attributes +% of nodes and incoming/outgoing arcs to dashed +nodestyles(latent) = {'dashed'}; +latentindices = find(latent)'; +for i = latentindices + in = adjmat(:, i); + out = adjmat(i, :); + arcstyles(i, out) = {'dashed'}; + arcstyles(in, i) = {'dashed'}; +end + +% Set filenames +dotfile = 'plotmodel_temp.dot'; +switch target + case 'matlab' + if isoctave + format = 'ps'; + imgfile = 'plotmodel_temp.ps'; + else + format = 'png'; + imgfile = 'plotmodel_temp.png'; + end + case 'java' + % no need for image file + case 'psviewer' + format = 'ps'; + imgfile = newtempfile('ps'); + settings % find out the PS viewer + otherwise + format = 'ps'; + imgfile = target; +end + +% Generate labels for arcs +if ~exist('arclabels') + if plotarcs + arclabels = double2labels(adj); + else + arclabels = double2labels(zeros(dims)); + end +end + +% Call graph2dot to write the graph to a dot-file +graph2dot(adj, dotfile, ... + 'nodelabels', varnames, ... + 'nodeshapes', nodeshapes, ... + 'nodestyles', nodestyles, ... + 'nodecolors', nodecolors, ... + 'arclabels', arclabels, ... + 'arcstyles', arcstyles, ... + 'arccolors', arccolors, ... + 'arclabels', arclabels, ... + 'leftright', strcmp(layout, 'lr'),... + 'graphvizpath', graphvizpath); + +if strcmp(target, 'java') + % Call Grappa to produce the picture + libstr = '-cp ../lib/grappa1_2.jar:../lib/lingam.jar'; + classstr = ' lingam.GraphPlotter '; + if isoctave + progcall = strcat('java ', libstr, classstr, dotfile); + else + progcall = strcat(['java ', libstr, classstr, dotfile]); + end +else + % Call Graphviz to produce the picture + if strcmp(layout, 'circle') + prog = 'circo'; + else + prog = 'dot'; + end + + if isoctave + progcall = strcat(prog, ' -T', format, ' ', dotfile, ' -o ', imgfile); + else + progcall = strcat([prog, ' -T', format, ' ', dotfile, ... + ' -o ', imgfile]); + end +end + +% Call Graphviz/Grappa +if ispc + shell = 'dos'; +else + shell = 'unix'; +end + +cd(graphvizpath); % change the directory to use dot.exe +shellcall = strcat(shell, '(''', progcall, ''')'); +[status msg] = eval(shellcall); + +if status + if strcmp(target, 'java') + error('calling Java caused an error: %s\n', msg); + else + error('calling "%s" caused an error: %s\n', prog, msg); + end +end + +switch target + case 'matlab' + % Read the image file + if isoctave + options = '-antialias'; + [img cmap] = imread(imgfile, options); + else + [img cmap] = imread(imgfile, 'png'); + end + + % Create a new window / set the current + if isempty(handle) + handle = figure; % Create a new figure + figx = 100; + figy = 100; + else + figure(handle); % Draw on the given figure + if ~isoctave + position = get(handle, 'Position'); + figx = position(1); + figy = position(2); + end + end + + % Resize the figure window + if ~isoctave + set(handle, 'Position', [figx figy size(img, 2) size(img, 1)]); + end + + % Show the image and set the color map + image(img); + if isoctave + toolow = find(cmap < 0.0); + toohigh = find(cmap > 1.0); + if any(toolow) || any(toohigh) + fprintf('Color map values are out of range [0 1]!\n'); + else + colormap(cmap); + end + else + colormap(cmap); + set(gca, 'Position', [0 0 1 1]); + set(gca, 'XTick', []); + set(gca, 'YTick', []); + end + % delete(imgfile); + case 'java' + % nothing to do + case 'psviewer' + fprintf('graph plotted to file "%s"\n', imgfile); + if isoctave + progcall = strcat(PSVIEWER, ' ', imgfile, ' &'); + else + progcall = strcat([PSVIEWER, ' ', imgfile, ' &']); + end + shellcall = strcat(shell, '(''', progcall, ''')'); + [status msg] = eval(shellcall); + if status + error('calling "%s" caused an error: %s\n', prog, msg); + end + otherwise + fprintf('graph plotted to file "%s"\n', imgfile); +end + +% added at 25.5.2009 for saving the raw dot and png from graphviz. +save_preproc = questdlg('Do you wish to save the graph?',... + 'Save graph in png and dot format?',... + 'Yes','No','Yes'); +if isequal(save_preproc,'Yes') + waitALittle; + [filename, pathname] = uiputfile('*.png','Save the graph as'); + if filename~=0 + [path,name,suffix,version] = fileparts(filename); + copyfile(dotfile, [pathname '\' name '.dot']); + copyfile(imgfile, [pathname '\' filename]); + end + +end +% h1 = findobj('Tag','filename1_text'); +% [pathstr, name] = fileparts(get(h1,'string')); + +delete(dotfile); +delete(imgfile); +figure(handle); + +% ----------------------------------------------------------------------------- +function name = newtempfile(suffix) +% NEWTEMPFILE Create a new temporary file. + +prefix = 'plotmodel_temp_'; % common suffix for all temp files +maxnumber = 0; + +files = dir('.'); +for i = 1:length(files) + filename = files(i).name; + + dotindex = find(filename == '.'); % works for Matlab and Octave + if isempty(dotindex) + continue; + end + + suff = filename(dotindex + 1:length(filename)); + if strcmp(suff, suffix) & findstr(filename, 'plotmodel_temp_') == 1 + number = str2num(filename(16:dotindex - 1)); + if number > maxnumber + maxnumber = number; + end + end +end + +name = strcat('plotmodel_temp_', num2str(maxnumber + 1), '.', suffix); + + +% ----------------------------------------------------------------------------- +function [target, handle] = parsetarget(s) +% PARSETARGET Parse target and a possible figure handle. + +colonind = find(s == ':'); % works for Matlab and Octave +if isempty(colonind) + target = s; + handle = []; +else + colonind = colonind(1); + target = s(1:colonind - 1); + handle = str2num(s(colonind + 1:length(s))); +end diff --git a/matlab/graph/population_names_figure.m b/matlab/graph/population_names_figure.m new file mode 100644 index 0000000..2f508bd --- /dev/null +++ b/matlab/graph/population_names_figure.m @@ -0,0 +1,100 @@ +function fig = population_names_figure() +% This is the machine-generated representation of a Handle Graphics object +% and its children. Note that handle values may change when these objects +% are re-created. This may cause problems with any callbacks written to +% depend on the value of the handle at the time the object was saved. +% +% To reopen this object, just type the name of the M-file at the MATLAB +% prompt. The M-file and its associated MAT-file must be on your path. + +% load population_names_figure +waitALittle; +load baps4 +h0 = figure('Color',[0.8 0.8 0.8], ... + 'Colormap',mat0, ... + 'CloseRequestFcn','plotflow cancel_pop_name', ... + 'MenuBar','none', ... + 'Name','Cluster Names', ... + 'NumberTitle','off', ... + 'Position',[533 448 365 197], ... + 'Resize','off', ... + 'Tag','population_names_figure'); +h1 = uimenu('Parent',h0, ... + 'Label','Files', ... + 'Tag','input_pop_name_file_menu'); +h2 = uimenu('Parent',h1, ... + 'Callback','plotflow load_pop_names', ... + 'Label','Load Names', ... + 'Tag','load_pop_names_menu'); +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'Callback','plotflow cancel_pop_name', ... + 'ListboxTop',0, ... + 'Position',[122.25 12 59 19], ... + 'String','Cancel', ... + 'Tag','cancel_pop_name_button'); +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'Callback','plotflow ok_pop_name', ... + 'ListboxTop',0, ... + 'Position',[192.75 12 59 19], ... + 'String','Ok', ... + 'Tag','ok_pop_name_button'); +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'BackgroundColor',[0.753 0.753 0.753], ... + 'ListboxTop',0, ... + 'Position',[18.75 47.25 233.25 82.5], ... + 'Style','frame', ... + 'Tag','input_pop_name_frame'); +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'BackgroundColor',[0.753 0.753 0.753], ... + 'HorizontalAlignment','left', ... + 'ListboxTop',0, ... + 'Position',[30.75 107.25 203.25 13.5], ... + 'String','Name of Cluster 1:', ... + 'Style','text', ... + 'Tag','input_pop_name_text'); +h1 = uicontrol('Parent',h0, ... + 'Units','points', ... + 'BackgroundColor',[1 1 1], ... + 'Callback','plotflow edit_pop_name', ... + 'HorizontalAlignment','left', ... + 'ListboxTop',0, ... + 'Position',[32.25 86.25 201.75 17.25], ... + 'Style','edit', ... + 'Tag','pop_name_edit'); +h1 = uicontrol('Parent',h0, ... + 'FontWeight','bold', ... + 'Units','points', ... + 'Callback','plotflow five_back', ... + 'ListboxTop',0, ... + 'Position',[159.75 60 15 15], ... + 'String','<<', ... + 'Tag','five_back_button'); +h1 = uicontrol('Parent',h0, ... + 'FontWeight','bold', ... + 'Units','points', ... + 'Callback','plotflow one_back', ... + 'ListboxTop',0, ... + 'Position',[179.25 60 15 15], ... + 'String','<', ... + 'Tag','one_back_button'); +h1 = uicontrol('Parent',h0, ... + 'FontWeight','bold', ... + 'Units','points', ... + 'Callback','plotflow one_ahead', ... + 'ListboxTop',0, ... + 'Position',[199.5 60 15 15], ... + 'String','>', ... + 'Tag','one_ahead_button'); +h1 = uicontrol('Parent',h0, ... + 'FontWeight','bold', ... + 'Units','points', ... + 'Callback','plotflow five_ahead', ... + 'ListboxTop',0, ... + 'Position',[219.75 60 15 15], ... + 'String','>>', ... + 'Tag','five_ahead_button'); +if nargout > 0, fig = h0; end diff --git a/matlab/graph/seqNeighJoin.m b/matlab/graph/seqNeighJoin.m new file mode 100644 index 0000000..d63fe0c --- /dev/null +++ b/matlab/graph/seqNeighJoin.m @@ -0,0 +1,250 @@ +function t = seqNeighJoin(D, method, names, varargin) +%SEQNEIGHJOIN neighbor-joining method for phylogenetic tree reconstruction. +% +% TREE = SEQNEIGHJOIN(DIST) computes a phylogenetic tree object from the +% pairwise distances DIST between the species or products applying the +% neighbor-joining method. The input DIST is a matrix (or vector) such as +% is generated by SEQPDIST. +% +% TREE = SEQNEIGHJOIN(DIST,METHOD) selects the method to compute the +% distances of the new nodes to all other nodes at every iteration. The +% general expression to calculate the distances between the new node (n) +% (after joining i and j) and all others nodes (k) is given by: +% +% D(n,k) = a*D(i,k) + (1-a)*D(j,k) - a*D(n,i) - (1-a)*D(n,j) +% +% This expression guarantees to find the correct tree with additive data +% (a.k.a. minimum variance reduction). The options for METHOD are: +% +% 'equivar' --- Assumes equal variance and independence of +% (default) evolutionary distance estimates (a = 1/2). Such as +% in Studier and Keppler, JMBE (1988). +% 'firstorder' --- Assumes a first order model of the variances and +% covariances of evolutionary distance estimates, 'a' +% is adjusted at every iteration to a value between 0 +% and 1. Such as in Gascuel, JMBE (1997). +% 'average' --- New distances are the weighted average of previous +% distances, the branch distances are ignored: +% D(n,k) = [ D(i,k) + D(j,k) ] /2 +% As in the original neighbor-joining algorithm by +% Saitou and Nei, JMBE (1987) +% +% TREE = SEQNEIGHJOIN(DIST,METHOD,NAMES) passes a list of names to label +% the leaf nodes (e.g. species or products) in the phylogenetic tree +% object. NAMES can be a vector of structures with the fields 'Header' or +% 'Name' or a cell array of strings. In both cases the number of elements +% provided must comply with the number of samples used to generate the +% pairwise distances in DIST. +% +% TREE = SEQNEIGHJOIN(...,'REROOT',false) excludes rerooting the resulting +% tree. This is useful to observe the original linkage order followed by +% the algorithm. By default SEQNEIGHJOIN reroots the resulting tree using +% the mid-point method. +% +% Example: +% +% % Load a multiple alignment of amino acids: +% seqs = fastaread('pf00002.fa'); +% +% % Measure the 'Jukes-Cantor' pairwise distances: +% dist = seqpdist(seqs,'method','jukes-cantor','indels','pair'); +% +% % Build the phylogenetic using the neighbor-joining algorithm +% tree = seqneighjoin(dist,'equivar',seqs) +% view(tree) +% +% See also MULTIALIGN, PHYTREE, PHYTREE/REROOT, PHYTREE/VIEW, SEQLINKAGE, +% SEQPDIST. + +% References: +% [1] Saitou N, Nei M.The neighbor-joining method: a new method for +% reconstructing phylogenetic trees. Mol Biol Evol.(1987) 4(4):406-25 +% [2] Gascuel O. BIONJ: An improved version of the NJ algorithm based on a +% simple model of sequence data. Mol. Biol. Evol. (1997) 14:685-695 +% [3] Studier JA, Keppler KJ. A note on the neighbor-joining algorithm of +% Saitou and Nei. Mol Biol Evol. (1988) 5(6):729-31. + +% Copyright 2003-2006 The MathWorks, Inc. +% $Revision: 1.1.8.2 $ $Date: 2006/05/17 20:48:43 $ + +rerootTree = true; +checkForNames = true; + +% check the input distances +if isnumeric(D) + [m, n] = size(D); + if isvector(D) + n = numel(D); + m = (1+sqrt(1+8*n))/2; % number of leaves + if m ~= fix(m) + error('Bioinfo:seqneighjoin:DbadSize',... + 'Size of DIST not compatible with the output of the SEQPDIST function.'); + end + D = squareform(D); + elseif m~=n + error('Bioinfo:seqneighjoin:DnotSquare',... + 'Size of DIST not compatible with the output of the SEQPDIST function.'); + end +else + error('Bioinfo:seqneighjoin:DnotNumeric',... + 'DIST must be a numeric vector compatible with the output of the SEQPDIST function.'); +end + +% Selects appropiate method +if nargin == 1 + method = 'e'; % set default method + checkForNames = false; +else + okmethods = {'equivar','firstorder','average','reroot'}; + methodkeys = {'e','f','a'}; + k = find(strncmpi(method,okmethods,numel(method))); + if isempty(k) + error('Bioinfo:seqneighjoin:UnknownMethod',... + 'Unknown method name: %s.',method); + elseif length(k)>1 + error('Bioinfo:seqneighjoin:IncorrectMethod',... + 'Ambiguous method name: %s.',method); + elseif k<4 + method = methodkeys{k}; + else % case that second and third input are optional paired inputs + if numel(varargin) || nargin==2 + error('Bioinfo:seqneighjoin:IncorrectInputArguments',... + 'Incorrect format of input arguments.') + end + varargin = {method,names}; + checkForNames = false; + method = 'e'; % set default method + end +end + +% detects the names +if checkForNames && (nargin >2) % names were supplied, check validity + if iscell(names) || isfield(names,'Header') || isfield(names,'Name') || isfield(names,'LocusName') + if isfield(names,'Header') % if struct put them in a cell + names = {names(:).Header}; + elseif isfield(names,'Name') % if struct put them in a cell + names = {names(:).Name}; + elseif isfield(names,'LocusName') % if struct put them in a cell + names = {names(:).LocusName}; + end + names = names(:); + namesSupplied = true; + if numel(names)~=m + error('Bioinfo:seqneighjoin:IncorrectSize',... + 'NAMES must have the same size as number of leaves in the tree') + end + elseif strncmpi(names,{'reroot'},numel(names)) + varargin = {names varargin{:}}; + namesSupplied = false; + else + error('Bioinfo:seqneighjoin:IncorrectInputType',... + 'NAMES must be a cell with char arrays or a vector of structures.') + end +else + namesSupplied = false; +end + +% check optional input arguments +nvarargin = numel(varargin); +if nvarargin + if rem(nvarargin,2) + error('Bioinfo:seqneighjoin:IncorrectNumberOfArguments',... + 'Incorrect number of arguments to %s.',mfilename); + end + okargs = {'reroot',''}; + for j=1:2:nvarargin + pname = varargin{j}; + pval = varargin{j+1}; + k = find(strncmpi(pname,okargs,numel(pname))); + if isempty(k) + error('Bioinfo:seqneighjoin:UnknownParameterName',... + 'Unknown parameter name: %s.',pname); + elseif length(k)>1 + error('Bioinfo:seqneighjoin:AmbiguousParameterName',... + 'Ambiguous parameter name: %s.',pname); + else + rerootTree = opttf(pval); + if isempty(rerootTree) + error('Bioinfo:seqneighjoin:rerootInputOptionNotLogical',... + '%s must be a logical value, true or false.',... + upper(char(okargs(k)))); + end + end + end +end + +% ------------------------------------------------------------------------ +% Algorithm starts here: +% +% D - pairwise distance matrix (full form) +% method - 'a' for Saitou & Nei, 'e' for Studier & Keppler, and 'f' for +% Gascuel +% names - labels for leaf nodes + +N = size(D,1); +Z = zeros(N-1,2); +bd = zeros(N*2-1,1); + +p = 1:N; % pointers in matrix +bc=1; % branch counter +if method=='f' + V=D; %initialize the variance matrix for Gascuel method +end + +for n = N:-1:3 + R = sum(D)/(n-2); % sums of columns + Q = D-repmat(R,n,1)-repmat(R',1,n)+diag(inf(n,1)); %Studier & Keppler optimized + % S = (sum(sum(D))/(n-2)+Q)/2 % original total sum in Saitou & Nei + [m,g] = min(Q(:)); %#ok + [i,j] = ind2sub(n,g); % find minimum + if i>j + k=i;i=j;j=k; % j>i always + end + pp = p([i j]); % pointers to join + bl = (R([i j])*[1 -1;-1 1] + D(j,i))/2; % branch lengths + bl = max(bl,0); + bd(pp) = bl; % save branch lengths + Z(bc,:) = pp; % save pointers + h = [1:i-1 i+1:j-1 j+1:n]; + switch method % distances to new node + case 'a' % Saitou & Nei method + d = (sum(D(h,[i j]),2))/2; + D = [[D(h,h) d];[d' 0]]; % update distance matrix + case 'e' % Studier & Keppler method + d = (sum(D(h,[i j]),2)-D(j,i))/2; + if any(d<0) + d = max(d,0); + end + D = [[D(h,h) d];[d' 0]]; % update distance matrix + case 'f' % Gascuel method + if V(i,j) + lambda = max(0,min(1,(1+sum(V(h,i)-V(h,j))/(n-2)/V(i,j))/2)); + else + lambda = 1/2; + end + d = D(h,[i j])*[lambda;1-lambda] - bl*[lambda;1-lambda]; + if any(d<0) + d = max(d,0); + end + D = [[D(h,h) d];[d' 0]]; % update distance matrix + v = V(h,[i j])*[lambda;1-lambda] - V(i,j)*lambda*(1-lambda); + if any(v<0) + v = max(v,0); + end + V = [[V(h,h) v];[v' 0]]; % update variance matrix + end + p = [p(h),N+bc]; % update pointers + bc = bc+1; % update branch counter +end +Z(bc,:) = p; % pointers of last branch +bd(p) = D(2)/2; % lengths of last branch + +% convert data to a phylogenetic tree object +if namesSupplied + t = phyTree(Z,bd,names); +else + t = phyTree(Z,bd); +end +if rerootTree + t = reRoot(t); % reroot with mid-point method +end diff --git a/matlab/graph/seqlinkage.m b/matlab/graph/seqlinkage.m new file mode 100644 index 0000000..6d7ddf6 --- /dev/null +++ b/matlab/graph/seqlinkage.m @@ -0,0 +1,123 @@ +function t = seqlinkage(d, method, names) +%SEQLINKAGE constructs a phylogenetic tree from pairwise distances. +% +% TREE = SEQLINKAGE(DIST) computes a phylogenetic tree object from the +% pairwise distances DIST between the species or products. The input DIST +% is a matrix (or vector) such as is generated by SEQPDIST. +% +% TREE = SEQLINKAGE(DIST,METHOD) creates a phylogenetic tree object using +% a specified distance method. The available methods are: +% +% 'single' --- nearest distance (single linkage method) +% 'complete' --- furthest distance (complete linkage method) +% 'average' (default) --- unweighted average distance (UPGMA) (also +% known as group average) +% 'weighted' --- weighted average distance (WPGMA) +% 'centroid' --- unweighted center of mass distance (UPGMC) +% 'median' --- weighted center of mass distance (WPGMC) +% +% TREE = SEQLINKAGE(DIST,METHOD,NAMES) passes a list of names to label the +% leaf nodes (e.g. species or products) in the phylogenetic tree object. +% NAMES can be a vector of structures with the fields 'Header' or 'Name' +% or a cell array of strings. In both cases the number of elements +% provided must comply with the number of samples used to generate the +% pairwise distances in DIST. +% +% Example: +% +% % Load a multiple alignment of amino acids: +% seqs = fastaread('pf00002.fa'); +% +% % Measure the 'Jukes-Cantor' pairwise distances: +% dist = seqpdist(seqs,'method','jukes-cantor','indels','pair'); +% +% % Build the phylogenetic tree with the single linkage method and pass +% % the names of the sequences: +% tree = seqlinkage(dist,'single',seqs) +% view(tree) +% +% See also PHYTREE, PHYTREE/PLOT, PHYTREE/VIEW, PHYTREEWRITE, +% SEQNEIGHJOIN, SEQPDIST. + +% Copyright 2003-2005 The MathWorks, Inc. +% $Revision: 1.1.6.6 $ $Date: 2005/06/09 21:57:42 $ + +% check the input distances +[m, n] = size(d); + +if m==n && m>1 % transform to vector form + D = zeros(m*(m-1)/2,1); + k=1; + for j = 1:m-1 + for i = j+1:m + D(k) = d(i,j); + k = k+1; + end + end +else + D=d(:); +end + +D=D'; % linkage uses row form +n = numel(D); +m = (1+sqrt(1+8*n))/2; % number of leaves +if m ~= fix(m) + error('Bioinfo:BadSize',... + 'Size of DIST not compatible with the output of the SEQPDIST function.'); +end + +% Selects appropiate method +if nargin == 1 % set default switch to be 's' + method = 'av'; +else + okmethods = {'single','nearest',... + 'complete','farthest',... + 'average','upgma',... + 'weighted','wpgma',... + 'centroid','upgmc',... + 'median','wpgmc'}; + methodkeys = {'si','si','co','co','av','av','we','we','ce','ce',... + 'me','me','wa','wa'}; + s = strmatch(lower(method), okmethods); %#ok + if isempty(s) + error('Bioinfo:UnknownMethod','Unknown method name: %s.',method); + elseif length(s)>1 + error('Bioinfo:IncorrectMethod','Ambiguous method name: %s.',method); + else + method = methodkeys{s}; + end +end + +% detects the names +if nargin == 3 % names were supplied, check validity + if iscell(names) || isfield(names,'Header') || isfield(names,'Name') || isfield(names,'LocusName') + if isfield(names,'Header') % if struct put them in a cell + names = {names(:).Header}; + elseif isfield(names,'Name') % if struct put them in a cell + names = {names(:).Name}; + elseif isfield(names,'LocusName') % if struct put them in a cell + names = {names(:).LocusName}; + end + names = names(:); + namesSupplied = true; + if numel(names)~=m + error('Bioinfo:IncorrectSize',... + 'NAMES must have the same size as number of leaves in the tree') + end + else + error('Bioinfo:IncorrectInputType',... + 'NAMES must be a cell with char arrays or a vector of structures.') + end +else + namesSupplied = false; +end + +% call the stats linkage program +T = linkage(D,method); + +% convert data to a phylogenetic tree object +if namesSupplied + t = phyTree(T,names); +else + t = phyTree(T); +end diff --git a/matlab/graph/showColors.m b/matlab/graph/showColors.m new file mode 100644 index 0000000..1499a7b --- /dev/null +++ b/matlab/graph/showColors.m @@ -0,0 +1,16 @@ +function showColors(n_col) + +if n_col>36 + error('Maximum number of colors 36'); +end + +figure('NumberTitle','off','Name','Colors'); + +set(gca, 'Xlim', [-.5 , n_col+.5], 'YLim', [0,1], ... + 'XTick', [], 'XTickLabel', [], 'YTick', [], 'YTickLabel', []); + +varit = giveColors(n_col); +for k=1:length(varit) + h0=patch([k-1 k k k-1], [0 0 1 1], varit(k,:)); + set(h0,'EdgeColor','none'); +end \ No newline at end of file diff --git a/matlab/graph/showColors2.m b/matlab/graph/showColors2.m new file mode 100644 index 0000000..1499a7b --- /dev/null +++ b/matlab/graph/showColors2.m @@ -0,0 +1,16 @@ +function showColors(n_col) + +if n_col>36 + error('Maximum number of colors 36'); +end + +figure('NumberTitle','off','Name','Colors'); + +set(gca, 'Xlim', [-.5 , n_col+.5], 'YLim', [0,1], ... + 'XTick', [], 'XTickLabel', [], 'YTick', [], 'YTickLabel', []); + +varit = giveColors(n_col); +for k=1:length(varit) + h0=patch([k-1 k k k-1], [0 0 1 1], varit(k,:)); + set(h0,'EdgeColor','none'); +end \ No newline at end of file diff --git a/matlab/graph/statgetargs.m b/matlab/graph/statgetargs.m new file mode 100644 index 0000000..db021cc --- /dev/null +++ b/matlab/graph/statgetargs.m @@ -0,0 +1,83 @@ +function [eid,emsg,varargout]=statgetargs(pnames,dflts,varargin) +%STATGETARGS Process parameter name/value pairs for statistics functions +% [EID,EMSG,A,B,...]=STATGETARGS(PNAMES,DFLTS,'NAME1',VAL1,'NAME2',VAL2,...) +% accepts a cell array PNAMES of valid parameter names, a cell array +% DFLTS of default values for the parameters named in PNAMES, and +% additional parameter name/value pairs. Returns parameter values A,B,... +% in the same order as the names in PNAMES. Outputs corresponding to +% entries in PNAMES that are not specified in the name/value pairs are +% set to the corresponding value from DFLTS. If nargout is equal to +% length(PNAMES)+1, then unrecognized name/value pairs are an error. If +% nargout is equal to length(PNAMES)+2, then all unrecognized name/value +% pairs are returned in a single cell array following any other outputs. +% +% EID and EMSG are empty if the arguments are valid. If an error occurs, +% EMSG is the text of an error message and EID is the final component +% of an error message id. STATGETARGS does not actually throw any errors, +% but rather returns EID and EMSG so that the caller may throw the error. +% Outputs will be partially processed after an error occurs. +% +% This utility is used by some Statistics Toolbox functions to process +% name/value pair arguments. +% +% Example: +% pnames = {'color' 'linestyle', 'linewidth'} +% dflts = { 'r' '_' '1'} +% varargin = {{'linew' 2 'nonesuch' [1 2 3] 'linestyle' ':'} +% [eid,emsg,c,ls,lw] = statgetargs(pnames,dflts,varargin{:}) % error +% [eid,emsg,c,ls,lw,ur] = statgetargs(pnames,dflts,varargin{:}) % ok + +% Copyright 1993-2004 The MathWorks, Inc. +% $Revision: 1.4.2.1 $ $Date: 2003/11/01 04:28:41 $ + +% We always create (nparams+2) outputs: +% one each for emsg and eid +% nparams varargs for values corresponding to names in pnames +% If they ask for one more (nargout == nparams+3), it's for unrecognized +% names/values + +% Initialize some variables +emsg = ''; +eid = ''; +nparams = length(pnames); +varargout = dflts; +unrecog = {}; +nargs = length(varargin); + +% Must have name/value pairs +if mod(nargs,2)~=0 + eid = 'WrongNumberArgs'; + emsg = 'Wrong number of arguments.'; +else + % Process name/value pairs + for j=1:2:nargs + pname = varargin{j}; + if ~ischar(pname) + eid = 'BadParamName'; + emsg = 'Parameter name must be text.'; + break; + end + i = strmatch(lower(pname),pnames); + if isempty(i) + % if they've asked to get back unrecognized names/values, add this + % one to the list + if nargout > nparams+2 + unrecog((end+1):(end+2)) = {varargin{j} varargin{j+1}}; + + % otherwise, it's an error + else + eid = 'BadParamName'; + emsg = sprintf('Invalid parameter name: %s.',pname); + break; + end + elseif length(i)>1 + eid = 'BadParamName'; + emsg = sprintf('Ambiguous parameter name: %s.',pname); + break; + else + varargout{i} = varargin{j+1}; + end + end +end + +varargout{nparams+1} = unrecog; diff --git a/matlab/graph/viewDendrogram.m b/matlab/graph/viewDendrogram.m new file mode 100644 index 0000000..df37e65 --- /dev/null +++ b/matlab/graph/viewDendrogram.m @@ -0,0 +1,56 @@ +function viewDendrogram(action) +% VIEWDENDROGRAM function to called by VIEWNJ to draw dendrogram +% trees + +handle = gcf; +h0 = findobj(handle,'Tag','attr_menu'); +g = get(h0,'Userdata'); +g.visualtype = action; +set(h0,'Userdata',g); % store in the attribute menu +cla +axis off +% clf +% close(g.handle); +% h0 = figure('NumberTitle','off'); +% g.handle = h0; +% set(h0,'menubar','none','toolbar','figure'); +% set(h0,'Tag','nj_plot'); +% set(h0,'Name',['Neighbor-Joining tree - ' g.filename]);; +if strcmp(g.type,'NJ') + t = seqNeighJoin(g.D, 'equivar', g.varnames); +else + t = seqlinkage(g.D, 'average', g.varnames); +end + +switch action + case 'square' + % plotNJ(g.D, 1, char(g.varnames)); + Plot(t,'type','square'); + case 'angular' + % plotNJ(g.D, 0, char(g.varnames)); + Plot(t,'type','angular'); +end + +% h1 = uimenu('Parent',handle, ... +% 'Label','Attributes', ... +% 'Tag','attr_menu'); +% h2 = uimenu('Parent',h1, ... +% 'Label','Rename clusters', ... +% 'callback', 'plotflow rename', ... +% 'Tag','clustername_menu'); +% h2 = uimenu('Parent',h1, ... +% 'Label','Visual type', ... +% 'Tag','visualtype_menu'); +% h3 = uimenu('Parent',h2, ... +% 'Label','Square', ... +% 'callback', 'viewDendrogram(''square'')', ... +% 'Tag','viewsquare_menu'); +% h3 = uimenu('Parent',h2, ... +% 'Label','Angular', ... +% 'callback', 'viewDendrogram(''angular'')', ... +% 'Tag','viewangular_menu'); +% h3 = uimenu('Parent',h2, ... +% 'Label','Radial', ... +% 'callback', 'viewUnrooted', ... +% 'Tag','viewradial_menu'); +%set(h1,'Userdata',g); % store in the attribute menu \ No newline at end of file diff --git a/matlab/graph/viewMixPartition.m b/matlab/graph/viewMixPartition.m new file mode 100644 index 0000000..150e19b --- /dev/null +++ b/matlab/graph/viewMixPartition.m @@ -0,0 +1,88 @@ +function viewMixPartition(partition, popnames) + +notEmptyPops = length(unique(partition)); +if notEmptyPops>30 + disp(['Number of populations: ' num2str(notEmptyPops)]); + disp(' '); + disp('Figure can be drawn only if the number of populations'); + disp('is less or equal to 30.'); + disp(' '); + return; +end + +nind = length(partition); +%npops = max(partition); +npops = notEmptyPops; + +varit = giveColors(npops); +korkeinviiva = 1.05; +pieninarvo = -korkeinviiva; + +h0 = figure; +set(h0, 'NumberTitle', 'off'); %image_figure; %Muutettu +tiedot.popnames = popnames; +tiedot.info = partition; +set(h0,'UserData',tiedot); + +set(gca, 'Xlim', [-.5 ,nind+.5], 'YLim', [pieninarvo ,korkeinviiva], ... + 'XTick', [], 'XTickLabel', [], 'YTick', [], 'YTickLabel', []); + +eiTyhjatPopulaatiot = unique(partition); + +for i=1:nind + % Suhteellisten osuuksien laskeminen + pop = partition(i); + pop = find(eiTyhjatPopulaatiot==pop); + + % Pylvään piirtäminen + h0 =patch([i-1, i, i, i-1], [0, 0, 1, 1], varit(pop,:)); + set(h0,'EdgeColor','none'); % Midevaa varten kommentoitava! + +end + + + +if ~isempty(popnames) + npops = size(popnames,1); + for i=1:npops + firstInd = popnames{i,2}; + if size(popnames,1) ~=nind + line([firstInd-1, firstInd-1], [0,1], 'Color', 'k'); %Populaatioiden rajat + end + if i30 + disp(' '); + disp('Figure can be drawn only if the number of populations'); + disp('is less or equal to 30.'); + disp(' '); + return; +end + + +varit = giveColors(npops); +korkeinviiva = 1.05; +pieninarvo = -korkeinviiva; + + +h0 = figure; +set(h0, 'NumberTitle', 'off'); %image_figure; %Muutettu +tiedot.popnames = popnames; +tiedot.info = osuudet; +set(h0,'UserData',tiedot); + +set(gca, 'Xlim', [-.5 ,nind+.5], 'YLim', [pieninarvo ,korkeinviiva], ... + 'XTick', [], 'XTickLabel', [], 'YTick', [], 'YTickLabel', []); + +for i=1:nind + + if any(osuudet(i,:)>0) + cumOsuudet = cumsum(osuudet(i,:)); + + % Pylvään piirtäminen + for j=1:npops + if j==1 + if cumOsuudet(1)>0 + h0 =patch([i-1, i, i, i-1], [0, 0, cumOsuudet(1), cumOsuudet(1)], varit(j,:)); + set(h0,'EdgeColor','none'); % Midevaa varten kommentoitava! + end + else + if (cumOsuudet(j)>cumOsuudet(j-1)) + h0 = patch([i-1, i, i, i-1], [cumOsuudet(j-1), cumOsuudet(j-1), ... + cumOsuudet(j), cumOsuudet(j)], varit(j,:)); + set(h0,'EdgeColor','none'); % Midevaa varten kommentoitava! + end + end + end + end +end + + + +if ~isempty(popnames) + npops = size(popnames,1); + for i=1:npops + firstInd = popnames{i,2}; + line([firstInd-1, firstInd-1], [0,1], 'Color', 'k'); %Populaatioiden rajat + + if i30 + disp(' '); + disp('Figure can be drawn only if the number of populations'); + disp('is less or equal to 30.'); + disp(' '); + return; +end + +varit = giveColors(npops); +korkeinviiva = 1.05; +pieninarvo = -korkeinviiva; + +h0 = figure; +set(h0, 'NumberTitle', 'off'); %image_figure; %Muutettu +set(h0,'Tag','admixture_result'); +set(h0,'Name',['Admixture Result - ' filename]); +tiedot.popnames = popnames; +tiedot.info = osuudet; +set(h0,'UserData',tiedot); + +set(gca, 'Xlim', [-.5 ,nind+.5], 'YLim', [pieninarvo ,korkeinviiva], ... + 'XTick', [], 'XTickLabel', [], 'YTick', [], 'YTickLabel', []); + +for i=1:nind + + if any(osuudet(i,:)>0) + cumOsuudet = cumsum(osuudet(i,:)); + + % Pylvään piirtäminen + for j=1:npops + if j==1 + if cumOsuudet(1)>0 + h0 =patch([i-1, i, i, i-1], [0, 0, cumOsuudet(1), cumOsuudet(1)], varit(j,:)); + set(h0,'EdgeColor','none'); % Midevaa varten kommentoitava! + end + else + if (cumOsuudet(j)>cumOsuudet(j-1)) + h0 = patch([i-1, i, i, i-1], [cumOsuudet(j-1), cumOsuudet(j-1), ... + cumOsuudet(j), cumOsuudet(j)], varit(j,:)); + set(h0,'EdgeColor','none'); % Midevaa varten kommentoitava! + end + end + end + end +end + + + +if ~isempty(popnames) + npops = size(popnames,1); + for i=1:npops + firstInd = popnames{i,2}; + if size(popnames,1)~=nind + line([firstInd-1, firstInd-1], [0,1], 'Color', 'k'); %Populaatioiden rajat + end + + % The determination of x_paikka is changed, since popnames could be + % non-sequential. - Jing + if i0); +% end +% % 'adjacency matrix' +% adj_mat = cell(ncluster,ncluster); +% k = 2; +% for i = 1:ncluster +% for j = k:ncluster +% adj_mat{i,j} = intersect(clusters{i},clusters{j}); +% end +% k = k + 1; +% end +% +% adj_mat = zeros(ncluster,ncluster); +% k = 2; +% for i = 1:ncluster +% for j = k:ncluster +% adj_mat(i,j) = length(intersect(clusters{i},clusters{j})); +% end +% k = k + 1; +% end +% adj_mat = adj_mat+adj_mat'; +% admix_count = sum(adj_mat,1); +% order = zeros(1,ncluster); +% middle_cluster = find(recomb_count==max(admix_count)); +% reference_cluster = middle_cluster; +% middle_point = floor(ncluster/2); +% order(middle_point) = middle_cluster(1); +% for i = middle_point+1:ncluster +% order(i) = find(adj_mat(:,reference_cluster)==max(adj_mat(:,reference_cluster))); +% adj_mat(referecen_cluster,order(i)) = 0; +% adj_mat(order(i),reference_cluster) = 0; +% reference_cluster = order(i); +% end +% [ordered_cluster,ix] = sort(size_cluster,'descend'); + +% disp(['Number of populations: ' num2str(npops)]); +% if npops>30 +% disp(' '); +% disp('Figure can be drawn only if the number of populations'); +% disp('is less or equal to 30.'); +% disp(' '); +% return; +% end + + +varit = giveColors(npops); +korkeinviiva = 1.05; +pieninarvo = -korkeinviiva; + +h0 = figure; +set(h0, 'NumberTitle', 'off'); %image_figure; %Muutettu +set(h0,'Tag','admixture_result'); +set(h0,'Name',['Admixture Result After Permutation - ' filename]); + +if size(popnames,1) > 150 % display only groups + for i=1:nind + popnames{i,1} = cell({'|'}); % CELL in the CELL + end +end + +for i = inliers + pool = find(partition==i); + if isempty(pool) + continue + end + chosen_ix = 1 + floor(length(pool)/2); +% if ~find(pool==chosen_one) % gap, find the other way +% chosen_one = pool(end) - floor(length(pool)/2); +% if ~find(pool==chosen_one) % then the algorithm really sucks +% chosen_one = pool(1); +% end +% end + chosen_one = pool(chosen_ix); + popnames{chosen_one,1} = {[popnames{chosen_one,1}{1} sprintf('Cluster%d',i)]}; +end + + +tiedot.popnames = popnames; +tiedot.info = data; +set(h0,'UserData',tiedot); + +set(gca,'Xlim', [-.5 ,nind+.5], 'YLim', [pieninarvo ,korkeinviiva], ... + 'XTick', [], 'XTickLabel', [], 'YTick', [], 'YTickLabel', []); + +% Display the color bars +for i=1:nind + + if any(data(i,:)>0) + cumOsuudet = cumsum(data(i,:)); + + % Pylvään piirtäminen + for j=1:admixnpops + if j==1 + if cumOsuudet(1)>0 + h0 =patch([i-1, i, i, i-1], [0, 0, cumOsuudet(1), cumOsuudet(1)], varit(inliers(j),:)); + set(h0,'EdgeColor','none'); % Midevaa varten kommentoitava! + end + else + if (cumOsuudet(j)>cumOsuudet(j-1)) + h0 = patch([i-1, i, i, i-1], [cumOsuudet(j-1), cumOsuudet(j-1), ... + cumOsuudet(j), cumOsuudet(j)], varit(inliers(j),:)); + set(h0,'EdgeColor','none'); % Midevaa varten kommentoitava! + end + end + end + end +end + + +% Display the texts +if ~isempty(popnames) + npops = size(popnames,1); % NB! npops now stands for the number of individuals. + for i=1:npops + firstInd = popnames{i,2}; + if size(popnames,1)~=nind + line([firstInd-1, firstInd-1], [0,1], 'Color', 'k'); %Populaatioiden rajat + end + + % The determination of x_paikka is changed, since popnames could be + % non-sequential. - Jing + if i36 + disp(' '); + disp('Figure can be drawn only if the number of populations'); + disp('is less or equal to 36.'); + disp(' '); + return; +end + +varit = giveColors(npops); +korkeinviiva = 1.05; +pieninarvo = -korkeinviiva; + +h0 = figure; +set(h0, 'NumberTitle', 'off'); %image_figure; %Muutettu +set(h0,'Tag','admixture_result'); +set(h0,'Name',['Admixture Result - ' filename]); +tiedot.popnames = popnames; +tiedot.info = osuudet; +set(h0,'UserData',tiedot); + +set(gca, 'Xlim', [-.5 ,nind+.5], 'YLim', [pieninarvo ,korkeinviiva], ... + 'XTick', [], 'XTickLabel', [], 'YTick', [], 'YTickLabel', []); + +for i=1:nind + + if any(osuudet(i,:)>0) + cumOsuudet = cumsum(osuudet(i,:)); + + % Pylvään piirtäminen + for j = 1:admixnpops + if j == 1 + if cumOsuudet(1)>0 + h0 =patch([i-1, i, i, i-1], [0, 0, cumOsuudet(1), cumOsuudet(1)], varit(inliers(j),:)); + set(h0,'EdgeColor','none'); % Midevaa varten kommentoitava! + end + else + if (cumOsuudet(j)>cumOsuudet(j-1)) + h0 = patch([i-1, i, i, i-1], [cumOsuudet(j-1), cumOsuudet(j-1), ... + cumOsuudet(j), cumOsuudet(j)], varit(inliers(j),:)); + set(h0,'EdgeColor','none'); % Midevaa varten kommentoitava! + end + end + end + end +end + + + +if ~isempty(popnames) + npops = size(popnames,1); % NB! npops now stands for the number of individuals. + for i = 1:npops + firstInd = popnames{i,2}; + if size(popnames,1)~=nind + line([firstInd-1, firstInd-1], [0,1], 'Color', 'k'); %Populaatioiden rajat + end + + % The determination of x_paikka is changed, since popnames could be + % non-sequential. - Jing + if i30 + disp(' '); + disp('Figure can be drawn only if the number of populations'); + disp('is less or equal to 30.'); + disp(' '); + return; +end + +nind = length(partition); +totalNumRows = 0; +for ind = 1:nind + totalNumRows = totalNumRows+rows(ind,2)-rows(ind,1)+1; + %totalNumRows = totalNumRows+length(rows{ind}); +end +%npops = max(partition); +npops = notEmptyPops; + +varit = giveColors(npops); +korkeinviiva = 1.05; +pieninarvo = -korkeinviiva; + +h0 = figure('NumberTitle', 'off'); %image_figure; %Muutettu +tiedot.rows = rows; +tiedot.info = partition; +tiedot.popnames = popnames; +set(h0,'UserData',tiedot); + +set(gca, 'Xlim', [-.5 ,totalNumRows+.5], 'YLim', [pieninarvo ,korkeinviiva], ... + 'XTick', [], 'XTickLabel', [], 'YTick', [], 'YTickLabel', []); + +eiTyhjatPopulaatiot = unique(partition); + +for i=1:nind + pop = partition(i); + pop = find(eiTyhjatPopulaatiot==pop); + + % Pylväiden piirtäminen + for rivi = rows(i,1):rows(i,2) + h0 =patch([rivi-1, rivi, rivi, rivi-1], [0, 0, 1, 1], varit(pop,:)); + set(h0,'EdgeColor','none'); % Midevaa varten kommentoitava! + end +end + + +npops = size(rows,1); +for i=1:npops + firstRow = rows(i,1); + line([firstRow-1, firstRow-1], [0,1], 'Color', 'k'); %Populaatioiden rajat +end + +if ~isempty(popnames) + for i=1:npops + %rivi = rows(i}; + %x_paikka = (rivi(1)-1+rivi(end))/2; + x_paikka = (rows(i,1) - 1 + rows(i,2)) / 2; + + korkeuskerroin = pieninarvo / -0.2; + suhdekerroin = npops/6; + for letter_num = 1:length(popnames{i,1}{1}) + letter= popnames{i,1}{1}(letter_num);%alter .004| + text(x_paikka+korjaus(letter)*suhdekerroin, ... + 0.0005*korkeuskerroin-0.02*letter_num*korkeuskerroin, ... + letter, 'Interpreter','none'); + end + end +end + +line([totalNumRows,totalNumRows],[0,1],'Color','k'); + +%------------------------------------------------------------------------------------- + +function extra = korjaus(letter) + if any(letter == 'ijlI') + extra = 0.022; + elseif any(letter == 'r') + extra = 0.016; + elseif any(letter == 'k') + extra = 0.009; + elseif any(letter == 'f') + extra = 0.013; + elseif any(letter == 't') + extra = 0.014; + elseif any(letter == 'w') + extra = -0.003; + else + extra = 0; +end; diff --git a/matlab/graph/viewUnrooted.m b/matlab/graph/viewUnrooted.m new file mode 100644 index 0000000..85a5a12 --- /dev/null +++ b/matlab/graph/viewUnrooted.m @@ -0,0 +1,61 @@ +function viewUnrooted(action) +% VIEWUNROOTED function to called by VIEWNJ to draw unrooted +% neighbor-joining trees + +handle = gcf; +h0 = findobj(handle,'Tag','attr_menu'); +g = get(h0,'Userdata'); +g.visualtype = action; +set(h0,'Userdata',g); % store in the attribute menu +cla reset +axis off +% close(g.handle); +% h0 = figure('NumberTitle','off'); +% g.handle = h0; +% set(h0,'menubar','none','toolbar','figure'); +% set(h0,'Tag','nj_plot'); +% set(h0,'Name',['Neighbor-Joining tree - ' g.filename]);; + +if strcmp(g.type,'NJ') + t = seqNeighJoin(g.D, 'equivar', g.varnames); +else + t = seqlinkage(g.D, 'average', g.varnames); +end + +switch action + case 'radial' + % plotNJ(g.D, 1, char(g.varnames)); + Plot(t,'type','radial'); + case 'phylogram' + % plotNJ(g.D, 0, char(g.varnames)); + plotPhytree(t); +end + +% h1 = uimenu('Parent',handle, ... +% 'Label','Attributes', ... +% 'Tag','attr_menu'); +% h2 = uimenu('Parent',h1, ... +% 'Label','Rename clusters', ... +% 'callback', 'plotflow rename', ... +% 'Tag','clustername_menu'); +% h2 = uimenu('Parent',h1, ... +% 'Label','Visual type', ... +% 'Tag','visualtype_menu'); +% h3 = uimenu('Parent',h2, ... +% 'Label','Square', ... +% 'callback', 'viewDendrogram(''square'')', ... +% 'Tag','viewsquare_menu'); +% h3 = uimenu('Parent',h2, ... +% 'Label','Angular', ... +% 'callback', 'viewDendrogram(''angular'')', ... +% 'Tag','viewangular_menu'); +% h3 = uimenu('Parent',h2, ... +% 'Label','Radial', ... +% 'callback', 'viewUnrooted(''radial'')', ... +% 'Tag','viewradial_menu'); +% h3 = uimenu('Parent',h2, ... +% 'Label','Phylogram', ... +% 'callback', 'viewUnrooted(''phylogram'')', ... +% 'Tag','viewphylogram_menu'); +% set(h1,'Userdata',g); % store in the attribute menu + diff --git a/matlab/graph/view_admixture.m b/matlab/graph/view_admixture.m new file mode 100644 index 0000000..41e51e3 --- /dev/null +++ b/matlab/graph/view_admixture.m @@ -0,0 +1,254 @@ +function view_admixture(proportionsIt,npops,admixnpops,... + popnames,partition,pvalue,source) + +% Find the outlier individuals +% removed_inds = find(~any(proportionsIt,2)); +removed_inds = logical(~any(proportionsIt,2)); +outliers = unique(partition(removed_inds)); +if ~isempty(outliers) + fprintf('Outlier clusters: '); + for i = 1:length(outliers) + fprintf(['%s' blanks(2)], num2str(outliers(i))); + end + fprintf('\n'); +end + +% total_clusters = [1:max(partition)]; +total_clusters = [1:npops]; +inliers = setdiff(total_clusters, outliers); +if length(inliers)~=admixnpops + disp('*** ERROR: in view linkage admixture.'); + return +end + +if isempty(popnames) || size(popnames,1)==size(partition,1) + + groupnames = cell(1,admixnpops); + for i=1:admixnpops + groupnames{i} = sprintf('Cluster %d',inliers(i)); + end + waitALittle; + [s,v] = listdlg('PromptString',[sprintf('%d ',admixnpops) 'Clusters available:'],... + 'SelectionMode','multiple',... + 'Name','Select Clusters',... + 'ListString',groupnames); + if isempty(s) || ~v + disp('*** WARNING: Viewing admixture cancelled.'); + return + else + fprintf('Viewing admixture in cluster(s): %s\n',num2str(inliers(s))); + end + + waitALittle; + answer = inputdlg( ['Input the p value for the strain admixture. Strains with '... + 'nonsignificant admixture will be displayed as a single color bar'],... + 'Input the upper bound p value',1,{'0.05'}); + if isempty(answer) % cancel has been pressed + return + else + maxp = str2num(answer{1}); + fprintf('P value: %4.2f\n', maxp); + + % nonsignificant strains are forced to be single color bar. + nonsignificant_ix = find( (pvalue>maxp & pvalue <=1) ); + if ~isempty(nonsignificant_ix) + for i = nonsignificant_ix' + % incluster = find(proportionsIt(i,:)==max(proportionsIt(i,:))); + incluster = logical(proportionsIt(i,:)==max(proportionsIt(i,:)) & proportionsIt(i,:)~=0); + proportionsIt(i,:) = zeros(1, admixnpops); + proportionsIt(i,incluster) = 1; + end + end + + ix = find(ismember(partition(:),inliers(s))); + % if isempty(ix) + % disp('*** ERROR: No significant data. Try again with a higher p value.'); + % return + % end + proportionsIt_raw = proportionsIt; + partition_raw = partition; + proportionsIt = proportionsIt(ix,:); + if isempty(popnames) || size(popnames,1)==size(partition,1) + popnames = popnames(ix,:); + end + partition = partition(ix); + pvalue = pvalue(ix); + end +else + waitALittle; + answer = inputdlg( ['Input the p value for the strain admixture. Strains with '... + 'nonsignificant admixture will be displayed as a single color bar'],... + 'Input the upper bound p value',1,{'0.05'}); + if isempty(answer) % cancel has been pressed + return + else + maxp = str2num(answer{1}); + fprintf('P value: %4.2f\n', maxp); + + % nonsignificant strains are forced to be single color bar. + nonsignificant_ix = find( (pvalue>maxp & pvalue <=1) ); + if ~isempty(nonsignificant_ix) + for i = nonsignificant_ix' + % incluster = find(proportionsIt(i,:)==max(proportionsIt(i,:))); + incluster = logical(proportionsIt(i,:)==max(proportionsIt(i,:)) & proportionsIt(i,:)~=0); + proportionsIt(i,:) = zeros(1, admixnpops); + proportionsIt(i,incluster) = 1; + end + end + + % ix = find(ismember(partition(:),inliers(s))); + % if isempty(ix) + % disp('*** ERROR: No significant data. Try again with a higher p value.'); + % return + % end + % proportionsIt_raw = proportionsIt; + % partition_raw = partition; + % proportionsIt = proportionsIt(ix,:); +% if isempty(popnames) || size(popnames,1)==size(partition,1) +% popnames = popnames(ix,:); +% end +% partition = partition(ix); +% pvalue = pvalue(ix); + end + +end + + + +talle = questdlg(['Do you want names to be visible in the admixture ' ... + 'result graphics?'], 'Names visible?', 'Yes', 'No', 'Yes'); + +if isequal(talle,'No') + if isempty(popnames) || size(popnames,1)==size(partition,1) + viewPartition4(proportionsIt, [], npops, ... + admixnpops, inliers, partition, source); + else + viewPartition2(proportionsIt, [], admixnpops, partition, source); + end +else + if isempty(popnames) || size(popnames,1)==size(partition,1) + viewPartition4(proportionsIt, popnames, npops, ... + admixnpops, inliers, partition, source); + else + viewPartition2(proportionsIt, popnames, admixnpops, partition, source); + end +end + +if isempty(popnames) || size(popnames,1)==size(partition,1) + talle = questdlg(['Permutate the labelling orders to get a more structured graphics? '], 'Reorder the sample?', 'Yes', 'No', 'Yes'); + + if isequal(talle,'Yes') + h0 = findobj('Tag','admixture_result'); + if ~isempty(h0) + close(h0); + % drawnow; + waitALittle; + [partition_ordered, proportionsIt,popnames,pvalue] = ... + viewPartition3(proportionsIt, popnames, npops, admixnpops, ... + inliers, partition, pvalue, source); + % c.mixtureType = 'linkage_mix'; + % c.npops = npops; + talle = questdlg('Save the ordered admixture result in text?', ... + 'Save results?','Yes','No','Yes'); + if isequal(talle,'Yes') + waitALittle; + [filename, pathname] = uiputfile('*.txt','Save results as'); + if isempty(filename) && isempty(pathname) + % Cancel was pressed + return + else + fprintf(1,'Saving the admixture result...\n'); + dest = [pathname filename]; + clusters = inliers(s); + tulostaAdmixtureTiedot(popnames, proportionsIt, pvalue, source, dest,... + maxp,clusters); + end + end + + else + disp('***ERROR: In plotting the admixture results.'); + return; + end + drawnow; + end +end + + +%-------------------------------------------------------------------------- + + +function tulostaAdmixtureTiedot(popnames, proportions, uskottavuus, ... + source, dest, maxp,clusters) + +fid = fopen(dest,'w'); +ninds = length(uskottavuus); +npops = size(proportions,2); + +if fid ~= -1 + fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['--------------------------------------------']); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['RESULTS OF ADMIXTURE ANALYSIS BASED']); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['ON MIXTURE CLUSTERING OF INDIVIDUALS']); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['Source file: ' source]); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['Total number of clusters: ' npops]); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['Viewing admixture in cluster(s): ',num2str(clusters)]);fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['Number of individuals in selection: ' num2str(ninds)]); fprintf(fid, '\n'); + fprintf(fid,'P value: %4.2f \n', maxp); fprintf(fid, '\n'); + fprintf(fid, '\n'); +end + +namelength = zeros(1,ninds); +for ind = 1:ninds + namelength(ind) = length(popnames{ind,1}{1}); +end +maxlength = max(namelength); +ekaRivi = ['Index' blanks(2) blanks(maxlength-4) 'Name' blanks(2)]; +for pop = 1:npops + ekaRivi = [ekaRivi blanks(3-floor(log10(pop))) num2str(pop) blanks(2)]; +end +ekaRivi = [ekaRivi blanks(2) 'p']; % Added on 29.08.06 +disp(ekaRivi); +fprintf(fid, '%s \n',ekaRivi); fprintf(fid,'\n'); +for ind = 1:ninds + index = num2str(popnames{ind,2}); + namelength=['%' num2str(2+maxlength) 's']; + rivi = [sprintf('%5s',index) sprintf(namelength,popnames{ind,1}{1})... + blanks(2)]; + if any(proportions(ind,:)>0) + for pop = 1:npops-1 + rivi = [rivi proportion2str(proportions(ind,pop)) blanks(2)]; + end + rivi = [rivi proportion2str(proportions(ind,npops))]; + rivi = [rivi blanks(2) ownNum2Str(uskottavuus(ind))]; + end + disp(rivi); + if fid ~= -1 + fprintf(fid,'%s \n',rivi); fprintf(fid,'\n'); + end +end +fclose(fid); +fprintf(1,'finished.\n'); + +%-------------------------------------------------------------------------- +% function dispLine +% disp('---------------------------------------------------'); + +%-------------------------------------------------------------------------- +function str = proportion2str(prob) +%prob belongs to [0.00, 0.01, ... ,1]. +%str is a 4-mark presentation of proportion. + +if abs(prob)<1e-3 + str = '0.00'; +elseif abs(prob-1) < 1e-3; + str = '1.00'; +else + prob = round(100*prob); + if prob<10 + str = ['0.0' num2str(prob)]; + else + str = ['0.' num2str(prob)]; + end; +end; + + diff --git a/matlab/graph/view_energy.m b/matlab/graph/view_energy.m new file mode 100644 index 0000000..a88dcc2 --- /dev/null +++ b/matlab/graph/view_energy.m @@ -0,0 +1,488 @@ +function view_energy() + +%waitALittle; +% [filename1, pathname1] = uigetfile('*.mat', 'Load mixture results.'); +% if (sum(filename1)==0) || (sum(pathname1)==0) +% return; +% end +% struct_array = load([pathname1 filename1]); + +h0 = findobj('Tag','load_menu'); +c = get(h0,'UserData'); +h0 = findobj('Tag','filename1_text'); +filename1 = get(h0,'String'); +disp('---------------------------------------------------'); +disp('Viewing the energy lanscape (Kolmogorov–Smirnov test).'); +disp(['Load the mixture result from: ',[filename1],'...']); + + if isfield(c,'c') %Matlab versio + c = c.c; + if ~isfield(c,'mixtureType') + disp('*** ERROR: Incorrect file format'); + return + end + %else + % disp('*** ERROR: Incorrect file format'); + % return; + end +if ~isfield(c,'changesInLogml') + disp('*** WARNING: Old mixture format detected.'); + if 0 % Disabled 12/06/07, Jukka Siren + switch c.mixtureType + case 'linkage_mix' + %waitALittle; + [filename, pathname] = uigetfile('*.mat', 'Load the corresponding preprocessed data.'); + if (sum(filename)==0) || (sum(pathname)==0) + return; + end + struct_array = load([pathname filename]); + disp('The corresponding preprocessed data is needed.'); + disp(['Load the preprocessed data from: ',[pathname filename], '...']); + if isfield(struct_array,'c') + c_raw = struct_array.c; + if ~all(size(c_raw.adjprior)==size(c.adjprior)) ||... + ~all(size(c_raw.data)==size(c.data)) + disp('Mixture result and preprocessed data do not match.'); + return; + end + + if ~isfield(c_raw,'linkage_model') + model_entry = input('Specify the linkage model(1-linear 2-codon):'); + switch model_entry, + case 1 + linkage_model = 'linear'; + case 2 + linkage_model = 'codon'; + end + else + linkage_model = c_raw.linkage_model; + end + else + disp('*** ERROR: Incorrect file format'); + return; + end + disp('Start calculating log likelihood. Please wait...'); + npops = c.npops; + partition = c.PARTITION; + cq_counts = c.CQ_COUNTS; + cq_sumcounts = c.CQ_SUMCOUNTS; + sp_counts = c.SP_COUNTS; + sp_sumcounts = c.SP_SUMCOUNTS; + + % Shrink the raw data + noalle_cq = size(c.adjprior_cq,1); + informative_cq = logical(sum(c.adjprior_cq)~=noalle_cq); + noalle_sp = size(c.adjprior_sp,1); + informative_sp = logical(sum(c.adjprior_sp)~=noalle_sp); + cq_counts = uint16(cq_counts(:,informative_cq,:)); + cq_sumcounts = uint16(cq_sumcounts(:,informative_cq)); + sp_counts = uint16(sp_counts(:,informative_sp,:)); + sp_sumcounts = uint16(sp_sumcounts(:,informative_sp)); + + data = c_raw.data; + ninds = size(data,1); + component_mat = c_raw.component_mat; + clear c_raw; + index = data(:,end); + [data_clique, data_separator, noalle_clique, noalle_separator] = ... + transform4(data, component_mat, linkage_model); + + if length(noalle_clique)~=length(find(informative_cq)) + disp('*** ERROR: The linkage model is not consistent with the data.'); + return + end + + data_clique = [data_clique index]; + data_separator = [data_separator index]; + + [counts_cq, nalleles_cq, prior_cq, adjprior_cq]... + = allfreqsnew2(data_clique, double(noalle_clique)); + clear data_clique; + [counts_sp, nalleles_sp, prior_sp, adjprior_sp]... + = allfreqsnew2(data_separator, double(noalle_separator)); + clear data_separator; + counts_cq = uint8(counts_cq); + counts_sp = uint8(counts_sp); + + pop_logml = computePopulationLogml(double(cq_counts), double(cq_sumcounts),... + double(sp_counts), double(sp_sumcounts),... + [1:npops], adjprior_cq, adjprior_sp); + + changesInLogml = zeros(npops,ninds); + for ind = 1:ninds + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + changesInLogml(:,ind) = computeChanges(cq_counts, cq_sumcounts,... + sp_counts, sp_sumcounts,... + partition, pop_logml,... + ind, adjprior_cq, ... + adjprior_sp, indCqCounts, indSpCounts); + end + case 'mix' % Independence model. + %waitALittle; + [filename, pathname] = uigetfile('*.mat', 'Load the corresponding preprocessed data.'); + if (sum(filename)==0) || (sum(pathname)==0) + return; + end + struct_array = load([pathname filename]); + disp('The corresponding preprocessed data is needed.'); + disp(['Load the preprocessed data from: ',[pathname filename], '...']); + if isfield(struct_array,'c') + c_raw = struct_array.c; + if ~all(size(c_raw.adjprior)==size(c.adjprior)) ||... + ~all(size(c_raw.data(:,[1:end-1]))==size(c.data)) + disp('Mixture result and preprocessed data do not match.'); + return; + end + else + disp('*** ERROR: Incorrect file format'); + return; + end + + + npops = c.npops; + pops = 1:npops; + rowsFromInd = c.rowsFromInd; + data = c.data; + ninds = size(data,1)/rowsFromInd; + adjprior = c.adjprior; + priorTerm = c_raw.priorTerm; + COUNTS = c.COUNTS; + SUMCOUNTS = c.SUMCOUNTS; + PARTITION = c.PARTITION; + POP_LOGML = computePopulationLogml2(pops, adjprior, priorTerm, ... + COUNTS, SUMCOUNTS); + changesInLogml = zeros(npops,ninds); + for ind = 1:ninds + [muutokset, diffInCounts] = laskeMuutokset(ind, rowsFromInd, data, ... + adjprior, priorTerm, COUNTS, SUMCOUNTS, PARTITION, POP_LOGML); + changesInLogml(:,ind) = muutokset; + end + otherwise + disp('This model is under construction.'); + return + end + + + c.changesInLogml = changesInLogml; + + fprintf(1,'Saving the result...') + %waitALittle; + save_preproc = questdlg('Do you wish to save the updated mixture result?',... + 'Save mixture results?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + %waitALittle; + [filename, pathname] = uiputfile('*.mat','Save mixture result as'); + if isempty(filename) && isempty(pathname) + return; + else + kokonimi = [pathname filename]; +% save(kokonimi,'c'); + save(kokonimi,'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + filename1 = filename; + end + + end; + fprintf(1,'Finished.\n'); + end + return +end + +if isequal(c.mixtureType,'spatialPop') || isequal(c.mixtureType,'popMix') + % Group level clustering, using the correct partition + view_density(c.changesInLogml, c.groupPartition, filename1); +else + view_density(c.changesInLogml, c.PARTITION, filename1); +end + + +% ------------------------------------------------------------------------- +function view_density(changesInLogml, partition, filename) +npops = size(changesInLogml,1); +groupnames = cell(1,npops+1); +for i=1:npops + groupnames{i} = sprintf('Cluster %d',i); +end + +disp('Genetic affinity matrix (row=source col=target): test statistic(p value) '); +ekarivi = 'Cluster '; +for i = 1:npops + ekarivi = [ekarivi ownNum2Str(i) blanks(8-floor(log10(i)))]; +end +disp(ekarivi); + + +for i=1:npops + indsi = logical(partition==i); % choose cluster i + nindsi = sum(indsi); + rivi = [blanks(4-floor(log10(i))) num2str(i) ':']; + + h = zeros(npops,1); % indicator 0 same 1 different + h1 = h; + p = zeros(npops,1); % p values + k = zeros(npops,1); % test statistic + k1 = k; + p2 = zeros(npops,1); + for j=1:npops + indsj = logical(partition==j); + [h(j), p(j), k(j)] = kstest2(zscore(changesInLogml(j,indsi)), zscore(changesInLogml(i,indsj))); + [h1(j), p1(j), k1(j)] = kstest2(zscore(changesInLogml(j,indsi)), zscore(changesInLogml(i,indsj)),0.05,'larger'); + [h2(j), p2(j), k2(j)] = kstest2(zscore(changesInLogml(j,indsi)), zscore(changesInLogml(i,indsj)),0.05,'smaller'); + + + if k(j)==k2(j) + k(j) = -k(j); + + end + % [h2(j),p2(j),k2(j)] = kstest2(changesInLogml(j,indsi), changesInLogml(i,indsj),0.05,'smaller'); + % nonzero(i,j) = k1(j)+k2(j); + % nonzero(i,j) = + % (k(j)*(h1(j)-0.5))*(mean(changesInLogml(j,indsi)-mean(changesInLogml(i,indsj)))); + nonzero(i,j) = k(j); + rivi = [rivi ' ' num2str(nonzero(i,j),'%5.4f') '(' num2str(p(j),'%5.4f') ')']; + end + disp(rivi); + + % sibling = setdiff(find(h==0),i); + + % parent = find(p==min(p)); +% fprintf(1,'Cluster %d ',i); +% fprintf(1,'%s(%f) ',mat2str(parent),min(p)); +% +% offspring = find(p2==min(p2)); +% fprintf(1,'%s(%f)\n',mat2str(offspring), min(p2)); +end + +% groupnames{npops+1} = sprintf('All clusters'); +% %waitALittle; +% [s1,v1] = listdlg('PromptString','Select one source cluster:',... +% 'SelectionMode','single',... +% 'Name','Select source cluster',... +% 'ListString',groupnames); +% if isempty(s1) || ~v1 +% disp('*** WARNING: Viewing loglikelihood cancelled.'); +% return +% elseif s1==npops+1 +% view_all_density(changesInLogml, partition, filename); +% else +% remain_pop = logical((1:npops)~=s1); +% %waitALittle; +% [s2,v2] = listdlg('PromptString', 'Select target clusters:',... +% 'SelectionMode','multiple',... +% 'Name','Select target cluster',... +% 'ListString',groupnames(remain_pop)); +% if isempty(s2) || ~v2 +% disp('*** WARNING: Viewing loglikelihood cancelled.'); +% return +% end +% inds = logical(partition==s1); +% ninds = sum(inds); % individuals in the source cluster +% remain = find(remain_pop); +% fprintf('Source cluster: %d\n', s1); +% fprintf('Number of strains: %d\n',ninds); +% fprintf('Target cluster(s): %s\n', num2str(remain(s2))); +% +% f = zeros(length(s2),100); +% xi = zeros(length(s2),100); +% for i = 1:length(s2) +% [f(i,:),xi(i,:)] = ksdensity_myown(changesInLogml(remain(s2(i)),inds)'); +% end +% +% map = giveColors(npops); +% h0 = figure('NumberTitle', 'off'); % density plot; +% set(h0,'Tag','density_plot'); +% set(h0,'Name',['Density of log likelihood changes - ' filename]); +% set(gca,'ColorOrder',map(remain(s2),:)); +% hold on; +% plot(xi',f'); +% set(gca,'xlim',[min(min(xi)),max(max(xi))]); +% legend(groupnames(remain(s2))); +% xlabel('Change of log likelihood'); +% ylabel('Estimated density'); +% title(sprintf('Cluster %d',s1)); +% +% % h1 = figure('NumberTitle', 'off'); % histogram plot; +% % set(h1,'Tag','histogram_plot'); +% % set(h1,'Name',['Histogram of log likelihood changes - ' filename]); +% % hist(changesInLogml(remain(s2),inds)'); +% % colormap(map(remain(s2),:)); +% % % histfit(changesInLogml(remain(s2),inds)'); % need statistical toolbox +% % legend(groupnames(remain(s2))); +% % xlabel('Change of log likelihood'); +% % ylabel('Frequency'); +% % title(sprintf('Cluster %d',s1)); +% % rose(changesInLogml(remain(s2),inds)); +% end + + +%--------------------------------------% +%%% functions for linkage model %%% +%--------------------------------------- + +%-------------------------------------------------------------------------- + +function changes = computeChanges(cq_counts, cq_sumcounts,... + sp_counts, sp_sumcounts,... + partition, pop_logml,... + ind, adjprior_cq, adjprior_sp, ... + indCqCounts, indSpCounts) +% Computes changes in log-marginal likelihood if individual ind is +% moved to another population +% +% Input: +% ind - the individual to be moved +% adjprior_cq & _sp - adjpriors for cliques and separators +% indCqCounts, indSpCounts - counts for individual ind +% +% Output: +% changes - table of size 1*npops. changes(i) = difference in logml if +% ind is move to population i. + +npops = size(cq_counts,3); +changes = zeros(npops,1); + +i1 = partition(ind); +i1_logml = pop_logml(i1); +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +cq_counts(:,:,i1) = cq_counts(:,:,i1)-indCqCounts; +cq_sumcounts(i1,:) = cq_sumcounts(i1,:)-sumCq; +sp_counts(:,:,i1) = sp_counts(:,:,i1)-indSpCounts; +sp_sumcounts(i1,:) = sp_sumcounts(i1,:)-sumSp; + +new_i1_logml = computePopulationLogml(double(cq_counts), double(cq_sumcounts),... + double(sp_counts), double(sp_sumcounts),... + i1, adjprior_cq, adjprior_sp); + +cq_counts(:,:,i1) = cq_counts(:,:,i1)+indCqCounts; +cq_sumcounts(i1,:) = cq_sumcounts(i1,:)+sumCq; +sp_counts(:,:,i1) = sp_counts(:,:,i1)+indSpCounts; +sp_sumcounts(i1,:) = sp_sumcounts(i1,:)+sumSp; + + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = pop_logml(i2); + +cq_counts(:,:,i2) = cq_counts(:,:,i2)+repmat(indCqCounts, [1 1 npops-1]); +cq_sumcounts(i2,:) = cq_sumcounts(i2,:)+repmat(sumCq,[npops-1 1]); +sp_counts(:,:,i2) = sp_counts(:,:,i2)+repmat(indSpCounts, [1 1 npops-1]); +sp_sumcounts(i2,:) = sp_sumcounts(i2,:) + repmat(sumSp,[npops-1 1]); + +new_i2_logml = computePopulationLogml(double(cq_counts), double(cq_sumcounts),... + double(sp_counts), double(sp_sumcounts),... + i2, adjprior_cq, adjprior_sp); + +changes(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + + +%-------------------------------------------------------------------------- + +function popLogml = computePopulationLogml(cq_counts, cq_sumcounts,... + sp_counts, sp_sumcounts,... + pops, adjprior_cq, adjprior_sp) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + + +nall_cq = size(cq_counts,1); +nall_sp = size(sp_counts, 1); +ncliq = size(cq_counts,2); +nsep = size(sp_counts, 2); + +z = length(pops); + +popLogml_cq = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior_cq,[1 1 length(pops)]) + cq_counts(:,:,pops)) ... + ,[nall_cq ncliq z]),1),2)) - sum(gammaln(1+cq_sumcounts(pops,:)),2) - ... + sum(sum(gammaln(adjprior_cq))); + +popLogml_sp = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior_sp,[1 1 length(pops)]) + sp_counts(:,:,pops)) ... + ,[nall_sp nsep z]),1),2)) - sum(gammaln(1+sp_sumcounts(pops,:)),2) - ... + sum(sum(gammaln(adjprior_sp))); + +popLogml = popLogml_cq - popLogml_sp; +clear cq_counts cq_sumcounts sp_counts sp_sumcounts; + +%-------------------------------------------------------------------------- + + +%--------------------------------------% +%%% functions for independence model %%% +%--------------------------------------- +%-------------------------------------------------------------------------- +function [muutokset, diffInCounts] = ... + laskeMuutokset(ind, rowsFromInd, data, adjprior, priorTerm, ... + COUNTS, SUMCOUNTS, PARTITION, POP_LOGML) +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli yksil?ind siirretään koriin i. +% diffInCounts on poistettava COUNTS:in siivusta i1 ja lisättäv? +% COUNTS:in siivuun i2, mikäli muutos toteutetaan. + +npops = size(COUNTS,3); +muutokset = zeros(npops,1); + +i1 = PARTITION(ind); +i1_logml = POP_LOGML(i1); + +rows = (ind-1)*rowsFromInd+1 : ind*rowsFromInd; +diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +new_i1_logml = computePopulationLogml2(i1, adjprior, priorTerm,COUNTS, SUMCOUNTS); +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = POP_LOGML(i2); + +COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[npops-1 1]); +new_i2_logml = computePopulationLogml2(i2, adjprior, priorTerm,COUNTS, SUMCOUNTS); +COUNTS(:,:,i2) = COUNTS(:,:,i2)-repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)-repmat(diffInSumCounts,[npops-1 1]); + +muutokset(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + +%-------------------------------------------------------------------------- +function popLogml = computePopulationLogml2(pops, adjprior, priorTerm, ... + COUNTS, SUMCOUNTS) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + +x = size(COUNTS,1); +y = size(COUNTS,2); +z = length(pops); + +popLogml = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior,[1 1 length(pops)]) + COUNTS(:,:,pops)) ... + ,[x y z]),1),2)) - sum(gammaln(1+SUMCOUNTS(pops,:)),2) - priorTerm; + + +%-------------------------------------------------------------------------- +function diffInCounts = computeDiffInCounts(rows, max_noalle, nloci, data) +% Muodostaa max_noalle*nloci taulukon, jossa on niiden alleelien +% lukumäärät (vastaavasti kuin COUNTS:issa), jotka ovat data:n +% riveill?rows. + +diffInCounts = zeros(max_noalle, nloci); +for i=rows + row = data(i,:); + notEmpty = find(row>=0); + + if length(notEmpty)>0 + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) = ... + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) + 1; + end +end + + diff --git a/matlab/graph/view_geneflow.m b/matlab/graph/view_geneflow.m new file mode 100644 index 0000000..21e8631 --- /dev/null +++ b/matlab/graph/view_geneflow.m @@ -0,0 +1,183 @@ +function view_geneflow + +%waitALittle; +% [filename, pathname] = uigetfile('*.mat', 'Load admixture results.'); +% if (sum(filename)==0) || (sum(pathname)==0) +% return; +% end + +h0 = findobj('Tag','load_menu'); +c = get(h0,'UserData'); +h0 = findobj('Tag','filename1_text'); +filename = get(h0,'String'); + +% struct_array = load([pathname filename]); +disp('---------------------------------------------------'); +disp('Viewing the gene flow.'); +disp(['Load the admixture result from: ',[filename],'...']); +% if isfield(struct_array,'c') %Matlab versio +% c = struct_array.c; +% if ~isfield(c,'proportionsIt') +% disp('*** ERROR: Incorrect file format'); +% return +% end +% elseif isfield(struct_array,'proportionsIt') %Mideva versio +% c = struct_array; +% if ~isfield(c,'proportionsIt') +% disp('*** ERROR: Incorrect file format'); +% return +% end +% else +% disp('*** ERROR: Incorrect file format'); +% return; +% end + +proportionsIt = c.proportionsIt; +popnames = c.popnames; partition = c.PARTITION; + +% if isempty(popnames) || size(popnames,1)==size(partition,1) % bacteria/spatial + if isempty(popnames) + ninds = size(partition,1); + popnames=cell(ninds,2); + for ind=1:ninds + popnames{ind,1}=cellstr(num2str(ind)); + end + popnames(:,2)=num2cell((1:ninds)'); + end + + npops = c.npops; + if isfield(c,'admixnpops') + admixnpops = c.admixnpops; + else % if the admixture result is based on pre-defined partitions. + admixnpops = npops; + end + + if ~isfield(c,'pvalue') % compatiable with old data + disp('*** WARNING: Old admixture format detected.'); + disp('*** WARNING: pvalue is not found in the admixture result.'); + disp('*** WARNING: all the admixture will be significant.'); + pvalue = ones(size(partition,1),1); + else + pvalue = c.pvalue; + end + + removed_inds = logical(~any(proportionsIt,2)); + outliers = unique(partition(removed_inds)); + total_clusters = [1:npops]; + if ~isempty(outliers) + fprintf('Outlier clusters: '); + for i = 1:length(outliers) + fprintf(['%s' blanks(2)], num2str(outliers(i))); + end + fprintf('\n'); + inliers = setdiff(total_clusters, outliers); + else + inliers = total_clusters; + end + + if length(inliers)~=admixnpops + disp('*** ERROR: in view linkage admixture.'); + return + end + + groupnames = cell(1,admixnpops); + for i=1:admixnpops + groupnames{i} = sprintf('Cluster %d',inliers(i)); + end + + %waitALittle; + answer = inputdlg( ['Input the p value for the strain admixture. Strains with '... + 'nonsignificant admixture will be displayed as a single color bar'],... + 'Input the upper bound p value',1,{'0.05'}); + if isempty(answer) % cancel has been pressed + return + else + maxp = str2num(answer{1}); + fprintf('P value: %4.2f\n', maxp); + + % nonsignificant strains are forced to be single color bar. + nonsignificant_ix = find( (pvalue>maxp & pvalue <=1) ); + if ~isempty(nonsignificant_ix) + for i = nonsignificant_ix' + incluster = logical(proportionsIt(i,:)==max(proportionsIt(i,:))); + proportionsIt(i,:) = zeros(1, admixnpops); + proportionsIt(i,incluster) = 1; + end + end + + ix = find(ismember(partition(:),inliers)); + proportionsIt = proportionsIt(ix,:); + partition = partition(ix); + end + + % Calculate the gene flow weight. + % W[j,i]: the gene flow from cluster j to i. + W = ones(admixnpops,admixnpops); + for i=1:admixnpops + inds = logical(partition==inliers(i)); + ninds = sum(inds); + for j = 1:admixnpops + W(j,i) = sum(proportionsIt(inds,j))/ninds; + end + end + disp('Gene flow matrix (row=source col=target): '); + ekarivi = 'Cluster '; + for i = 1:admixnpops + ekarivi = [ekarivi ownNum2Str(inliers(i)) blanks(8-floor(log10(i)))]; + end + disp(ekarivi); + for i = 1:admixnpops + rivi = [blanks(4-floor(log10(i))) num2str(inliers(i)) ':']; + for j = 1:admixnpops + rivi = [rivi ' ' num2str(omaRound(W(i,j)),'%5.4f')]; + end + disp(rivi); + end + + disp('Generating the gene flow graph. Please wait...'); + % Show graph +% %waitALittle; +% talle = questdlg(['Do you want to view the gene flow graph?' ... +% ], 'Visualization?', 'Yes', 'No', 'Yes'); +% if isequal(talle,'No') +% return +% else + h0 = findobj('Tag','geneflow_menu'); + graphviz_path = get(h0,'Userdata'); + if isempty(graphviz_path) + %waitALittle; + graphviz_path = uigetdir('','Specify the location of dot.exe in the GraphViz package: '); + if graphviz_path == 0 + return; + else + h0 = findobj('Tag','geneflow_menu'); + set(h0,'Userdata',graphviz_path); + end + end + % store the current directory. + d = cd; + try + graphvis2(W,filename,inliers,graphviz_path,npops); + disp('Finished.'); + catch + disp('*** ERROR: dot.exe was not found.'); + h_errdlg = errordlg('dot.exe was not found in the specified path.'); + set(h_errdlg,'WindowStyle','Modal') + set(h0,'Userdata',[]); + end + cd(d); +% end +% else +% disp('This module is under developpment.'); +% end + +% ------------------------------------------------------------------------- +function num2 = omaRound(num) +% Pyöristää luvun num 1 desimaalin tarkkuuteen +num = num*10000; +num = round(num); +num2 = num/10000; + + + + diff --git a/matlab/graph/view_loglikelihood.m b/matlab/graph/view_loglikelihood.m new file mode 100644 index 0000000..9c0c1dd --- /dev/null +++ b/matlab/graph/view_loglikelihood.m @@ -0,0 +1,476 @@ +function view_loglikelihood +%waitALittle; +% [filename1, pathname1] = uigetfile('*.mat', 'Load mixture results.'); +% if (sum(filename1)==0) || (sum(pathname1)==0) +% return; +% end +% struct_array = load([pathname1 filename1]); + +h0 = findobj('Tag','load_menu'); +c = get(h0,'UserData'); +h0 = findobj('Tag','filename1_text'); +filename1 = get(h0,'String'); +disp('---------------------------------------------------'); +disp('Viewing the change of log likelihood.'); +disp(['Load the mixture result from: ',[filename1],'...']); + + if isfield(c,'c') %Matlab versio + c = c.c; + if ~isfield(c,'mixtureType') + disp('*** ERROR: Incorrect file format'); + return + end + %else + % disp('*** ERROR: Incorrect file format'); + % return; + end +if ~isfield(c,'changesInLogml') + disp('*** WARNING: Old mixture format detected.'); + if 0 % Disabled 12/06/07, Jukka Siren + switch c.mixtureType + case 'linkage_mix' + %waitALittle; + [filename, pathname] = uigetfile('*.mat', 'Load the corresponding preprocessed data.'); + if (sum(filename)==0) || (sum(pathname)==0) + return; + end + struct_array = load([pathname filename]); + disp('The corresponding preprocessed data is needed.'); + disp(['Load the preprocessed data from: ',[pathname filename], '...']); + if isfield(struct_array,'c') + c_raw = struct_array.c; + if ~all(size(c_raw.adjprior)==size(c.adjprior)) ||... + ~all(size(c_raw.data)==size(c.data)) + disp('Mixture result and preprocessed data do not match.'); + return; + end + + if ~isfield(c_raw,'linkage_model') + model_entry = input('Specify the linkage model(1-linear 2-codon):'); + switch model_entry, + case 1 + linkage_model = 'linear'; + case 2 + linkage_model = 'codon'; + end + else + linkage_model = c_raw.linkage_model; + end + else + disp('*** ERROR: Incorrect file format'); + return; + end + disp('Start calculating log likelihood. Please wait...'); + npops = c.npops; + partition = c.PARTITION; + cq_counts = c.CQ_COUNTS; + cq_sumcounts = c.CQ_SUMCOUNTS; + sp_counts = c.SP_COUNTS; + sp_sumcounts = c.SP_SUMCOUNTS; + + % Shrink the raw data + noalle_cq = size(c.adjprior_cq,1); + informative_cq = logical(sum(c.adjprior_cq)~=noalle_cq); + noalle_sp = size(c.adjprior_sp,1); + informative_sp = logical(sum(c.adjprior_sp)~=noalle_sp); + cq_counts = uint16(cq_counts(:,informative_cq,:)); + cq_sumcounts = uint16(cq_sumcounts(:,informative_cq)); + sp_counts = uint16(sp_counts(:,informative_sp,:)); + sp_sumcounts = uint16(sp_sumcounts(:,informative_sp)); + + data = c_raw.data; + ninds = size(data,1); + component_mat = c_raw.component_mat; + clear c_raw; + index = data(:,end); + [data_clique, data_separator, noalle_clique, noalle_separator] = ... + transform4(data, component_mat, linkage_model); + + if length(noalle_clique)~=length(find(informative_cq)) + disp('*** ERROR: The linkage model is not consistent with the data.'); + return + end + + data_clique = [data_clique index]; + data_separator = [data_separator index]; + + [counts_cq, nalleles_cq, prior_cq, adjprior_cq]... + = allfreqsnew2(data_clique, double(noalle_clique)); + clear data_clique; + [counts_sp, nalleles_sp, prior_sp, adjprior_sp]... + = allfreqsnew2(data_separator, double(noalle_separator)); + clear data_separator; + counts_cq = uint8(counts_cq); + counts_sp = uint8(counts_sp); + + pop_logml = computePopulationLogml(double(cq_counts), double(cq_sumcounts),... + double(sp_counts), double(sp_sumcounts),... + [1:npops], adjprior_cq, adjprior_sp); + + changesInLogml = zeros(npops,ninds); + for ind = 1:ninds + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + changesInLogml(:,ind) = computeChanges(cq_counts, cq_sumcounts,... + sp_counts, sp_sumcounts,... + partition, pop_logml,... + ind, adjprior_cq, ... + adjprior_sp, indCqCounts, indSpCounts); + end + case 'mix' % Independence model. + %waitALittle; + [filename, pathname] = uigetfile('*.mat', 'Load the corresponding preprocessed data.'); + if (sum(filename)==0) || (sum(pathname)==0) + return; + end + struct_array = load([pathname filename]); + disp('The corresponding preprocessed data is needed.'); + disp(['Load the preprocessed data from: ',[pathname filename], '...']); + if isfield(struct_array,'c') + c_raw = struct_array.c; + if ~all(size(c_raw.adjprior)==size(c.adjprior)) ||... + ~all(size(c_raw.data(:,[1:end-1]))==size(c.data)) + disp('Mixture result and preprocessed data do not match.'); + return; + end + else + disp('*** ERROR: Incorrect file format'); + return; + end + + + npops = c.npops; + pops = 1:npops; + rowsFromInd = c.rowsFromInd; + data = c.data; + ninds = size(data,1)/rowsFromInd; + adjprior = c.adjprior; + priorTerm = c_raw.priorTerm; + COUNTS = c.COUNTS; + SUMCOUNTS = c.SUMCOUNTS; + PARTITION = c.PARTITION; + POP_LOGML = computePopulationLogml2(pops, adjprior, priorTerm, ... + COUNTS, SUMCOUNTS); + changesInLogml = zeros(npops,ninds); + for ind = 1:ninds + [muutokset, diffInCounts] = laskeMuutokset(ind, rowsFromInd, data, ... + adjprior, priorTerm, COUNTS, SUMCOUNTS, PARTITION, POP_LOGML); + changesInLogml(:,ind) = muutokset; + end + otherwise + disp('This model is under construction.'); + return + end + + + c.changesInLogml = changesInLogml; + + fprintf(1,'Saving the result...') + %waitALittle; + save_preproc = questdlg('Do you wish to save the updated mixture result?',... + 'Save mixture results?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + %waitALittle; + [filename, pathname] = uiputfile('*.mat','Save mixture result as'); + if isempty(filename) && isempty(pathname) + return; + else + kokonimi = [pathname filename]; +% save(kokonimi,'c'); + save(kokonimi,'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + filename1 = filename; + end + + end; + fprintf(1,'Finished.\n'); + end + return +end + +if isequal(c.mixtureType,'spatialPop') || isequal(c.mixtureType,'popMix') + % Group level clustering, using the correct partition + view_density(c.changesInLogml, c.groupPartition, filename1); +else + view_density(c.changesInLogml, c.PARTITION, filename1); +end + +% ------------------------------------------------------------------------- +function view_all_density(changesInLogml, partition, filename) +npops = size(changesInLogml,1); +groupnames = cell(1,npops); +for i=1:npops + groupnames{i} = sprintf('Cluster %d',i); +end +nrows = npops*(npops-1); +f = zeros(nrows,100); +xi = zeros(nrows,100); +map = giveColors(npops); +h = zeros(npops,1); + +k = 1; +for i = 1:npops + inds = logical(partition==i); + % ninds = sum(inds); % individuals in the source cluster + target_pop = find(logical((1:npops)~=i)); + for j = target_pop + [f(k,:),xi(k,:)] = ksdensity_myown(changesInLogml(j,inds)'); + k = k+1; + end + h(i) = figure('NumberTitle', 'off', 'menubar','none','toolbar','figure'); % density plot; + set(h(i),'Tag','density_plot'); + set(h(i),'Name',sprintf('%d Density of log likelihood changes - %s',i, filename)); + set(gca,'ColorOrder',map(target_pop,:)); + hold on; + plot(xi([(i-1)*(npops-1)+1:i*(npops-1)],:)',f([(i-1)*(npops-1)+1:i*(npops-1)],:)'); + + legend(groupnames(target_pop)); + xlabel('Change of log likelihood'); + ylabel('Estimated density'); + title(sprintf('Cluster %d',i)); +end + +x_min = min(min(xi)); +x_max = max(max(xi)); +for i = 1:npops + set(get(h(i),'CurrentAxes'),'xlim',[x_min, x_max]); +end + +% ------------------------------------------------------------------------- +function view_density(changesInLogml, partition, filename) +npops = size(changesInLogml,1); +groupnames = cell(1,npops+1); +for i=1:npops + groupnames{i} = sprintf('Population %d',i); +end +groupnames{npops+1} = sprintf('All clusters'); +%waitALittle; +[s1,v1] = listdlg('PromptString','Select one source cluster:',... + 'SelectionMode','single',... + 'Name','Select source cluster',... + 'ListString',groupnames); +if isempty(s1) || ~v1 + disp('*** WARNING: Viewing loglikelihood cancelled.'); + return +elseif s1==npops+1 + view_all_density(changesInLogml, partition, filename); +else + remain_pop = logical((1:npops)~=s1); + %waitALittle; + [s2,v2] = listdlg('PromptString', 'Select target clusters:',... + 'SelectionMode','multiple',... + 'Name','Select target cluster',... + 'ListString',groupnames(remain_pop)); + if isempty(s2) || ~v2 + disp('*** WARNING: Viewing loglikelihood cancelled.'); + return + end + inds = logical(partition==s1); + ninds = sum(inds); % individuals in the source cluster + remain = find(remain_pop); + fprintf('Source cluster: %d\n', s1); + fprintf('Number of strains: %d\n',ninds); + fprintf('Target cluster(s): %s\n', num2str(remain(s2))); + + f = zeros(length(s2),100); + xi = zeros(length(s2),100); + for i = 1:length(s2) + [f(i,:),xi(i,:)] = ksdensity_myown(-changesInLogml(remain(s2(i)),inds)'); + end + + map = giveColors(npops); + h0 = figure('NumberTitle', 'off'); % density plot; + set(h0,'Tag','density_plot'); + set(h0,'Name',['Density of log likelihood changes - ' filename]); + set(gca,'ColorOrder',map(remain(s2),:)); + hold on; + plot(xi',f'); + set(gca,'xlim',[min(min(xi)),max(max(xi))]); + legend(groupnames(remain(s2))); + xlabel('Change of log likelihood'); + ylabel('Estimated density'); + title(sprintf('Population %d',s1)); + + % h1 = figure('NumberTitle', 'off'); % histogram plot; + % set(h1,'Tag','histogram_plot'); + % set(h1,'Name',['Histogram of log likelihood changes - ' filename]); + % hist(changesInLogml(remain(s2),inds)'); + % colormap(map(remain(s2),:)); + % % histfit(changesInLogml(remain(s2),inds)'); % need statistical toolbox + % legend(groupnames(remain(s2))); + % xlabel('Change of log likelihood'); + % ylabel('Frequency'); + % title(sprintf('Cluster %d',s1)); + % rose(changesInLogml(remain(s2),inds)); +end + + +%--------------------------------------% +%%% functions for linkage model %%% +%--------------------------------------- + +%-------------------------------------------------------------------------- + +function changes = computeChanges(cq_counts, cq_sumcounts,... + sp_counts, sp_sumcounts,... + partition, pop_logml,... + ind, adjprior_cq, adjprior_sp, ... + indCqCounts, indSpCounts) +% Computes changes in log-marginal likelihood if individual ind is +% moved to another population +% +% Input: +% ind - the individual to be moved +% adjprior_cq & _sp - adjpriors for cliques and separators +% indCqCounts, indSpCounts - counts for individual ind +% +% Output: +% changes - table of size 1*npops. changes(i) = difference in logml if +% ind is move to population i. + +npops = size(cq_counts,3); +changes = zeros(npops,1); + +i1 = partition(ind); +i1_logml = pop_logml(i1); +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +cq_counts(:,:,i1) = cq_counts(:,:,i1)-indCqCounts; +cq_sumcounts(i1,:) = cq_sumcounts(i1,:)-sumCq; +sp_counts(:,:,i1) = sp_counts(:,:,i1)-indSpCounts; +sp_sumcounts(i1,:) = sp_sumcounts(i1,:)-sumSp; + +new_i1_logml = computePopulationLogml(double(cq_counts), double(cq_sumcounts),... + double(sp_counts), double(sp_sumcounts),... + i1, adjprior_cq, adjprior_sp); + +cq_counts(:,:,i1) = cq_counts(:,:,i1)+indCqCounts; +cq_sumcounts(i1,:) = cq_sumcounts(i1,:)+sumCq; +sp_counts(:,:,i1) = sp_counts(:,:,i1)+indSpCounts; +sp_sumcounts(i1,:) = sp_sumcounts(i1,:)+sumSp; + + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = pop_logml(i2); + +cq_counts(:,:,i2) = cq_counts(:,:,i2)+repmat(indCqCounts, [1 1 npops-1]); +cq_sumcounts(i2,:) = cq_sumcounts(i2,:)+repmat(sumCq,[npops-1 1]); +sp_counts(:,:,i2) = sp_counts(:,:,i2)+repmat(indSpCounts, [1 1 npops-1]); +sp_sumcounts(i2,:) = sp_sumcounts(i2,:) + repmat(sumSp,[npops-1 1]); + +new_i2_logml = computePopulationLogml(double(cq_counts), double(cq_sumcounts),... + double(sp_counts), double(sp_sumcounts),... + i2, adjprior_cq, adjprior_sp); + +changes(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + + +%-------------------------------------------------------------------------- + +function popLogml = computePopulationLogml(cq_counts, cq_sumcounts,... + sp_counts, sp_sumcounts,... + pops, adjprior_cq, adjprior_sp) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + + +nall_cq = size(cq_counts,1); +nall_sp = size(sp_counts, 1); +ncliq = size(cq_counts,2); +nsep = size(sp_counts, 2); + +z = length(pops); + +popLogml_cq = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior_cq,[1 1 length(pops)]) + cq_counts(:,:,pops)) ... + ,[nall_cq ncliq z]),1),2)) - sum(gammaln(1+cq_sumcounts(pops,:)),2) - ... + sum(sum(gammaln(adjprior_cq))); + +popLogml_sp = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior_sp,[1 1 length(pops)]) + sp_counts(:,:,pops)) ... + ,[nall_sp nsep z]),1),2)) - sum(gammaln(1+sp_sumcounts(pops,:)),2) - ... + sum(sum(gammaln(adjprior_sp))); + +popLogml = popLogml_cq - popLogml_sp; +clear cq_counts cq_sumcounts sp_counts sp_sumcounts; + +%-------------------------------------------------------------------------- + + +%--------------------------------------% +%%% functions for independence model %%% +%--------------------------------------- +%-------------------------------------------------------------------------- +function [muutokset, diffInCounts] = ... + laskeMuutokset(ind, rowsFromInd, data, adjprior, priorTerm, ... + COUNTS, SUMCOUNTS, PARTITION, POP_LOGML) +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli yksil?ind siirretään koriin i. +% diffInCounts on poistettava COUNTS:in siivusta i1 ja lisättäv? +% COUNTS:in siivuun i2, mikäli muutos toteutetaan. + +npops = size(COUNTS,3); +muutokset = zeros(npops,1); + +i1 = PARTITION(ind); +i1_logml = POP_LOGML(i1); + +rows = (ind-1)*rowsFromInd+1 : ind*rowsFromInd; +diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +new_i1_logml = computePopulationLogml2(i1, adjprior, priorTerm,COUNTS, SUMCOUNTS); +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = POP_LOGML(i2); + +COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[npops-1 1]); +new_i2_logml = computePopulationLogml2(i2, adjprior, priorTerm,COUNTS, SUMCOUNTS); +COUNTS(:,:,i2) = COUNTS(:,:,i2)-repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)-repmat(diffInSumCounts,[npops-1 1]); + +muutokset(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + +%-------------------------------------------------------------------------- +function popLogml = computePopulationLogml2(pops, adjprior, priorTerm, ... + COUNTS, SUMCOUNTS) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + +x = size(COUNTS,1); +y = size(COUNTS,2); +z = length(pops); + +popLogml = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior,[1 1 length(pops)]) + COUNTS(:,:,pops)) ... + ,[x y z]),1),2)) - sum(gammaln(1+SUMCOUNTS(pops,:)),2) - priorTerm; + + +%-------------------------------------------------------------------------- +function diffInCounts = computeDiffInCounts(rows, max_noalle, nloci, data) +% Muodostaa max_noalle*nloci taulukon, jossa on niiden alleelien +% lukumäärät (vastaavasti kuin COUNTS:issa), jotka ovat data:n +% riveill?rows. + +diffInCounts = zeros(max_noalle, nloci); +for i=rows + row = data(i,:); + notEmpty = find(row>=0); + + if length(notEmpty)>0 + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) = ... + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) + 1; + end +end + diff --git a/matlab/graph/vorPlot.m b/matlab/graph/vorPlot.m new file mode 100644 index 0000000..58c88f5 --- /dev/null +++ b/matlab/graph/vorPlot.m @@ -0,0 +1,88 @@ +function vorPlot(V,C,partition, pointers, coordinates, tekstit) + +if nargin < 6 + tekstit = pointers; +end + +notEmptyPops = length(unique(partition)); +if notEmptyPops>30 + disp(['Number of populations: ' num2str(notEmptyPops)]); + disp(' '); + disp('Figure can be drawn only if the number of populations'); + disp('is less or equal to 30.'); + disp(' '); + return; +end + +npops = length(unique(partition)); + +if npops > 30 + return +end + +colors=giveColors(npops); + +h1 = figure('NumberTitle', 'off', 'Name', 'Colored Voronoi tessellation'); +hold on + + +[I, J] = find(coordinates>0 | coordinates<0); +I=unique(I); +xmin = min(coordinates(I,1)); +xmax = max(coordinates(I,1)); +xdiff = (xmax-xmin); +xmean = xmin + xdiff/2; + +ymin = min(coordinates(I,2)); +ymax = max(coordinates(I,2)); +ydiff = (ymax-ymin); +ymean = ymin + ydiff/2; + +pituus = max(ydiff,xdiff)*1.1/2; + +axis([xmean-pituus xmean+pituus ymean-pituus ymean+pituus]); + + +for i=1:length(C) + X=V(C{i},:); + %k=convhull(X(:,1),X(:,2),{'QJ', 'Pp'}); + k=convhull(X(:,1),X(:,2)); + taulu = pointers{i}; + if length(taulu)>0 + color=colors(partition(taulu(1)),:); + patch(X(k,1),X(k,2),color); + plot(coordinates(taulu(1),1),coordinates(taulu(1),2),'Color',[1 1 1], 'MarkerSize', 50); + %text(coordinates(taulu(1),1),coordinates(taulu(1),2),num2str(taulu), 'FontSize', 8); + end +end + +if ~isequal(tekstit, -1) + for i=1:length(pointers) + taulu = pointers{i}; + teksti = tekstit{i}; + if isnumeric(teksti) + teksti = num2str(teksti); + end + if length(taulu)>0 + text(coordinates(taulu(1),1),coordinates(taulu(1),2),teksti, ... + 'Interpreter', 'none', 'FontSize', 8); + end + end +end +%[I,J] = find(coordinates(:,1) > 0); + +%plot(coordinates(I,1), coordinates(I,2), 'k.'); + +%hold off +%{ +%h0 = image_figure; +%hold on; +for i=1:length(partition) + if coordinates(i,1)>=0 + %plot(coordinates(i,1),coordinates(i,2),'k.'); + plot(coordinates(i,1),coordinates(i,2),'Color',[1 1 1], 'MarkerSize', 40); + text(coordinates(i,1),coordinates(i,2),num2str(i), 'FontSize', 8); + end +end +%} +hold off; \ No newline at end of file diff --git a/matlab/graph/waitALittle.m b/matlab/graph/waitALittle.m new file mode 100644 index 0000000..2d2571b --- /dev/null +++ b/matlab/graph/waitALittle.m @@ -0,0 +1,3 @@ +function waitALittle +A = rand(500); +gammaln(A); \ No newline at end of file diff --git a/matlab/graph/winontop.m b/matlab/graph/winontop.m new file mode 100644 index 0000000..5c19f37 --- /dev/null +++ b/matlab/graph/winontop.m @@ -0,0 +1,8 @@ +%WINONTOP +% WINONTOP(FH) set "topmost" property of the figure specified by handle +% FH to state "on". +% +% WINONTOP(FH,1) same as WINONTOP(FH) +% +% WINONTOP(FH,0) set "topmost" property of the figure specified by handle +% FH to state "off". \ No newline at end of file diff --git a/matlab/independent/greedyMix.m b/matlab/independent/greedyMix.m new file mode 100644 index 0000000..2bc5cdf --- /dev/null +++ b/matlab/independent/greedyMix.m @@ -0,0 +1,1788 @@ +function greedyMix(tietue) + +% check whether fixed k mode is selected +h0 = findobj('Tag','fixk_menu'); +fixedK = get(h0, 'userdata'); + +if fixedK + if ~(fixKWarning == 1) % call function fixKWarning + return + end +end + +% check whether partition compare mode is selected +h1 = findobj('Tag','partitioncompare_menu'); +partitionCompare = get(h1, 'userdata'); + + +if isequal(tietue,-1) + + input_type = questdlg('Specify the format of your data: ',... + 'Specify Data Format', ... + 'BAPS-format', 'GenePop-format', 'Preprocessed data', ... + 'BAPS-format'); + + switch input_type + + case 'BAPS-format' + waitALittle; + [filename, pathname] = uigetfile('*.txt', 'Load data in BAPS-format'); + if filename==0 + return; + end + + if ~isempty(partitionCompare) + fprintf(1,'Data: %s\n',[pathname filename]); + end + + data = load([pathname filename]); + ninds = testaaOnkoKunnollinenBapsData(data); %TESTAUS + if (ninds==0) + disp('Incorrect Data-file.'); + return; + end + h0 = findobj('Tag','filename1_text'); + set(h0,'String',filename); clear h0; + + input_pops = questdlg(['When using data which are in BAPS-format, '... + 'you can specify the sampling populations of the individuals by '... + 'giving two additional files: one containing the names of the '... + 'populations, the other containing the indices of the first '... + 'individuals of the populations. Do you wish to specify the '... + 'sampling populations?'], ... + 'Specify sampling populations?',... + 'Yes', 'No', 'No'); + if isequal(input_pops,'Yes') + waitALittle; + [namefile, namepath] = uigetfile('*.txt', 'Load population names'); + if namefile==0 + kysyToinen = 0; + else + kysyToinen = 1; + end + if kysyToinen==1 + waitALittle; + [indicesfile, indicespath] = uigetfile('*.txt', 'Load population indices'); + if indicesfile==0 + popnames = []; + else + popnames = initPopNames([namepath namefile],[indicespath indicesfile]); + end + else + popnames = []; + end + else + popnames = []; + end + + [data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(data); + [Z,dist] = newGetDistances(data,rowsFromInd); + + save_preproc = questdlg('Do you wish to save pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + waitALittle; + [filename, pathname] = uiputfile('*.mat','Save pre-processed data as'); + kokonimi = [pathname filename]; + c.data = data; c.rowsFromInd = rowsFromInd; c.alleleCodes = alleleCodes; + c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; + c.dist = dist; c.popnames = popnames; c.Z = Z; +% save(kokonimi,'c'); + save(kokonimi,'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + clear c; + end; + + case 'GenePop-format' + waitALittle; + [filename, pathname] = uigetfile('*.txt', 'Load data in GenePop-format'); + if filename==0 + return; + end + if ~isempty(partitionCompare) + fprintf(1,'Data: %s\n',[pathname filename]); + end + kunnossa = testaaGenePopData([pathname filename]); + if kunnossa==0 + return + end + [data,popnames]=lueGenePopData([pathname filename]); + + h0 = findobj('Tag','filename1_text'); + set(h0,'String',filename); clear h0; + + [data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(data); + [Z,dist] = newGetDistances(data,rowsFromInd); + save_preproc = questdlg('Do you wish to save pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + waitALittle; + [filename, pathname] = uiputfile('*.mat','Save pre-processed data as'); + kokonimi = [pathname filename]; + c.data = data; c.rowsFromInd = rowsFromInd; c.alleleCodes = alleleCodes; + c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; + c.dist = dist; c.popnames = popnames; c.Z = Z; +% save(kokonimi,'c'); + save(kokonimi,'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + clear c; + end; + + case 'Preprocessed data' + waitALittle; + [filename, pathname] = uigetfile('*.mat', 'Load pre-processed data'); + if filename==0 + return; + end + h0 = findobj('Tag','filename1_text'); + set(h0,'String',filename); clear h0; + if ~isempty(partitionCompare) + fprintf(1,'Data: %s\n',[pathname filename]); + end + + struct_array = load([pathname filename]); + if isfield(struct_array,'c') %Matlab versio + c = struct_array.c; + if ~isfield(c,'dist') + disp('Incorrect file format'); + return + end + elseif isfield(struct_array,'dist') %Mideva versio + c = struct_array; + else + disp('Incorrect file format'); + return; + end + data = double(c.data); rowsFromInd = c.rowsFromInd; alleleCodes = c.alleleCodes; + noalle = c.noalle; adjprior = c.adjprior; priorTerm = c.priorTerm; + dist = c.dist; popnames = c.popnames; Z = c.Z; + clear c; + otherwise + return + end + +else + data = double(tietue.data); rowsFromInd = tietue.rowsFromInd; alleleCodes = tietue.alleleCodes; + noalle = tietue.noalle; adjprior = tietue.adjprior; priorTerm = tietue.priorTerm; + dist = tietue.dist; popnames = tietue.popnames; Z = tietue.Z; + clear tietue; +end + + +global PARTITION; global COUNTS; +global SUMCOUNTS; global POP_LOGML; +clearGlobalVars; + +c.data=data; +c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; +c.dist=dist; c.Z=Z; c.rowsFromInd = rowsFromInd; + +ninds = length(unique(data(:,end))); +ekat = (1:rowsFromInd:ninds*rowsFromInd)'; +c.rows = [ekat ekat+rowsFromInd-1]; + +% partition compare +if ~isempty(partitionCompare) + nsamplingunits = size(c.rows,1); + partitions = partitionCompare.partitions; + npartitions = size(partitions,2); + partitionLogml = zeros(1,npartitions); + for i = 1:npartitions + % number of unique partition lables + npops = length(unique(partitions(:,i))); + + partitionInd = zeros(ninds*rowsFromInd,1); + partitionSample = partitions(:,i); + for j = 1:nsamplingunits + partitionInd([c.rows(j,1):c.rows(j,2)]) = partitionSample(j); + end + partitionLogml(i) = ... + initialCounts(partitionInd, data(:,1:end-1), npops, c.rows, noalle, adjprior); + + end + % return the logml result + partitionCompare.logmls = partitionLogml; + set(h1, 'userdata', partitionCompare); + return +end + +if fixedK + [logml, npops, partitionSummary]=indMix_fixK(c); +else + [logml, npops, partitionSummary]=indMix(c); +end + +if logml==1 + return +end + +data = data(:,1:end-1); + +h0 = findobj('Tag','filename1_text'); inp = get(h0,'String'); +h0 = findobj('Tag','filename2_text'); +outp = get(h0,'String'); +changesInLogml = writeMixtureInfo(logml, rowsFromInd, data, adjprior, priorTerm, ... + outp,inp,partitionSummary, popnames, fixedK); + +viewMixPartition(PARTITION, popnames); + +talle = questdlg(['Do you want to save the mixture populations ' ... + 'so that you can use them later in admixture analysis?'], ... + 'Save results?','Yes','No','Yes'); +if isequal(talle,'Yes') + waitALittle; + [filename, pathname] = uiputfile('*.mat','Save results as'); + + % ------------------------------------------- + % Added by Jing, 26.12.2005 + if (sum(filename)==0) || (sum(pathname)==0) + % Cancel was pressed + return; + else + % copy 'baps4_output.baps' into the text file with the same name. + if exist('baps4_output.baps','file') + copyfile('baps4_output.baps',[pathname filename '.txt']) + delete('baps4_output.baps') + end + end; + % ------------------------------------------- + + c.PARTITION = PARTITION; c.COUNTS = COUNTS; c.SUMCOUNTS = SUMCOUNTS; + c.alleleCodes = alleleCodes; c.adjprior = adjprior; c.popnames = popnames; + c.rowsFromInd = rowsFromInd; c.data = data; c.npops = npops; + c.noalle = noalle; c.mixtureType = 'mix'; + c.logml = logml; c.changesInLogml = changesInLogml; +% save([pathname filename], 'c'); + save([pathname filename], 'c', '-v7.3'); % added by Lu Cheng, 08.06.2012 +else + if exist('baps4_output.baps','file') + delete('baps4_output.baps') + end +end + + + + + +%------------------------------------------------------------------------------------- +%------------------------------------------------------------------------------------- + +function clearGlobalVars + +global COUNTS; COUNTS = []; +global SUMCOUNTS; SUMCOUNTS = []; +global PARTITION; PARTITION = []; +global POP_LOGML; POP_LOGML = []; + + +%------------------------------------------------------------------------------------- + + +function rows = computeRows(rowsFromInd, inds, ninds) +% On annettu yksilöt inds. Funktio palauttaa vektorin, joka +% sisältää niiden rivien numerot, jotka sisältävät yksilöiden +% dataa. + +rows = inds(:, ones(1,rowsFromInd)); +rows = rows*rowsFromInd; +miinus = repmat(rowsFromInd-1 : -1 : 0, [ninds 1]); +rows = rows - miinus; +rows = reshape(rows', [1,rowsFromInd*ninds]); + + +%-------------------------------------------------------------------------- + + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex) +% Tiedetään, ett?annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ss?ei viel?ole +% annettua logml arvoa, niin lisätään worstIndex:in kohtaan uusi logml ja +% nykyist?partitiota vastaava nclusters:in arvo. Muutoin ei tehd?mitään. + +apu = find(abs(partitionSummary(:,2)-logml)<1e-5); +if isempty(apu) + % Nyt löydetty partitio ei ole viel?kirjattuna summaryyn. + global PARTITION; + npops = length(unique(PARTITION)); + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end + + +%-------------------------------------------------------------------------- + + +function [suurin, i2] = arvoSeuraavaTila(muutokset, logml) +% Suorittaa yksilön seuraavan tilan arvonnan + +y = logml + muutokset; % siirron jälkeiset logml:t +y = y - max(y); +y = exp(y); +summa = sum(y); +y = y/summa; +y = cumsum(y); + +i2 = rand_disc(y); % uusi kori +suurin = muutokset(i2); + + +%-------------------------------------------------------------------------------------- + + +function svar=rand_disc(CDF) +%returns an index of a value from a discrete distribution using inversion method +slump=rand; +har=find(CDF>slump); +svar=har(1); + + +%------------------------------------------------------------------------------------- + + +function updateGlobalVariables(ind, i2, rowsFromInd, diffInCounts, ... + adjprior, priorTerm) +% Suorittaa globaalien muuttujien muutokset, kun yksil?ind +% on siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; + +i1 = PARTITION(ind); +PARTITION(ind)=i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + + +%--------------------------------------------------------------------------------- + + +function updateGlobalVariables2( ... + i1, i2, rowsFromInd, diffInCounts, adjprior, priorTerm); +% Suorittaa globaalien muuttujien muutokset, kun kaikki +% korissa i1 olevat yksilöt siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; + +inds = find(PARTITION==i1); +PARTITION(inds) = i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML(i1) = 0; +POP_LOGML(i2) = computePopulationLogml(i2, adjprior, priorTerm); + + +%------------------------------------------------------------------------------------ + + +function updateGlobalVariables3(muuttuvat, rowsFromInd, diffInCounts, ... + adjprior, priorTerm, i2); +% Suorittaa globaalien muuttujien päivitykset, kun yksilöt 'muuttuvat' +% siirretään koriin i2. Ennen siirtoa yksilöiden on kuuluttava samaan +% koriin. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; + +i1 = PARTITION(muuttuvat(1)); +PARTITION(muuttuvat) = i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + + +%---------------------------------------------------------------------- + + +function inds = returnInOrder(inds, pop, rowsFromInd, data, adjprior, priorTerm) +% Palauttaa yksilöt järjestyksess?siten, ett?ensimmäisen?on +% se, jonka poistaminen populaatiosta pop nostaisi logml:n +% arvoa eniten. + +global COUNTS; global SUMCOUNTS; +ninds = length(inds); +apuTaulu = [inds, zeros(ninds,1)]; + +for i=1:ninds + ind = inds(i); + rows = (ind-1)*rowsFromInd+1 : ind*rowsFromInd; + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,pop) = COUNTS(:,:,pop)-diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)-diffInSumCounts; + apuTaulu(i, 2) = computePopulationLogml(pop, adjprior, priorTerm); + COUNTS(:,:,pop) = COUNTS(:,:,pop)+diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)+diffInSumCounts; +end +apuTaulu = sortrows(apuTaulu,2); +inds = apuTaulu(ninds:-1:1,1); + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = ... + laskeMuutokset(ind, rowsFromInd, data, adjprior, priorTerm) +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli yksil?ind siirretään koriin i. +% diffInCounts on poistettava COUNTS:in siivusta i1 ja lisättäv? +% COUNTS:in siivuun i2, mikäli muutos toteutetaan. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +npops = size(COUNTS,3); +muutokset = zeros(npops,1); + +i1 = PARTITION(ind); +i1_logml = POP_LOGML(i1); + +rows = (ind-1)*rowsFromInd+1 : ind*rowsFromInd; +diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +new_i1_logml = computePopulationLogml(i1, adjprior, priorTerm); +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = POP_LOGML(i2); + +COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[npops-1 1]); +new_i2_logml = computePopulationLogml(i2, adjprior, priorTerm); +COUNTS(:,:,i2) = COUNTS(:,:,i2)-repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)-repmat(diffInSumCounts,[npops-1 1]); + +muutokset(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = laskeMuutokset2( ... + i1, rowsFromInd, data, adjprior, priorTerm); +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli korin i1 kaikki yksilöt siirretään +% koriin i. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +npops = size(COUNTS,3); +muutokset = zeros(npops,1); + +i1_logml = POP_LOGML(i1); + +inds = find(PARTITION==i1); +ninds = length(inds); + +if ninds==0 + diffInCounts = zeros(size(COUNTS,1), size(COUNTS,2)); + return; +end + +rows = computeRows(rowsFromInd, inds, ninds); + +diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +new_i1_logml = computePopulationLogml(i1, adjprior, priorTerm); +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = POP_LOGML(i2); + +COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[npops-1 1]); +new_i2_logml = computePopulationLogml(i2, adjprior, priorTerm); +COUNTS(:,:,i2) = COUNTS(:,:,i2)-repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)-repmat(diffInSumCounts,[npops-1 1]); + +muutokset(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + + + +%------------------------------------------------------------------------------------ + + +function muutokset = laskeMuutokset3(T2, inds2, rowsFromInd, ... + data, adjprior, priorTerm, i1) +% Palauttaa length(unique(T2))*npops taulun, jossa (i,j):s alkio +% kertoo, mik?olisi muutos logml:ss? jos populaation i1 osapopulaatio +% inds2(find(T2==i)) siirretään koriin j. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +npops = size(COUNTS,3); +npops2 = length(unique(T2)); +muutokset = zeros(npops2, npops); + +i1_logml = POP_LOGML(i1); + +for pop2 = 1:npops2 + inds = inds2(find(T2==pop2)); + ninds = length(inds); + if ninds>0 + rows = computeRows(rowsFromInd, inds, ninds); + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; + new_i1_logml = computePopulationLogml(i1, adjprior, priorTerm); + COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + + i2 = [1:i1-1 , i1+1:npops]; + i2_logml = POP_LOGML(i2)'; + + COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 npops-1]); + SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[npops-1 1]); + new_i2_logml = computePopulationLogml(i2, adjprior, priorTerm)'; + COUNTS(:,:,i2) = COUNTS(:,:,i2)-repmat(diffInCounts, [1 1 npops-1]); + SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)-repmat(diffInSumCounts,[npops-1 1]); + + muutokset(pop2,i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + end +end + + +%------------------------------------------------------------------------------------ + +function muutokset = laskeMuutokset5(inds, rowsFromInd, data, adjprior, ... + priorTerm, i1, i2) + +% Palauttaa length(inds)*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli yksil?i vaihtaisi koria i1:n ja i2:n välill? + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; + +ninds = length(inds); +muutokset = zeros(ninds,1); + +i1_logml = POP_LOGML(i1); +i2_logml = POP_LOGML(i2); + +for i = 1:ninds + ind = inds(i); + if PARTITION(ind)==i1 + pop1 = i1; %mist? + pop2 = i2; %mihin + else + pop1 = i2; + pop2 = i1; + end + rows = (ind-1)*rowsFromInd+1 : ind*rowsFromInd; + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)-diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)-diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)+diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)+diffInSumCounts; + + PARTITION(ind) = pop2; + + new_logmls = computePopulationLogml([i1 i2], adjprior, priorTerm); + + muutokset(i) = sum(new_logmls); + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)+diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)+diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)-diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)-diffInSumCounts; + + PARTITION(ind) = pop1; +end + +muutokset = muutokset - i1_logml - i2_logml; + +%-------------------------------------------------------------------------- + + + +function diffInCounts = computeDiffInCounts(rows, max_noalle, nloci, data) +% Muodostaa max_noalle*nloci taulukon, jossa on niiden alleelien +% lukumäärät (vastaavasti kuin COUNTS:issa), jotka ovat data:n +% riveill?rows. + +diffInCounts = zeros(max_noalle, nloci); +for i=rows + row = data(i,:); + notEmpty = find(row>=0); + + if length(notEmpty)>0 + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) = ... + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) + 1; + end +end + + + +%------------------------------------------------------------------------------------ + + +function popLogml = computePopulationLogml(pops, adjprior, priorTerm) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + +global COUNTS; +global SUMCOUNTS; +x = size(COUNTS,1); +y = size(COUNTS,2); +z = length(pops); + +popLogml = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior,[1 1 length(pops)]) + COUNTS(:,:,pops)) ... + ,[x y z]),1),2)) - sum(gammaln(1+SUMCOUNTS(pops,:)),2) - priorTerm; + + +%----------------------------------------------------------------------------------- + + +function npops = poistaTyhjatPopulaatiot(npops) +% Poistaa tyhjentyneet populaatiot COUNTS:ista ja +% SUMCOUNTS:ista. Päivittää npops:in ja PARTITION:in. + +global COUNTS; +global SUMCOUNTS; +global PARTITION; + +notEmpty = find(any(SUMCOUNTS,2)); +COUNTS = COUNTS(:,:,notEmpty); +SUMCOUNTS = SUMCOUNTS(notEmpty,:); + +for n=1:length(notEmpty) + apu = find(PARTITION==notEmpty(n)); + PARTITION(apu)=n; +end +npops = length(notEmpty); + + +%---------------------------------------------------------------------------------- +%Seuraavat kolme funktiota liittyvat alkupartition muodostamiseen. + +function initial_partition=admixture_initialization(data_matrix,nclusters,Z) +size_data=size(data_matrix); +nloci=size_data(2)-1; +n=max(data_matrix(:,end)); +T=cluster_own(Z,nclusters); +initial_partition=zeros(size_data(1),1); +for i=1:n + kori=T(i); + here=find(data_matrix(:,end)==i); + for j=1:length(here) + initial_partition(here(j),1)=kori; + end +end + +function T = cluster_own(Z,nclust) +true=logical(1); +false=logical(0); +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); + % maximum number of clusters based on inconsistency + if m <= maxclust + T = (1:m)'; + elseif maxclust==1 + T = ones(m,1); + else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end + end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + + +%--------------------------------------------------------------------------------------- + + +function [newData, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = ... + handleData(raw_data) +% Alkuperäisen datan viimeinen sarake kertoo, milt?yksilölt? +% kyseinen rivi on peräisin. Funktio tutkii ensin, ett?montako +% rivi?maksimissaan on peräisin yhdelt?yksilölt? jolloin saadaan +% tietää onko kyseess?haploidi, diploidi jne... Tämän jälkeen funktio +% lisää tyhji?rivej?niille yksilöille, joilta on peräisin vähemmän +% rivej?kuin maksimimäär? +% Mikäli jonkin alleelin koodi on =0, funktio muuttaa tämän alleelin +% koodi pienimmäksi koodiksi, joka isompi kuin mikään käytöss?oleva koodi. +% Tämän jälkeen funktio muuttaa alleelikoodit siten, ett?yhden lokuksen j +% koodit saavat arvoja välill?1,...,noalle(j). +data = raw_data; +nloci=size(raw_data,2)-1; + +dataApu = data(:,1:nloci); +nollat = find(dataApu==0); +if ~isempty(nollat) + isoinAlleeli = max(max(dataApu)); + dataApu(nollat) = isoinAlleeli+1; + data(:,1:nloci) = dataApu; +end +dataApu = []; nollat = []; isoinAlleeli = []; + +noalle=zeros(1,nloci); +alleelitLokuksessa = cell(nloci,1); +for i=1:nloci + alleelitLokuksessaI = unique(data(:,i)); + alleelitLokuksessa{i,1} = alleelitLokuksessaI(find(alleelitLokuksessaI>=0)); + noalle(i) = length(alleelitLokuksessa{i,1}); +end +alleleCodes = zeros(max(noalle),nloci); +for i=1:nloci + alleelitLokuksessaI = alleelitLokuksessa{i,1}; + puuttuvia = max(noalle)-length(alleelitLokuksessaI); + alleleCodes(:,i) = [alleelitLokuksessaI; zeros(puuttuvia,1)]; +end + +for loc = 1:nloci + for all = 1:noalle(loc) + data(find(data(:,loc)==alleleCodes(all,loc)), loc)=all; + end; +end; + +nind = max(data(:,end)); +nrows = size(data,1); +ncols = size(data,2); +rowsFromInd = zeros(nind,1); +for i=1:nind + rowsFromInd(i) = length(find(data(:,end)==i)); +end +maxRowsFromInd = max(rowsFromInd); +a = -999; +emptyRow = repmat(a, 1, ncols); +lessThanMax = find(rowsFromInd < maxRowsFromInd); +missingRows = maxRowsFromInd*nind - nrows; +data = [data; zeros(missingRows, ncols)]; +pointer = 1; +for ind=lessThanMax' %Käy läpi ne yksilöt, joilta puuttuu rivej? + miss = maxRowsFromInd-rowsFromInd(ind); % Tält?yksilölt?puuttuvien lkm. + for j=1:miss + rowToBeAdded = emptyRow; + rowToBeAdded(end) = ind; + data(nrows+pointer, :) = rowToBeAdded; + pointer = pointer+1; + end +end +data = sortrows(data, ncols); % Sorttaa yksilöiden mukaisesti +newData = data; +rowsFromInd = maxRowsFromInd; + +adjprior = zeros(max(noalle),nloci); +priorTerm = 0; +for j=1:nloci + adjprior(:,j) = [repmat(1/noalle(j), [noalle(j),1]) ; ones(max(noalle)-noalle(j),1)]; + priorTerm = priorTerm + noalle(j)*gammaln(1/noalle(j)); +end + + +%---------------------------------------------------------------------------------------- + + +function [Z, dist] = newGetDistances(data, rowsFromInd) + +ninds = max(data(:,end)); +nloci = size(data,2)-1; +riviLkm = nchoosek(ninds,2); + +empties = find(data<0); +data(empties)=0; +data = uint8(data); % max(noalle) oltava <256 + +pariTaulu = zeros(riviLkm,2); +aPointer=1; +for a=1:ninds-1 + pariTaulu(aPointer:aPointer+ninds-1-a,1) = ones(ninds-a,1)*a; + pariTaulu(aPointer:aPointer+ninds-1-a,2) = (a+1:ninds)'; + aPointer = aPointer+ninds-a; +end + +eka = pariTaulu(:,ones(1,rowsFromInd)); +eka = eka * rowsFromInd; +miinus = repmat(rowsFromInd-1 : -1 : 0, [riviLkm 1]); +eka = eka - miinus; + +toka = pariTaulu(:,ones(1,rowsFromInd)*2); +toka = toka * rowsFromInd; +toka = toka - miinus; + +%eka = uint16(eka); +%toka = uint16(toka); + +summa = zeros(riviLkm,1); +vertailuja = zeros(riviLkm,1); + +clear pariTaulu; clear miinus; + +x = zeros(size(eka)); x = uint8(x); +y = zeros(size(toka)); y = uint8(y); + +for j=1:nloci; + + for k=1:rowsFromInd + x(:,k) = data(eka(:,k),j); + y(:,k) = data(toka(:,k),j); + end + + for a=1:rowsFromInd + for b=1:rowsFromInd + vertailutNyt = double(x(:,a)>0 & y(:,b)>0); + vertailuja = vertailuja + vertailutNyt; + lisays = (x(:,a)~=y(:,b) & vertailutNyt); + summa = summa+double(lisays); + end + end +end + +clear x; clear y; clear vertailutNyt; +nollat = find(vertailuja==0); +dist = zeros(length(vertailuja),1); +dist(nollat) = 1; +muut = find(vertailuja>0); +dist(muut) = summa(muut)./vertailuja(muut); +clear summa; clear vertailuja; + +Z = linkage(dist'); + + +%---------------------------------------------------------------------------------------- + + +function [Z, distances]=getDistances(data_matrix,nclusters) + +%finds initial admixture clustering solution with nclusters clusters, uses simple mean Hamming distance +%gives partition in 8-bit format +%allocates all alleles of a single individual into the same basket +%data_matrix contains #Loci+1 columns, last column indicate whose alleles are placed in each row, +%i.e. ranges from 1 to #individuals. For diploids there are 2 rows per individual, for haploids only a single row +%missing values are indicated by zeros in the partition and by negative integers in the data_matrix. + +size_data=size(data_matrix); +nloci=size_data(2)-1; +n=max(data_matrix(:,end)); +distances=zeros(nchoosek(n,2),1); +pointer=1; +for i=1:n-1 + i_data=data_matrix(find(data_matrix(:,end)==i),1:nloci); + for j=i+1:n + d_ij=0; + j_data=data_matrix(find(data_matrix(:,end)==j),1:nloci); + vertailuja = 0; + for k=1:size(i_data,1) + for l=1:size(j_data,1) + here_i=find(i_data(k,:)>=0); + here_j=find(j_data(l,:)>=0); + here_joint=intersect(here_i,here_j); + vertailuja = vertailuja + length(here_joint); + d_ij = d_ij + length(find(i_data(k,here_joint)~=j_data(l,here_joint))); + end + end + d_ij = d_ij / vertailuja; + distances(pointer)=d_ij; + pointer=pointer+1; + end +end + +Z=linkage(distances'); + + + +%---------------------------------------------------------------------------------------- + + +function Z = linkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 | m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end + + +%----------------------------------------------------------------------------------- + + +function popnames = initPopNames(nameFile, indexFile) +%Palauttaa tyhjän, mikäli nimitiedosto ja indeksitiedosto +% eivät olleet yht?pitki? + +popnames = []; +indices = load(indexFile); + +fid = fopen(nameFile); +if fid == -1 + %File didn't exist + msgbox('Loading of the population names was unsuccessful', ... + 'Error', 'error'); + return; +end; +line = fgetl(fid); +counter = 1; +while (line ~= -1) & ~isempty(line) + names{counter} = line; + line = fgetl(fid); + counter = counter + 1; +end; +fclose(fid); + +if length(names) ~= length(indices) + disp('The number of population names must be equal to the number of '); + disp('entries in the file specifying indices of the first individuals of '); + disp('each population.'); + return; +end + +popnames = cell(length(names), 2); +for i = 1:length(names) + popnames{i,1} = names(i); + popnames{i,2} = indices(i); +end + + +%----------------------------------------------------------------------------------- + + +%-------------------------------------------------------------------------- + +function logml = ... + initialCounts(partition, data, npops, rows, noalle, adjprior) + +nloci=size(data,2); +ninds = size(rows, 1); + +koot = rows(:,1) - rows(:,2) + 1; +maxSize = max(koot); + +counts = zeros(max(noalle),nloci,npops); +sumcounts = zeros(npops,nloci); +for i=1:npops + for j=1:nloci + havainnotLokuksessa = find(partition==i & data(:,j)>=0); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(havainnotLokuksessa,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + +%initializeGammaln(ninds, maxSize, max(noalle)); + +logml = laskeLoggis(counts, sumcounts, adjprior); + + +%----------------------------------------------------------------------- + + +function logml=computeLogml(counts, sumcounts, noalle, data, rowsFromInd) +nloci = size(counts,2); +npops = size(counts,3); +adjnoalle = zeros(max(noalle),nloci); +for j=1:nloci + adjnoalle(1:noalle(j),j)=noalle(j); + if (noalle(j)0 + fid = fopen(outPutFile,'a'); +else + fid = -1; + diary('baps4_output.baps'); % save in text anyway. +end + +dispLine; +disp('RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:'); +disp(['Data file: ' inputFile]); +disp(['Model: independent']); +disp(['Number of clustered individuals: ' ownNum2Str(ninds)]); +disp(['Number of groups in optimal partition: ' ownNum2Str(npops)]); +disp(['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); +disp(' '); +if (fid ~= -1) + fprintf(fid,'%s \n', ['RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:']); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Data file: ' inputFile]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of clustered individuals: ' ownNum2Str(ninds)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of groups in optimal partition: ' ownNum2Str(npops)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); fprintf(fid,'\n'); +end + +cluster_count = length(unique(PARTITION)); +disp(['Best Partition: ']); +if (fid ~= -1) + fprintf(fid,'%s \n',['Best Partition: ']); fprintf(fid,'\n'); +end +for m=1:cluster_count + indsInM = find(PARTITION==m); + length_of_beginning = 11 + floor(log10(m)); + cluster_size = length(indsInM); + + if names + text = ['Cluster ' num2str(m) ': {' char(popnames{indsInM(1)})]; + for k = 2:cluster_size + text = [text ', ' char(popnames{indsInM(k)})]; + end; + else + text = ['Cluster ' num2str(m) ': {' num2str(indsInM(1))]; + for k = 2:cluster_size + text = [text ', ' num2str(indsInM(k))]; + end; + end + text = [text '}']; + while length(text)>58 + %Take one line and display it. + new_line = takeLine(text,58); + text = text(length(new_line)+1:end); + disp(new_line); + if (fid ~= -1) + fprintf(fid,'%s \n',[new_line]); + fprintf(fid,'\n'); + end + if length(text)>0 + text = [blanks(length_of_beginning) text]; + else + text = []; + end; + end; + if ~isempty(text) + disp(text); + if (fid ~= -1) + fprintf(fid,'%s \n',[text]); + fprintf(fid,'\n'); + end + end; +end + +if npops > 1 + disp(' '); + disp(' '); + disp('Changes in log(marginal likelihood) if indvidual i is moved to group j:'); + if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['Changes in log(marginal likelihood) if indvidual i is moved to group j:']); fprintf(fid, '\n'); + end + + if names + nameSizes = zeros(ninds,1); + for i = 1:ninds + nimi = char(popnames{i}); + nameSizes(i) = length(nimi); + end + maxSize = max(nameSizes); + maxSize = max(maxSize, 5); + erotus = maxSize - 5; + alku = blanks(erotus); + ekarivi = [alku ' ind' blanks(6+erotus)]; + else + ekarivi = ' ind '; + end + for i = 1:cluster_count + ekarivi = [ekarivi ownNum2Str(i) blanks(8-floor(log10(i)))]; + end + disp(ekarivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [ekarivi]); fprintf(fid, '\n'); + end + + %ninds = size(data,1)/rowsFromInd; + changesInLogml = LOGDIFF'; + for ind = 1:ninds + %[muutokset, diffInCounts] = laskeMuutokset(ind, rowsFromInd, data, ... + % adjprior, priorTerm); + %changesInLogml(:,ind) = muutokset; + muutokset = changesInLogml(:,ind); + + if names + nimi = char(popnames{ind}); + rivi = [blanks(maxSize - length(nimi)) nimi ':']; + else + rivi = [blanks(4-floor(log10(ind))) ownNum2Str(ind) ':']; + end + for j = 1:npops + rivi = [rivi ' ' logml2String(omaRound(muutokset(j)))]; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [rivi]); fprintf(fid, '\n'); + end + end + + disp(' '); disp(' '); + disp('KL-divergence matrix in PHYLIP format:'); + + dist_mat = zeros(npops, npops); + if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['KL-divergence matrix in PHYLIP format:']); %fprintf(fid, '\n'); + end + + maxnoalle = size(COUNTS,1); + nloci = size(COUNTS,2); + d = zeros(maxnoalle, nloci, npops); + prior = adjprior; + prior(find(prior==1))=0; + nollia = find(all(prior==0)); %Lokukset, joissa oli havaittu vain yht?alleelia. + prior(1,nollia)=1; + for pop1 = 1:npops + d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))+prior) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))+prior),maxnoalle,1); + %dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1); + end +% ekarivi = blanks(7); +% for pop = 1:npops +% ekarivi = [ekarivi num2str(pop) blanks(7-floor(log10(pop)))]; +% end + ekarivi = num2str(npops); + disp(ekarivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [ekarivi]); %fprintf(fid, '\n'); + end + + for pop1 = 1:npops + % rivi = [blanks(2-floor(log10(pop1))) num2str(pop1) ' ']; + for pop2 = 1:pop1-1 + dist1 = d(:,:,pop1); dist2 = d(:,:,pop2); + div12 = sum(sum(dist1.*log2((dist1+10^-10) ./ (dist2+10^-10))))/nloci; + div21 = sum(sum(dist2.*log2((dist2+10^-10) ./ (dist1+10^-10))))/nloci; + div = (div12+div21)/2; + % rivi = [rivi kldiv2str(div) ' ']; + dist_mat(pop1,pop2) = div; + end + % disp(rivi); + % if (fid ~= -1) + % fprintf(fid, '%s \n', [rivi]); fprintf(fid, '\n'); + % end + end + + + dist_mat = dist_mat + dist_mat'; % make it symmetric + for pop1 = 1:npops + rivi = ['Cluster_' num2str(pop1) ' ']; + for pop2 = 1:npops + rivi = [rivi kldiv2str(dist_mat(pop1,pop2)) ' ']; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [rivi]); %fprintf(fid, '\n'); + end + end +end + +disp(' '); +disp(' '); +disp('List of sizes of 10 best visited partitions and corresponding log(ml) values'); + +if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['List of sizes of 10 best visited partitions and corresponding log(ml) values']); fprintf(fid, '\n'); +end + +partitionSummary = sortrows(partitionSummary,2); +partitionSummary = partitionSummary(size(partitionSummary,1):-1:1 , :); +partitionSummary = partitionSummary(find(partitionSummary(:,2)>-1e49),:); +if size(partitionSummary,1)>10 + vikaPartitio = 10; +else + vikaPartitio = size(partitionSummary,1); +end +for part = 1:vikaPartitio + line = [num2str(partitionSummary(part,1)) ' ' num2str(partitionSummary(part,2))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', [line]); fprintf(fid, '\n'); + end +end + +if ~fixedK + disp(' '); + disp(' '); + disp('Probabilities for number of clusters'); + + if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['Probabilities for number of clusters']); fprintf(fid, '\n'); + end + + npopsTaulu = unique(partitionSummary(:,1)); + len = length(npopsTaulu); + probs = zeros(len,1); + partitionSummary(:,2) = partitionSummary(:,2)-max(partitionSummary(:,2)); + sumtn = sum(exp(partitionSummary(:,2))); + for i=1:len + npopstn = sum(exp(partitionSummary(find(partitionSummary(:,1)==npopsTaulu(i)),2))); + probs(i) = npopstn / sumtn; + end + for i=1:len + if probs(i)>1e-5 + line = [num2str(npopsTaulu(i)) ' ' num2str(probs(i))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', [line]); fprintf(fid, '\n'); + end + end + end +end + +if (fid ~= -1) + fclose(fid); +else + diary off +end + + +%--------------------------------------------------------------- + + +function dispLine; +disp('---------------------------------------------------'); + +%-------------------------------------------------------------- + +function num2 = omaRound(num) +% Pyöristää luvun num 1 desimaalin tarkkuuteen +num = num*10; +num = round(num); +num2 = num/10; + +%--------------------------------------------------------- + + +function digit = palautaYks(num,yks) +% palauttaa luvun num 10^yks termin kertoimen +% string:in? +% yks täytyy olla kokonaisluku, joka on +% vähintään -1:n suuruinen. Pienemmill? +% luvuilla tapahtuu jokin pyöristysvirhe. + +if yks>=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); + + +function mjono = kldiv2str(div) +mjono = ' '; +if abs(div)<100 + %Ei tarvita e-muotoa + mjono(6) = num2str(rem(floor(div*1000),10)); + mjono(5) = num2str(rem(floor(div*100),10)); + mjono(4) = num2str(rem(floor(div*10),10)); + mjono(3) = '.'; + mjono(2) = num2str(rem(floor(div),10)); + arvo = rem(floor(div/10),10); + if arvo>0 + mjono(1) = num2str(arvo); + end + +else + suurinYks = floor(log10(div)); + mjono(6) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(div),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(div),suurinYks); +end + +%-------------------------------------------------------------------- + + +function newline = takeLine(description,width) +%Returns one line from the description: line ends to the first +%space after width:th mark. +newLine = description(1:width); +n = width+1; +while ~isspace(description(n)) & n 1)); +end + +%-------------------------------------------------------------------- + + +function kunnossa = testaaGenePopData(tiedostonNimi) +% kunnossa == 0, jos data ei ole kelvollinen genePop data. +% Muussa tapauksessa kunnossa == 1. + +kunnossa = 0; +fid = fopen(tiedostonNimi); +line1 = fgetl(fid); %ensimmäinen rivi +line2 = fgetl(fid); %toinen rivi +line3 = fgetl(fid); %kolmas + +if (isequal(line1,-1) | isequal(line2,-1) | isequal(line3,-1)) + disp('Incorrect file format 1168'); fclose(fid); + return +end +if (testaaPop(line1)==1 | testaaPop(line2)==1) + disp('Incorrect file format 1172'); fclose(fid); + return +end +if testaaPop(line3)==1 + %2 rivi tällöin lokusrivi + nloci = rivinSisaltamienMjonojenLkm(line2); + line4 = fgetl(fid); + if isequal(line4,-1) + disp('Incorrect file format 1180'); fclose(fid); + return + end + if ~any(line4==',') + % Rivin nelj?täytyy sisältää pilkku. + disp('Incorrect file format 1185'); fclose(fid); + return + end + pointer = 1; + while ~isequal(line4(pointer),',') %Tiedetään, ett?pysähtyy + pointer = pointer+1; + end + line4 = line4(pointer+1:end); %pilkun jälkeinen osa + nloci2 = rivinSisaltamienMjonojenLkm(line4); + if (nloci2~=nloci) + disp('Incorrect file format 1195'); fclose(fid); + return + end +else + line = fgetl(fid); + lineNumb = 4; + while (testaaPop(line)~=1 & ~isequal(line,-1)) + line = fgetl(fid); + lineNumb = lineNumb+1; + end + if isequal(line,-1) + disp('Incorrect file format 1206'); fclose(fid); + return + end + nloci = lineNumb-2; + line4 = fgetl(fid); %Eka rivi pop sanan jälkeen + if isequal(line4,-1) + disp('Incorrect file format 1212'); fclose(fid); + return + end + if ~any(line4==',') + % Rivin täytyy sisältää pilkku. + disp('Incorrect file format 1217'); fclose(fid); + return + end + pointer = 1; + while ~isequal(line4(pointer),',') %Tiedetään, ett?pysähtyy. + pointer = pointer+1; + end + + line4 = line4(pointer+1:end); %pilkun jälkeinen osa + nloci2 = rivinSisaltamienMjonojenLkm(line4); + if (nloci2~=nloci) + disp('Incorrect file format 1228'); fclose(fid); + return + end +end +kunnossa = 1; +fclose(fid); + +%------------------------------------------------------ + + +function [data, popnames] = lueGenePopData(tiedostonNimi) + +fid = fopen(tiedostonNimi); +line = fgetl(fid); %ensimmäinen rivi +line = fgetl(fid); %toinen rivi +count = rivinSisaltamienMjonojenLkm(line); + +line = fgetl(fid); +lokusRiveja = 1; +while (testaaPop(line)==0) + lokusRiveja = lokusRiveja+1; + line = fgetl(fid); +end + +if lokusRiveja>1 + nloci = lokusRiveja; +else + nloci = count; +end + +popnames = cell(10,2); +data = zeros(100, nloci+1); +nimienLkm=0; +ninds=0; +poimiNimi=1; +digitFormat = -1; +while line ~= -1 + line = fgetl(fid); + + if poimiNimi==1 + %Edellinen rivi oli 'pop' + nimienLkm = nimienLkm+1; + ninds = ninds+1; + if nimienLkm>size(popnames,1); + popnames = [popnames; cell(10,2)]; + end + nimi = lueNimi(line); + if digitFormat == -1 + digitFormat = selvitaDigitFormat(line); + divider = 10^digitFormat; + end + popnames{nimienLkm, 1} = {nimi}; %Näin se on greedyMix:issäkin?!? + popnames{nimienLkm, 2} = ninds; + poimiNimi=0; + + data = addAlleles(data, ninds, line, divider); + + elseif testaaPop(line) + poimiNimi = 1; + + elseif line ~= -1 + ninds = ninds+1; + data = addAlleles(data, ninds, line, divider); + end +end + +data = data(1:ninds*2,:); +popnames = popnames(1:nimienLkm,:); +fclose(fid); + +%-------------------------------------------------------- + + +function data = addAlleles(data, ind, line, divider) +% Lisaa BAPS-formaatissa olevaan datataulukkoon +% yksilöä ind vastaavat rivit. Yksilön alleelit +% luetaan genepop-formaatissa olevasta rivist? +% line. Jos data on 3 digit formaatissa on divider=1000. +% Jos data on 2 digit formaatissa on divider=100. + +nloci = size(data,2)-1; +if size(data,1) < 2*ind + data = [data; zeros(100,nloci+1)]; +end + +k=1; +merkki=line(k); +while ~isequal(merkki,',') + k=k+1; + merkki=line(k); +end +line = line(k+1:end); +clear k; clear merkki; + +alleeliTaulu = sscanf(line,'%d'); + +if length(alleeliTaulu)~=nloci + disp('Incorrect data format.'); +end + +for j=1:nloci + ekaAlleeli = floor(alleeliTaulu(j)/divider); + if ekaAlleeli==0 ekaAlleeli=-999; end; + tokaAlleeli = rem(alleeliTaulu(j),divider); + if tokaAlleeli==0 tokaAlleeli=-999; end + + data(2*ind-1,j) = ekaAlleeli; + data(2*ind,j) = tokaAlleeli; +end + +data(2*ind-1,end) = ind; +data(2*ind,end) = ind; + +%------------------------------------------------------ + + +function count = rivinSisaltamienMjonojenLkm(line) +% Palauttaa line:n sisältämien mjonojen lukumäärän. +% Mjonojen väliss?täytyy olla välilyönti. +count = 0; +pit = length(line); +tila = 0; %0, jos odotetaan välilyöntej? 1 jos odotetaan muita merkkej? +for i=1:pit + merkki = line(i); + if (isspace(merkki) & tila==0) + %Ei tehd?mitään. + elseif (isspace(merkki) & tila==1) + tila = 0; + elseif (~isspace(merkki) & tila==0) + tila = 1; + count = count+1; + elseif (~isspace(merkki) & tila==1) + %Ei tehd?mitään + end +end + + +%------------------------------------------------------- + +function nimi = lueNimi(line) +%Palauttaa line:n alusta sen osan, joka on ennen pilkkua. +n = 1; +merkki = line(n); +nimi = ''; +while ~isequal(merkki,',') + nimi = [nimi merkki]; + n = n+1; + merkki = line(n); +end + +%------------------------------------------------------- + +function df = selvitaDigitFormat(line) +% line on ensimmäinen pop-sanan jälkeinen rivi +% Genepop-formaatissa olevasta datasta. funktio selvittää +% rivin muodon perusteella, ovatko datan alleelit annettu +% 2 vai 3 numeron avulla. + +n = 1; +merkki = line(n); +while ~isequal(merkki,',') + n = n+1; + merkki = line(n); +end + +while ~any(merkki == '0123456789'); + n = n+1; + merkki = line(n); +end +numeroja = 0; +while any(merkki == '0123456789'); + numeroja = numeroja+1; + n = n+1; + merkki = line(n); +end + +df = numeroja/2; + + + +function loggis = laskeLoggis(counts, sumcounts, adjprior) +npops = size(counts,3); + +logml2 = sum(sum(sum(gammaln(counts+repmat(adjprior,[1 1 npops]))))) ... + - npops*sum(sum(gammaln(adjprior))) - ... + sum(sum(gammaln(1+sumcounts))); +loggis = logml2; \ No newline at end of file diff --git a/matlab/independent/greedyPopMix.m b/matlab/independent/greedyPopMix.m new file mode 100644 index 0000000..14006cf --- /dev/null +++ b/matlab/independent/greedyPopMix.m @@ -0,0 +1,1685 @@ +function greedyPopMix + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; +clearGlobalVars; + +% check whether fixed k mode is selected +h0 = findobj('Tag','fixk_menu'); +fixedK = get(h0, 'userdata'); + +if fixedK + if ~(fixKWarning == 1) % call function fixKWarning + return + end +end + +% check whether partition compare mode is selected +h1 = findobj('Tag','partitioncompare_menu'); +partitionCompare = get(h1, 'userdata'); + +% LASKENNAN ALKUARVOJEN MÄÄRITTÄMINEN + +input_type = questdlg('Specify the format of your data: ',... + 'Specify Data Format', ... + 'BAPS-format', 'GenePop-format', 'Preprocessed data', ... + 'BAPS-format'); + +if isempty(input_type) + return +end + +if isequal(input_type,'BAPS-format') %Raakadata + waitALittle; + [filename, pathname] = uigetfile('*.txt', 'Load data in BAPS-format'); + if filename==0 + return; + end + if ~isempty(partitionCompare) + fprintf(1,'Data: %s\n',[pathname filename]); + end + data = load([pathname filename]); + ninds = testaaOnkoKunnollinenBapsData(data); %TESTAUS + if (ninds==0) + disp('Incorrect Data-file.'); + return; + end + [data, rows, alleleCodes, noalle, adjprior, priorTerm] = handlePopData(data); + rowsFromInd = 0; %Ei tiedet? + h0 = findobj('Tag','filename1_text'); + set(h0,'String',filename); clear h0; + + load_names = questdlg('Do you wish to specify the names of the groups?',... + 'Input group names?','Yes','No','Yes'); + if isequal(load_names,'Yes') + waitALittle; + [filename, pathname] = uigetfile('*.txt', 'Load group names'); + popnames = initPopNames([pathname filename]); + if (size(popnames,1)~=ninds) + disp('Incorrect name-file.'); + popnames = []; + end + else + popnames = []; + end + +elseif isequal(input_type,'GenePop-format') + waitALittle; + [filename, pathname] = uigetfile('*.txt', 'Load data in GenePop-format'); + if filename==0 + return; + end + if ~isempty(partitionCompare) + fprintf(1,'Data: %s\n',[pathname filename]); + end + kunnossa = testaaGenePopData([pathname filename]); + if kunnossa==0 + return + end + + [data, popnames]=lueGenePopDataPop([pathname filename]); + [data, rows, alleleCodes, noalle, adjprior, priorTerm] = handlePopData(data); + rowsFromInd = 2; %Tiedetään GenePop:in tapauksessa. + + h0 = findobj('Tag','filename1_text'); + set(h0,'String',filename); clear h0; +end + +if ~isequal(input_type, 'Preprocessed data') + a_data = data(:,1:end-1); + + npops = size(rows,1); + PARTITION = 1:npops'; %Jokainen "yksil? eli populaatio on oma ryhmäns? + [sumcounts, counts, logml] = ... + initialPopCounts(a_data, npops, rows, noalle, adjprior); + COUNTS = counts; SUMCOUNTS = sumcounts; + POP_LOGML = computePopulationLogml(1:npops, adjprior, priorTerm); + + clear('counts', 'sumcounts','pathname','filename','vast2',... + 'vast3','vast4'); + [Z,dist] = getPopDistancesByKL(adjprior); %Saadaan COUNTS:in avulla. + + save_preproc = questdlg('Do you wish to save pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + waitALittle; + [filename, pathname] = uiputfile('*.mat','Save pre-processed data as'); + kokonimi = [pathname filename]; + c.data = data; c.rows = rows; c.alleleCodes = alleleCodes; + c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; + c.dist = dist; c.Z = Z; c.popnames = popnames; c.rowsFromInd = rowsFromInd; + c.npops = npops; c.logml = logml; +% save(kokonimi,'c'); + save(kokonimi,'c','-v7.3'); % Lu Cheng, 08.06.2012 + clear c; + end; +end + +if isequal(input_type, 'Preprocessed data') + waitALittle; + [filename, pathname] = uigetfile('*.mat', 'Load pre-processed data'); + if filename==0 + return; + end + + if ~isempty(partitionCompare) + fprintf(1,'Data: %s\n',[pathname filename]); + end + + h0 = findobj('Tag','filename1_text'); + set(h0,'String',filename); clear h0; + %load([pathname filename],'c'); + %if ~exist('c') %TESTAUS + % disp('Incorrect file format.'); + % return + %elseif ~isfield(c,'rows') + % disp('Incorrect file format.'); + % return + %end + struct_array = load([pathname filename]); + if isfield(struct_array,'c') %Matlab versio + c = struct_array.c; + if ~isfield(c,'rows') + disp('Incorrect file format'); + return + end + elseif isfield(struct_array,'rows') %Mideva versio + c = struct_array; + else + disp('Incorrect file format'); + return; + end + data = double(c.data); rows = c.rows; alleleCodes = c.alleleCodes; + noalle = c.noalle; adjprior = c.adjprior; priorTerm = c.priorTerm; + dist = c.dist; Z = c.Z; popnames = c.popnames; rowsFromInd = c.rowsFromInd; + clear c; +end + +c.data=data; c.rows = rows; c.alleleCodes = alleleCodes; +c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; +c.dist=dist; c.Z=Z; c.rowsFromInd = rowsFromInd; + +% partition compare +if ~isempty(partitionCompare) + nsamplingunits = size(rows,1); + partitions = partitionCompare.partitions; + npartitions = size(partitions,2); + partitionLogml = zeros(1,npartitions); + for i = 1:npartitions + % number of unique partition lables + npops = length(unique(partitions(:,i))); + try + partitionInd = zeros(rows(end),1); + partitionSample = partitions(:,i); + for j = 1: nsamplingunits + partitionInd([c.rows(j,1):c.rows(j,2)]) = partitionSample(j); + end + partitionLogml(i) = ... + initialCounts(partitionInd, data(:,1:end-1), npops, c.rows, noalle, adjprior); + catch + disp('*** ERROR: unmatched data.'); + return + end + end + % return the logml result + partitionCompare.logmls = partitionLogml; + set(h1, 'userdata', partitionCompare); + return +end + +if fixedK + [logml, npops, partitionSummary]=indMix_fixK(c); +else + [logml, npops, partitionSummary]=indMix(c); +end + +if logml==1 + return +end + +data = data(:,1:end-1); +viewPopMixPartition(PARTITION, rows, popnames); +%npops = poistaTyhjatPopulaatiot(npops); +%POP_LOGML = computePopulationLogml(1:npops, adjprior, priorTerm); + +h0 = findobj('Tag','filename1_text'); inp = get(h0,'String'); +h0 = findobj('Tag','filename2_text'); +outp = get(h0,'String'); +changesInLogml = writeMixtureInfoPop(logml, rows, data, adjprior, priorTerm, ... + outp,inp,partitionSummary, popnames, fixedK); + +talle = questdlg(['Do you want to save the mixture populations ' ... + 'so that you can use them later in admixture analysis?'], ... + 'Save results?','Yes','No','Yes'); +if isequal(talle,'Yes') + waitALittle; + [filename, pathname] = uiputfile('*.mat','Save results as'); + + if (filename == 0) & (pathname == 0) + % Cancel was pressed + return + else % copy 'baps4_output.baps' into the text file with the same name. + if exist('baps4_output.baps','file') + copyfile('baps4_output.baps',[pathname filename '.txt']) + delete('baps4_output.baps') + end + end + + if rowsFromInd==0 + %Käytettiin BAPS-formaattia, eik?rowsFromInd ole tunnettu. + [popnames, rowsFromInd] = findOutRowsFromInd(popnames, rows); + end + + groupPartition = PARTITION; + + fiksaaPartitioYksiloTasolle(rows, rowsFromInd); + + c.PARTITION = PARTITION; c.COUNTS = COUNTS; c.SUMCOUNTS = SUMCOUNTS; + c.alleleCodes = alleleCodes; c.adjprior = adjprior; + c.rowsFromInd = rowsFromInd; c.popnames = popnames; + c.data = data; c.npops = npops; c.noalle = noalle; + c.mixtureType = 'popMix'; c.groupPartition = groupPartition; + c.rows = rows; c.logml = logml; c.changesInLogml = changesInLogml; +% save([pathname filename], 'c'); + save([pathname filename], 'c', '-v7.3'); % added by Lu Cheng, 08.06.2012 +else + if exist('baps4_output.baps','file') + delete('baps4_output.baps') + end +end + +%-------------------------------------------------------------------------- + + +function [newData, rows, alleleCodes, noalle, adjprior, priorTerm] = handlePopData(raw_data) +% Alkuperäisen datan viimeinen sarake kertoo, milt?yksilölt? +% kyseinen rivi on peräisin. Funktio muuttaa alleelikoodit +% siten, ett?yhden lokuksen j koodit saavat arvoja +% välill?1,...,noalle(j). Ennen tät?muutosta alleeli, jonka +% koodi on nolla muutetaan. + + +data = raw_data; +nloci=size(raw_data,2)-1; + +dataApu = data(:,1:nloci); +nollat = find(dataApu==0); +if ~isempty(nollat) + isoinAlleeli = max(max(dataApu)); + dataApu(nollat) = isoinAlleeli+1; + data(:,1:nloci) = dataApu; +end +dataApu = []; nollat = []; isoinAlleeli = []; + +noalle=zeros(1,nloci); +alleelitLokuksessa = cell(nloci,1); +for i=1:nloci + alleelitLokuksessaI = unique(data(:,i)); + alleelitLokuksessa{i,1} = alleelitLokuksessaI(find(alleelitLokuksessaI>=0)); + noalle(i) = length(alleelitLokuksessa{i,1}); +end +alleleCodes = zeros(max(noalle),nloci); +for i=1:nloci + alleelitLokuksessaI = alleelitLokuksessa{i,1}; + puuttuvia = max(noalle)-length(alleelitLokuksessaI); + alleleCodes(:,i) = [alleelitLokuksessaI; zeros(puuttuvia,1)]; +end + +for loc = 1:nloci + for all = 1:noalle(loc) + data(find(data(:,loc)==alleleCodes(all,loc)), loc)=all; + end; +end; + +nind = max(data(:,end)); +%rows = cell(nind,1); +rows = zeros(nind,2); +for i=1:nind + rivit = find(data(:,end)==i)'; + rows(i,1) = min(rivit); + rows(i,2) = max(rivit); +end +newData = data; + +adjprior = zeros(max(noalle),nloci); +priorTerm = 0; +for j=1:nloci + adjprior(:,j) = [repmat(1/noalle(j), [noalle(j),1]) ; ones(max(noalle)-noalle(j),1)]; + priorTerm = priorTerm + noalle(j)*gammaln(1/noalle(j)); +end + + +%---------------------------------------------------------------- + + +function clearGlobalVars + +global COUNTS; COUNTS = []; +global SUMCOUNTS; SUMCOUNTS = []; +global PARTITION; PARTITION = []; +global POP_LOGML; POP_LOGML = []; + + +%-------------------------------------------------------------------- + +function [Z,distances] = getPopDistancesByKL(adjprior) +% Laskee populaatioille etäisyydet +% käyttäen KL-divergenssi? +global COUNTS; +maxnoalle = size(COUNTS,1); +nloci = size(COUNTS,2); +npops = size(COUNTS,3); +distances = zeros(nchoosek(npops,2),1); + +d = zeros(maxnoalle, nloci, npops); +prior = adjprior; +prior(find(prior==1))=0; +nollia = find(all(prior==0)); %Lokukset, joissa oli havaittu vain yht?alleelia. +prior(1,nollia)=1; +for pop1 = 1:npops + d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))+prior) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))+prior),maxnoalle,1); + %dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1); +end +pointer = 1; +for pop1 = 1:npops-1 + for pop2 = pop1+1:npops + dist1 = d(:,:,pop1); dist2 = d(:,:,pop2); + div12 = sum(sum(dist1.*log2((dist1+10^-10) ./ (dist2+10^-10))))/nloci; + div21 = sum(sum(dist2.*log2((dist2+10^-10) ./ (dist1+10^-10))))/nloci; + div = (div12+div21)/2; + distances(pointer) = div; + pointer = pointer+1; + end +end +Z=linkage(distances'); + +%-------------------------------------------------------------------------- + + +function Z = linkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 | m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end + + +%----------------------------------------------------------------------- + + +function [sumcounts, counts, logml] = ... + initialPopCounts(data, npops, rows, noalle, adjprior) + +nloci=size(data,2); +counts = zeros(max(noalle),nloci,npops); +sumcounts = zeros(npops,nloci); + +for i=1:npops + for j=1:nloci + i_rivit = rows(i,1):rows(i,2); + havainnotLokuksessa = find(data(i_rivit,j)>=0); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(i_rivit,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + +logml = laskeLoggis(counts, sumcounts, adjprior); + + +%----------------------------------------------------------------------- + + +function loggis = laskeLoggis(counts, sumcounts, adjprior) +npops = size(counts,3); + +logml2 = sum(sum(sum(gammaln(counts+repmat(adjprior,[1 1 npops]))))) ... + - npops*sum(sum(gammaln(adjprior))) - ... + sum(sum(gammaln(1+sumcounts))); +loggis = logml2; + + +%-------------------------------------------------------------------- + + +function kunnossa = testaaGenePopData(tiedostonNimi) +% kunnossa == 0, jos data ei ole kelvollinen genePop data. +% Muussa tapauksessa kunnossa == 1. + +kunnossa = 0; +fid = fopen(tiedostonNimi); +line1 = fgetl(fid); %ensimmäinen rivi +line2 = fgetl(fid); %toinen rivi +line3 = fgetl(fid); %kolmas + +if (isequal(line1,-1) | isequal(line2,-1) | isequal(line3,-1)) + disp('Incorrect file format'); fclose(fid); + return +end +if (testaaPop(line1)==1 | testaaPop(line2)==1) + disp('Incorrect file format'); fclose(fid); + return +end +if testaaPop(line3)==1 + %2 rivi tällöin lokusrivi + nloci = rivinSisaltamienMjonojenLkm(line2); + line4 = fgetl(fid); + if isequal(line4,-1) + disp('Incorrect file format'); fclose(fid); + return + end + if ~any(line4==',') + % Rivin nelj?täytyy sisältää pilkku. + disp('Incorrect file format'); fclose(fid); + return + end + pointer = 1; + while ~isequal(line4(pointer),',') %Tiedetään, ett?pysähtyy + pointer = pointer+1; + end + line4 = line4(pointer+1:end); %pilkun jälkeinen osa + nloci2 = rivinSisaltamienMjonojenLkm(line4); + if (nloci2~=nloci) + disp('Incorrect file format'); fclose(fid); + return + end +else + line = fgetl(fid); + lineNumb = 4; + while (testaaPop(line)~=1 & ~isequal(line,-1)) + line = fgetl(fid); + lineNumb = lineNumb+1; + end + if isequal(line,-1) + disp('Incorrect file format'); fclose(fid); + return + end + nloci = lineNumb-2; + line4 = fgetl(fid); %Eka rivi pop sanan jälkeen + if isequal(line4,-1) + disp('Incorrect file format'); fclose(fid); + return + end + if ~any(line4==',') + % Rivin täytyy sisältää pilkku. + disp('Incorrect file format'); fclose(fid); + return + end + pointer = 1; + while ~isequal(line4(pointer),',') %Tiedetään, ett?pysähtyy. + pointer = pointer+1; + end + + line4 = line4(pointer+1:end); %pilkun jälkeinen osa + nloci2 = rivinSisaltamienMjonojenLkm(line4); + if (nloci2~=nloci) + disp('Incorrect file format'); fclose(fid); + return + end +end +kunnossa = 1; +fclose(fid); + +%-------------------------------------------------------------------- + + +function [data, popnames] = lueGenePopDataPop(tiedostonNimi) +% Data annetaan muodossa, jossa viimeinen sarake kertoo ryhmän. +% popnames on kuten ennenkin. + +fid = fopen(tiedostonNimi); +line = fgetl(fid); %ensimmäinen rivi +line = fgetl(fid); %toinen rivi +count = rivinSisaltamienMjonojenLkm(line); + +line = fgetl(fid); +lokusRiveja = 1; +while (testaaPop(line)==0) + lokusRiveja = lokusRiveja+1; + line = fgetl(fid); +end + +if lokusRiveja>1 + nloci = lokusRiveja; +else + nloci = count; +end + +popnames = cell(10,2); +data = zeros(100, nloci+1); +nimienLkm=0; +ninds=0; +poimiNimi=1; +digitFormat = -1; +while line ~= -1 + line = fgetl(fid); + + if poimiNimi==1 + %Edellinen rivi oli 'pop' + nimienLkm = nimienLkm+1; + ninds = ninds+1; + if nimienLkm>size(popnames,1); + popnames = [popnames; cell(10,2)]; + end + nimi = lueNimi(line); + if digitFormat == -1 + digitFormat = selvitaDigitFormat(line); + divider = 10^digitFormat; + end + popnames{nimienLkm, 1} = {nimi}; %Näin se on greedyMix:issäkin?!? + popnames{nimienLkm, 2} = ninds; + poimiNimi=0; + + data = addAlleles(data, ninds, line, divider); + + elseif testaaPop(line) + poimiNimi = 1; + + elseif line ~= -1 + ninds = ninds+1; + data = addAlleles(data, ninds, line, divider); + end +end + +fclose(fid); +data = data(1:ninds*2,:); +popnames = popnames(1:nimienLkm,:); +npops = size(popnames,1); +ind = 1; +for pop = 1:npops + if pop=0); + + if length(notEmpty)>0 + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) = ... + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) + 1; + end +end + +%------------------------------------------------------------------------ + + +%------------------------------------------------------------------------------------- + + +function updateGlobalVariables(ind, i2, diffInCounts, ... + adjprior, priorTerm) +% Suorittaa globaalien muuttujien muutokset, kun yksil?ind +% on siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; + +i1 = PARTITION(ind); +PARTITION(ind)=i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + + +%-------------------------------------------------------------------------- +%-- + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = laskeMuutokset2( ... + i1, globalRows, data, adjprior, priorTerm); +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli korin i1 kaikki yksilöt siirretään +% koriin i. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +npops = size(COUNTS,3); +muutokset = zeros(npops,1); + +i1_logml = POP_LOGML(i1); + +inds = find(PARTITION==i1); +ninds = length(inds); + +if ninds==0 + diffInCounts = zeros(size(COUNTS,1), size(COUNTS,2)); + return; +end + +rows = []; +for i = 1:ninds + ind = inds(i); + lisa = globalRows(ind,1):globalRows(ind,2); + rows = [rows; lisa']; + %rows = [rows; globalRows{ind}']; +end + +diffInCounts = computeDiffInCounts(rows', size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +new_i1_logml = computePopulationLogml(i1, adjprior, priorTerm); +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = POP_LOGML(i2); + +COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[npops-1 1]); +new_i2_logml = computePopulationLogml(i2, adjprior, priorTerm); +COUNTS(:,:,i2) = COUNTS(:,:,i2)-repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)-repmat(diffInSumCounts,[npops-1 1]); + +muutokset(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + + +%--------------------------------------------------------------------------------- + + +function updateGlobalVariables2( ... + i1, i2, diffInCounts, adjprior, priorTerm); +% Suorittaa globaalien muuttujien muutokset, kun kaikki +% korissa i1 olevat yksilöt siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; + +inds = find(PARTITION==i1); +PARTITION(inds) = i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML(i1) = 0; +POP_LOGML(i2) = computePopulationLogml(i2, adjprior, priorTerm); + + +%-------------------------------------------------------------------------- +%---- + +function muutokset = laskeMuutokset3(T2, inds2, globalRows, ... + data, adjprior, priorTerm, i1) +% Palauttaa length(unique(T2))*npops taulun, jossa (i,j):s alkio +% kertoo, mik?olisi muutos logml:ss? jos populaation i1 osapopulaatio +% inds2(find(T2==i)) siirretään koriin j. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +npops = size(COUNTS,3); +npops2 = length(unique(T2)); +muutokset = zeros(npops2, npops); + +i1_logml = POP_LOGML(i1); +for pop2 = 1:npops2 + inds = inds2(find(T2==pop2)); + ninds = length(inds); + if ninds>0 + rows = []; + for i = 1:ninds + ind = inds(i); + lisa = globalRows(ind,1):globalRows(ind,2); + rows = [rows; lisa']; + %rows = [rows; globalRows{ind}']; + end + diffInCounts = computeDiffInCounts(rows', size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; + new_i1_logml = computePopulationLogml(i1, adjprior, priorTerm); + COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + + i2 = [1:i1-1 , i1+1:npops]; + i2_logml = POP_LOGML(i2)'; + + COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 npops-1]); + SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[npops-1 1]); + new_i2_logml = computePopulationLogml(i2, adjprior, priorTerm)'; + COUNTS(:,:,i2) = COUNTS(:,:,i2)-repmat(diffInCounts, [1 1 npops-1]); + SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)-repmat(diffInSumCounts,[npops-1 1]); + + muutokset(pop2,i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + end +end + +%------------------------------------------------------------------------------------ + +function muutokset = laskeMuutokset5(inds, globalRows, data, adjprior, ... + priorTerm, i1, i2) + +% Palauttaa length(inds)*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli yksil?i vaihtaisi koria i1:n ja i2:n välill? + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; + +ninds = length(inds); +muutokset = zeros(ninds,1); + +i1_logml = POP_LOGML(i1); +i2_logml = POP_LOGML(i2); + +for i = 1:ninds + ind = inds(i); + if PARTITION(ind)==i1 + pop1 = i1; %mist? + pop2 = i2; %mihin + else + pop1 = i2; + pop2 = i1; + end + rows = globalRows(ind,1):globalRows(ind,2); + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)-diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)-diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)+diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)+diffInSumCounts; + + new_logmls = computePopulationLogml([i1 i2], adjprior, priorTerm); + muutokset(i) = sum(new_logmls); + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)+diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)+diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)-diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)-diffInSumCounts; +end + +muutokset = muutokset - i1_logml - i2_logml; + +%------------------------------------------------------------------------------------ + + +function updateGlobalVariables3(muuttuvat, diffInCounts, ... + adjprior, priorTerm, i2); +% Suorittaa globaalien muuttujien päivitykset, kun yksilöt 'muuttuvat' +% siirretään koriin i2. Ennen siirtoa yksilöiden on kuuluttava samaan +% koriin. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; + +i1 = PARTITION(muuttuvat(1)); +PARTITION(muuttuvat) = i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + + +%---------------------------------------------------------------------------- + + +function dist2 = laskeOsaDist(inds2, dist, ninds) +% Muodostaa dist vektorista osavektorin, joka sisältää yksilöiden inds2 +% väliset etäisyydet. ninds=kaikkien yksilöiden lukumäär? + +ninds2 = length(inds2); +apu = zeros(nchoosek(ninds2,2),2); +rivi = 1; +for i=1:ninds2-1 + for j=i+1:ninds2 + apu(rivi, 1) = inds2(i); + apu(rivi, 2) = inds2(j); + rivi = rivi+1; + end +end +apu = (apu(:,1)-1).*ninds - apu(:,1) ./ 2 .* (apu(:,1)-1) + (apu(:,2)-apu(:,1)); +dist2 = dist(apu); + + +%----------------------------------------------------------------------------------- + + +function npops = poistaTyhjatPopulaatiot(npops) +% Poistaa tyhjentyneet populaatiot COUNTS:ista ja +% SUMCOUNTS:ista. Päivittää npops:in ja PARTITION:in. + +global COUNTS; +global SUMCOUNTS; +global PARTITION; + +notEmpty = find(any(SUMCOUNTS,2)); +COUNTS = COUNTS(:,:,notEmpty); +SUMCOUNTS = SUMCOUNTS(notEmpty,:); + +for n=1:length(notEmpty) + apu = find(PARTITION==notEmpty(n)); + PARTITION(apu)=n; +end +npops = length(notEmpty); + + +%----------------------------------------------------------------------------------- + + +function popnames = initPopNames(nameFile) + +fid = fopen(nameFile); +if fid == -1 + %File didn't exist + msgbox('Loading of the population names was unsuccessful', ... + 'Error', 'error'); + return; +end; +line = fgetl(fid); +counter = 1; +while (line ~= -1) & ~isempty(line) + names{counter} = line; + line = fgetl(fid); + counter = counter + 1; +end; +fclose(fid); + +popnames = cell(length(names), 2); +for i = 1:length(names) + popnames{i,1} = names(i); + popnames{i,2} = 0; +end + + +%------------------------------------------------------------------------- + + +function [popnames2, rowsFromInd] = findOutRowsFromInd(popnames, rows) + +ploidisuus = questdlg('Specify the type of individuals in the data: ',... + 'Individual type?', 'Haploid', 'Diploid', 'Tetraploid', ... + 'Diploid'); + +switch ploidisuus +case 'Haploid' + rowsFromInd = 1; +case 'Diploid' + rowsFromInd = 2; +case 'Tetraploid' + rowsFromInd = 4; +end + +if ~isempty(popnames) + for i = 1:size(rows,1) + popnames2{i,1} = popnames{i,1}; + rivi = rows(i,1):rows(i,2); + popnames2{i,2} = (rivi(rowsFromInd))/rowsFromInd; + end +else + popnames2 = []; +end + +%------------------------------------------------------------------ + +function fiksaaPartitioYksiloTasolle(rows, rowsFromInd) + +global PARTITION; +totalRows = 0; +for ind = 1:size(rows,1) + totalRows = totalRows + (rows(ind,2)-rows(ind,1)+1); +end +partitio2 = zeros(totalRows/rowsFromInd,1); + +for ind = 1:size(rows,1) + kaikkiRivit = rows(ind,1):rows(ind,2); + for riviNumero = rowsFromInd:rowsFromInd:length(kaikkiRivit) + %for riviNumero = rowsFromInd:rowsFromInd:length(rows{ind}) + %rivi = rows{ind}(riviNumero); + rivi = kaikkiRivit(riviNumero); + partitio2(rivi/rowsFromInd) = PARTITION(ind); + end +end +PARTITION = partitio2; + +%--------------------------------------------------------------- + + +%-------------------------------------------------------------------- + + +function newline = takeLine(description,width) +%Returns one line from the description: line ends to the first +%space after width:th mark. +newLine = description(1:width); +n = width+1; +while ~isspace(description(n)) & n0 + fid = fopen(outPutFile,'a'); +else + fid = -1; + diary('baps4_output.baps'); % save in text anyway. +end + +dispLine; +disp('RESULTS OF GROUP LEVEL MIXTURE ANALYSIS:'); +disp(['Data file: ' inputFile]); +disp(['Number of clustered groups: ' ownNum2Str(ninds)]); +disp(['Number of clusters in optimal partition: ' ownNum2Str(npops)]); +disp(['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); +disp(' '); +if (fid ~= -1) + fprintf(fid,'%s \n', ['RESULTS OF GROUP LEVEL MIXTURE ANALYSIS:']); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Data file: ' inputFile]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of clustered groups: ' ownNum2Str(ninds)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of clusters in optimal partition: ' ownNum2Str(npops)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); fprintf(fid,'\n'); + fprintf(fid,'\n'); +end + +cluster_count = length(unique(PARTITION)); +disp(['Best Partition: ']); +if (fid ~= -1) + fprintf(fid,'%s \n',['Best Partition: ']); fprintf(fid,'\n'); +end +for m=1:cluster_count + indsInM = find(PARTITION==m); + length_of_beginning = 11 + floor(log10(m)); + cluster_size = length(indsInM); + + if names + text = ['Cluster ' num2str(m) ': {' char(popnames{indsInM(1)})]; + for k = 2:cluster_size + text = [text ', ' char(popnames{indsInM(k)})]; + end; + else + text = ['Cluster ' num2str(m) ': {' num2str(indsInM(1))]; + for k = 2:cluster_size + text = [text ', ' num2str(indsInM(k))]; + end; + end + text = [text '}']; + while length(text)>58 + %Take one line and display it. + new_line = takeLine(text,58); + text = text(length(new_line)+1:end); + disp(new_line); + if (fid ~= -1) + fprintf(fid,'%s \n',[new_line]); + fprintf(fid,'\n'); + end + if length(text)>0 + text = [blanks(length_of_beginning) text]; + else + text = []; + end; + end; + if ~isempty(text) + disp(text); + if (fid ~= -1) + fprintf(fid,'%s \n',[text]); + fprintf(fid,'\n'); + end + end; +end + +if npops > 1 + disp(' '); + disp(' '); + disp('Changes in log(marginal likelihood) if group i is moved to cluster j:'); + if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['Changes in log(marginal likelihood) if group i is moved to cluster j:']); %fprintf(fid, '\n'); + end + + if names + nameSizes = zeros(ninds,1); + for i = 1:ninds + nimi = char(popnames{i}); + nameSizes(i) = length(nimi); + end + maxSize = max(nameSizes); + maxSize = max(maxSize, 5); + erotus = maxSize - 5; + alku = blanks(erotus); + ekarivi = [alku 'group' blanks(6+erotus)]; + else + ekarivi = 'group '; + end + for i = 1:cluster_count + ekarivi = [ekarivi ownNum2Str(i) blanks(8-floor(log10(i)))]; + end + disp(ekarivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [ekarivi]); %fprintf(fid, '\n'); + end + + changesInLogml = LOGDIFF'; + for ind = 1:ninds + %[muutokset, diffInCounts] = laskeMuutokset(ind, rows, data, ... + % adjprior, priorTerm); + %changesInLogml(:,ind) = muutokset; + muutokset = changesInLogml(:,ind); + if names + nimi = char(popnames{ind}); + rivi = [blanks(maxSize - length(nimi)) nimi ':']; + else + rivi = [blanks(4-floor(log10(ind))) ownNum2Str(ind) ':']; + end + for j = 1:npops + rivi = [rivi ' ' logml2String(omaRound(muutokset(j)))]; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [rivi]); fprintf(fid, '\n'); + end + end + + disp(' '); disp(' '); + disp('KL-divergence matrix in PHYLIP format:'); + dist_mat = zeros(npops, npops); + if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['KL-divergence matrix in PHYLIP format:']); %fprintf(fid, '\n'); + end + + maxnoalle = size(COUNTS,1); + nloci = size(COUNTS,2); + d = zeros(maxnoalle, nloci, npops); + prior = adjprior; + prior(find(prior==1))=0; + nollia = find(all(prior==0)); %Lokukset, joissa oli havaittu vain yht?alleelia. + prior(1,nollia)=1; + for pop1 = 1:npops + d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))+prior) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))+prior),maxnoalle,1); + %dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1); + end +% ekarivi = blanks(7); +% for pop = 1:npops +% ekarivi = [ekarivi num2str(pop) blanks(7-floor(log10(pop)))]; +% end + ekarivi = num2str(npops); + disp(ekarivi); + + if (fid ~= -1) + fprintf(fid, '%s \n', [ekarivi]); %fprintf(fid, '\n'); + end + + for pop1 = 1:npops + rivi = [blanks(2-floor(log10(pop1))) num2str(pop1) ' ']; + for pop2 = 1:pop1-1 + dist1 = d(:,:,pop1); dist2 = d(:,:,pop2); + div12 = sum(sum(dist1.*log2((dist1+10^-10) ./ (dist2+10^-10))))/nloci; + div21 = sum(sum(dist2.*log2((dist2+10^-10) ./ (dist1+10^-10))))/nloci; + div = (div12+div21)/2; + % rivi = [rivi kldiv2str(div) ' ']; + dist_mat(pop1,pop2) = div; + end +% disp(rivi); +% if (fid ~= -1) +% fprintf(fid, '%s \n', [rivi]); fprintf(fid, '\n'); +% end + end + + + + dist_mat = dist_mat + dist_mat'; % make it symmetric + for pop1 = 1:npops + rivi = ['Cluster_' num2str(pop1) ' ']; + for pop2 = 1:npops + rivi = [rivi kldiv2str(dist_mat(pop1,pop2)) ' ']; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [rivi]); %fprintf(fid, '\n'); + end + end + +end + +disp(' '); +disp(' '); +disp('List of sizes of 10 best visited partitions and corresponding log(ml) values'); + +if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['List of sizes of 10 best visited partitions and corresponding log(ml) values']); fprintf(fid, '\n'); +end + +partitionSummary = sortrows(partitionSummary,2); +partitionSummary = partitionSummary(size(partitionSummary,1):-1:1 , :); +partitionSummary = partitionSummary(find(partitionSummary(:,2)>-1e49),:); +if size(partitionSummary,1)>10 + vikaPartitio = 10; +else + vikaPartitio = size(partitionSummary,1); +end +for part = 1:vikaPartitio + line = [num2str(partitionSummary(part,1)) ' ' num2str(partitionSummary(part,2))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', [line]); fprintf(fid, '\n'); + end +end + +if ~fixedK + disp(' '); + disp(' '); + disp('Probabilities for number of clusters'); + + if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['Probabilities for number of clusters']); fprintf(fid, '\n'); + end + + npopsTaulu = unique(partitionSummary(:,1)); + len = length(npopsTaulu); + probs = zeros(len,1); + partitionSummary(:,2) = partitionSummary(:,2)-max(partitionSummary(:,2)); + sumtn = sum(exp(partitionSummary(:,2))); + for i=1:len + npopstn = sum(exp(partitionSummary(find(partitionSummary(:,1)==npopsTaulu(i)),2))); + probs(i) = npopstn / sumtn; + end + for i=1:len + if probs(i)>1e-5 + line = [num2str(npopsTaulu(i)) ' ' num2str(probs(i))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', [line]); fprintf(fid, '\n'); + end + end + end +end + +if (fid ~= -1) + fclose(fid); +else + diary off +end + + +%--------------------------------------------------------------- + + +function dispLine; +disp('---------------------------------------------------'); + +%-------------------------------------------------------------- + +function num2 = omaRound(num) +% Pyöristää luvun num 1 desimaalin tarkkuuteen +num = num*10; +num = round(num); +num2 = num/10; + +%--------------------------------------------------------- + +function digit = palautaYks(num,yks) +% palauttaa luvun num 10^yks termin kertoimen +% string:in? +% yks täytyy olla kokonaisluku, joka on +% vähintään -1:n suuruinen. Pienemmill? +% luvuilla tapahtuu jokin pyöristysvirhe. + +if yks>=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); + + +function mjono = kldiv2str(div) +mjono = ' '; +if abs(div)<100 + %Ei tarvita e-muotoa + mjono(6) = num2str(rem(floor(div*1000),10)); + mjono(5) = num2str(rem(floor(div*100),10)); + mjono(4) = num2str(rem(floor(div*10),10)); + mjono(3) = '.'; + mjono(2) = num2str(rem(floor(div),10)); + arvo = rem(floor(div/10),10); + if arvo>0 + mjono(1) = num2str(arvo); + end + +else + suurinYks = floor(log10(div)); + mjono(6) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(div),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(div),suurinYks); +end + + +%----------------------------------------------- + + +function ninds = testaaOnkoKunnollinenBapsData(data) +%Tarkastaa onko viimeisess?sarakkeessa kaikki +%luvut 1,2,...,n johonkin n:ään asti. +%Tarkastaa lisäksi, ett?on vähintään 2 saraketta. +if size(data,1)<2 + ninds = 0; return; +end +lastCol = data(:,end); +ninds = max(lastCol); +if ~isequal((1:ninds)',unique(lastCol)) + ninds = 0; return; +end + +%-------------------------------------------------------------------------- +%Seuraavat kolme funktiota liittyvat alkupartition muodostamiseen. + +function initial_partition=admixture_initialization(data_matrix,nclusters, Z) + +size_data=size(data_matrix); +nloci=size_data(2)-1; +n=max(data_matrix(:,end)); +T=cluster_own(Z,nclusters); +initial_partition=zeros(size_data(1),1); +for i=1:n + kori=T(i); + here=find(data_matrix(:,end)==i); + for j=1:length(here) + initial_partition(here(j),1)=kori; + end +end + +function T = cluster_own(Z,nclust) +true=logical(1); +false=logical(0); +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); + % maximum number of clusters based on inconsistency + if m <= maxclust + T = (1:m)'; + elseif maxclust==1 + T = ones(m,1); + else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end + end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + +%-------------------------------------------------------------------------- +function [logml] = ... + initialCounts(partition, data, npops, rows, noalle, adjprior) + +nloci=size(data,2); +ninds = size(rows, 1); + +%koot = rows(:,1) - rows(:,2) + 1; +%maxSize = max(koot); + +counts = zeros(max(noalle),nloci,npops); +sumcounts = zeros(npops,nloci); +for i=1:npops + for j=1:nloci + + havainnotLokuksessa = find(partition==i & data(:,j)>=0); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(havainnotLokuksessa,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + + +%initializeGammaln(ninds, maxSize, max(noalle)); + +logml = laskeLoggis(counts, sumcounts, adjprior); + + +%-------------------------------------------------------------------------- + + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex) +% Tiedetään, ett?annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ss?ei viel?ole +% annettua logml arvoa, niin lisätään worstIndex:in kohtaan uusi logml ja +% nykyist?partitiota vastaava nclusters:in arvo. Muutoin ei tehd?mitään. + +apu = find(abs(partitionSummary(:,2)-logml)<1e-5); +if isempty(apu) + % Nyt löydetty partitio ei ole viel?kirjattuna summaryyn. + global PARTITION; + npops = length(unique(PARTITION)); + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end + +%-------------------------------------------------------------------------- + +function inds = returnInOrder(inds, pop, globalRows, data, ... + adjprior, priorTerm) +% Palauttaa yksilöt järjestyksess?siten, ett?ensimmäisen?on +% se, jonka poistaminen populaatiosta pop nostaisi logml:n +% arvoa eniten. + +global COUNTS; global SUMCOUNTS; +ninds = length(inds); +apuTaulu = [inds, zeros(ninds,1)]; + +for i=1:ninds + ind =inds(i); + rows = globalRows(i,1):globalRows(i,2); + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,pop) = COUNTS(:,:,pop)-diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)-diffInSumCounts; + apuTaulu(i, 2) = computePopulationLogml(pop, adjprior, priorTerm); + COUNTS(:,:,pop) = COUNTS(:,:,pop)+diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)+diffInSumCounts; +end +apuTaulu = sortrows(apuTaulu,2); +inds = apuTaulu(ninds:-1:1,1); + +%-------------------------------------------------------------------------- + +function [emptyPop, pops] = findEmptyPop(npops) +% Palauttaa ensimmäisen tyhjän populaation indeksin. Jos tyhji? +% populaatioita ei ole, palauttaa -1:n. + +global PARTITION; +pops = unique(PARTITION)'; +if (length(pops) ==npops) + emptyPop = -1; +else + popDiff = diff([0 pops npops+1]); + emptyPop = min(find(popDiff > 1)); +end diff --git a/matlab/independent/indMix.m b/matlab/independent/indMix.m new file mode 100644 index 0000000..2296a45 --- /dev/null +++ b/matlab/independent/indMix.m @@ -0,0 +1,1234 @@ +function [logml, npops, partitionSummary]=indMix(c,npops,dispText) +% Greedy search algorithm with unknown number of classes for regular +% clustering. +% Input npops is not used if called by greedyMix or greedyPopMix. + +logml = 1; + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; +global LOGDIFF; +clearGlobalVars; + +noalle = c.noalle; rows = c.rows; data = c.data; +adjprior = c.adjprior; priorTerm = c.priorTerm; rowsFromInd = c.rowsFromInd; + +if isfield(c,'dist') + dist = c.dist; Z = c.Z; +end + +clear c; + +if nargin < 2 + dispText = 1; + npopstext = []; + ready = false; + teksti = 'Input upper bound to the number of populations (possibly multiple values): '; + while ready == false + npopstextExtra = inputdlg(teksti ,... + 'Input maximum number of populations',1,{'20'}); + drawnow + if isempty(npopstextExtra) % Painettu Cancel:ia + return + end + npopstextExtra = npopstextExtra{1}; + if length(npopstextExtra)>=255 + npopstextExtra = npopstextExtra(1:255); + npopstext = [npopstext ' ' npopstextExtra]; + teksti = 'The input field length limit (255 characters) was reached. Input more values: '; + else + npopstext = [npopstext ' ' npopstextExtra]; + ready = true; + end + end + clear ready; clear teksti; + if isempty(npopstext) | length(npopstext)==1 + return + else + npopsTaulu = str2num(npopstext); + ykkoset = find(npopsTaulu==1); + npopsTaulu(ykkoset) = []; % Mikäli ykkösiä annettu ylärajaksi, ne poistetaan. + if isempty(npopsTaulu) + logml = 1; partitionSummary=1; npops=1; + return + end + clear ykkoset; + end +else + npopsTaulu = npops; +end + +nruns = length(npopsTaulu); + +initData = data; +data = data(:,1:end-1); + +logmlBest = -1e50; +partitionSummary = -1e50*ones(30,2); % Tiedot 30 parhaasta partitiosta (npops ja logml) +partitionSummary(:,1) = zeros(30,1); +worstLogml = -1e50; worstIndex = 1; + +for run = 1:nruns + npops = npopsTaulu(run); + if dispText + dispLine; + disp(['Run ' num2str(run) '/' num2str(nruns) ... + ', maximum number of populations ' num2str(npops) '.']); + end + ninds = size(rows,1); + + initialPartition = admixture_initialization(initData, npops, Z); + [sumcounts, counts, logml] = ... + initialCounts(initialPartition, data, npops, rows, noalle, adjprior); + PARTITION = zeros(ninds, 1); + for i=1:ninds + apu = rows(i); + PARTITION(i) = initialPartition(apu(1)); + end + + COUNTS = counts; SUMCOUNTS = sumcounts; + POP_LOGML = computePopulationLogml(1:npops, adjprior, priorTerm); + LOGDIFF = repmat(-Inf,ninds,npops); + clear initialPartition; clear counts; clear sumcounts; + + % PARHAAN MIXTURE-PARTITION ETSIMINEN + nRoundTypes = 7; + kokeiltu = zeros(nRoundTypes, 1); + roundTypes = [1 1]; %Ykkösvaiheen sykli kahteen kertaan. + ready = 0; vaihe = 1; + + if dispText + disp(' '); + disp(['Mixture analysis started with initial ' num2str(npops) ' populations.']); + end + + while ready ~= 1 + muutoksia = 0; + + if dispText + disp(['Performing steps: ' num2str(roundTypes)]); + end + + for n = 1:length(roundTypes) + + round = roundTypes(n); + kivaluku=0; + + if kokeiltu(round) == 1 %Askelta kokeiltu viime muutoksen jälkeen + + elseif round==0 | round==1 %Yksilön siirtäminen toiseen populaatioon. + inds = 1:ninds; + aputaulu = [inds' rand(ninds,1)]; + aputaulu = sortrows(aputaulu,2); + inds = aputaulu(:,1)'; + muutosNyt = 0; + + for ind = inds + i1 = PARTITION(ind); + [muutokset, diffInCounts] = laskeMuutokset(ind, rows, ... + data, adjprior, priorTerm); + + if round==1, [maxMuutos, i2] = max(muutokset); + end + + if (i1~=i2 & maxMuutos>1e-5) + % Tapahtui muutos + muutoksia = 1; + if muutosNyt == 0 + muutosNyt = 1; + if dispText + disp('Action 1'); + end + end + kokeiltu = zeros(nRoundTypes,1); + kivaluku = kivaluku+1; + updateGlobalVariables(ind, i2, diffInCounts,... + adjprior, priorTerm); + logml = logml+maxMuutos; + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) [worstLogml, worstIndex] = min(partitionSummary(:,2)); end + end + end + end + if muutosNyt == 0 + kokeiltu(round) = 1; + end + + elseif round==2 %Populaation yhdistäminen toiseen. + maxMuutos = 0; + for pop = 1:npops + [muutokset, diffInCounts] = laskeMuutokset2(pop, rows, ... + data, adjprior, priorTerm); + [isoin, indeksi] = max(muutokset); + if isoin>maxMuutos + maxMuutos = isoin; + i1 = pop; + i2 = indeksi; + diffInCountsBest = diffInCounts; + end + end + + if maxMuutos>1e-5 + muutoksia = 1; + kokeiltu = zeros(nRoundTypes,1); + updateGlobalVariables2(i1,i2, diffInCountsBest, ... + adjprior, priorTerm); + logml = logml + maxMuutos; + if dispText + disp('Action 2'); + end + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) [worstLogml, worstIndex] = min(partitionSummary(:,2)); end + end + else + kokeiltu(round) = 1; + end + + + elseif round==3 || round==4 %Populaation jakaminen osiin. + maxMuutos = 0; + ninds = size(rows,1); + for pop = 1:npops + inds2 = find(PARTITION==pop); + ninds2 = length(inds2); + if ninds2>2 + dist2 = laskeOsaDist(inds2, dist, ninds); + Z2 = linkage(dist2'); + if round==3 + npops2 = max(min(20, floor(ninds2/5)),2); + elseif round==4 + npops2 = 2; %Moneenko osaan jaetaan + end + T2 = cluster_own(Z2, npops2); + muutokset = laskeMuutokset3(T2, inds2, rows, data, ... + adjprior, priorTerm, pop); + [isoin, indeksi] = max(muutokset(1:end)); + if isoin>maxMuutos + maxMuutos = isoin; + muuttuvaPop2 = rem(indeksi,npops2); + if muuttuvaPop2==0, muuttuvaPop2 = npops2; end + muuttuvat = inds2(find(T2==muuttuvaPop2)); + i2 = ceil(indeksi/npops2); + end + end + end + if maxMuutos>1e-5 + muutoksia = 1; + kokeiltu = zeros(nRoundTypes,1); + %rows = computeRows(rowsFromInd, muuttuvat, length(muuttuvat)); + rivit = []; + for i = 1:length(muuttuvat) + ind = muuttuvat(i); + lisa = rows(ind,1):rows(ind,2); + rivit = [rivit; lisa']; + %rivit = [rivit; rows(ind)']; + end + diffInCounts = computeDiffInCounts(rivit', size(COUNTS,1), ... + size(COUNTS,2), data); + i1 = PARTITION(muuttuvat(1)); + updateGlobalVariables3(muuttuvat, diffInCounts, ... + adjprior, priorTerm, i2); + logml = logml + maxMuutos; + if dispText + if round==3 + disp('Action 3'); + else + disp('Action 4'); + end + end + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) [worstLogml, worstIndex] = min(partitionSummary(:,2)); end + end + + else + kokeiltu(round)=1; + end + elseif round == 5 || round == 6 + j=0; + muutettu = 0; + poplogml = POP_LOGML; + partition = PARTITION; + counts = COUNTS; + sumcounts = SUMCOUNTS; + logdiff = LOGDIFF; + + pops = randperm(npops); + while (j < npops & muutettu == 0) + j = j+1; + pop = pops(j); + totalMuutos = 0; + inds = find(PARTITION==pop); + if round == 5 + aputaulu = [inds rand(length(inds),1)]; + aputaulu = sortrows(aputaulu,2); + inds = aputaulu(:,1)'; + elseif round == 6 + inds = returnInOrder(inds, pop, ... + rows, data, adjprior, priorTerm); + end + + i = 0; + + while (length(inds) > 0 & i < length(inds)) + i = i+1; + ind =inds(i); + + [muutokset, diffInCounts] = laskeMuutokset(ind, rows, ... + data, adjprior, priorTerm); + muutokset(pop) = -1e50; % Varmasti ei suurin!!! + [maxMuutos, i2] = max(muutokset); + updateGlobalVariables(ind, i2, diffInCounts,... + adjprior, priorTerm); + + totalMuutos = totalMuutos+maxMuutos; + logml = logml+maxMuutos; + if round == 6 + % Lopetetaan heti kun muutos on positiivinen. + if totalMuutos > 1e-5 + i=length(inds); + end + end + end + + if totalMuutos>1e-5 + kokeiltu = zeros(nRoundTypes,1); + muutettu=1; + if muutoksia==0 + muutoksia = 1; % Ulompi kirjanpito. + if dispText + if round==5 + disp('Action 5'); + else + disp('Action 6'); + end + end + end + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) [worstLogml, worstIndex] = min(partitionSummary(:,2)); end + end + else + % Missään vaiheessa tila ei parantunut. + % Perutaan kaikki muutokset. + PARTITION = partition; + SUMCOUNTS = sumcounts; + POP_LOGML = poplogml; + COUNTS = counts; + logml = logml - totalMuutos; + LOGDIFF = logdiff; + kokeiltu(round)=1; + end + end + clear partition; clear sumcounts; clear counts; clear poplogml; + + elseif round == 7 + emptyPop = findEmptyPop(npops); + j = 0; + pops = randperm(npops); + muutoksiaNyt = 0; + if emptyPop == -1 + j = npops; + end + while (j < npops) + j = j +1; + pop = pops(j); + inds2 = find(PARTITION == pop); + ninds2 = length(inds2); + if ninds2 > 5 + partition = PARTITION; + sumcounts = SUMCOUNTS; + counts = COUNTS; + poplogml = POP_LOGML; + logdiff = LOGDIFF; + + dist2 = laskeOsaDist(inds2, dist, ninds); + Z2 = linkage(dist2'); + T2 = cluster_own(Z2, 2); + muuttuvat = inds2(find(T2 == 1)); + + muutokset = laskeMuutokset3(T2, inds2, rows, data, ... + adjprior, priorTerm, pop); + totalMuutos = muutokset(1, emptyPop); + + rivit = []; + for i = 1:length(muuttuvat) + ind = muuttuvat(i); + lisa = rows(ind,1):rows(ind,2); + rivit = [rivit lisa]; + end + diffInCounts = computeDiffInCounts(rivit, size(COUNTS,1), ... + size(COUNTS,2), data); + + updateGlobalVariables3(muuttuvat, diffInCounts, ... + adjprior, priorTerm, emptyPop); + + muutettu = 1; + while (muutettu == 1) + muutettu = 0; + % Siirretään yksilöitä populaatioiden välillä + muutokset = laskeMuutokset5(inds2, rows, data, ... + adjprior, priorTerm, pop, emptyPop); + + [maxMuutos, indeksi] = max(muutokset); + + muuttuva = inds2(indeksi); + if (PARTITION(muuttuva) == pop) + i2 = emptyPop; + else + i2 = pop; + end + + if maxMuutos > 1e-5 + rivit = rows(muuttuva,1):rows(muuttuva,2); + diffInCounts = computeDiffInCounts(rivit, size(COUNTS,1), ... + size(COUNTS,2), data); + updateGlobalVariables3(muuttuva,diffInCounts, ... + adjprior, priorTerm, i2); + muutettu = 1; + totalMuutos = totalMuutos + maxMuutos; + end + + end + + if totalMuutos > 1e-5 + muutoksia = 1; + logml = logml + totalMuutos; + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) [worstLogml, worstIndex] = min(partitionSummary(:,2)); end + end + if muutoksiaNyt == 0 + if dispText + disp('Action 7'); + end + muutoksiaNyt = 1; + end + kokeiltu = zeros(nRoundTypes,1); + j = npops; + else + %palutetaan vanhat arvot + PARTITION = partition; + SUMCOUNTS = sumcounts; + COUNTS = counts; + POP_LOGML = poplogml; + LOGDIFF = logdiff; + end + + end + + end + + if muutoksiaNyt == 0 + kokeiltu(round)=1; + end + + end + end + + if muutoksia == 0 + if vaihe==1 + vaihe = 2; + elseif vaihe==2 + vaihe = 3; + elseif vaihe==3 + vaihe = 4; + elseif vaihe==4; + vaihe = 5; + elseif vaihe==5 + ready = 1; + end + else + muutoksia = 0; + end + + if ready==0 + if vaihe==1 + roundTypes=[1]; + elseif vaihe==2 + roundTypes = [2 1]; + elseif vaihe==3 + roundTypes=[5 5 7]; + elseif vaihe==4 + roundTypes=[4 3 1]; + elseif vaihe==5 + roundTypes=[6 7 2 3 4 1]; + end + end + end + + % TALLENNETAAN + + npops = poistaTyhjatPopulaatiot(npops); + POP_LOGML = computePopulationLogml(1:npops, adjprior, priorTerm); + if dispText + disp(['Found partition with ' num2str(npops) ' populations.']); + disp(['Log(ml) = ' num2str(logml)]); + disp(' '); + end + + if logml>logmlBest + % Päivitetään parasta löydettyä partitiota. + logmlBest = logml; + npopsBest = npops; + partitionBest = PARTITION; + countsBest = COUNTS; + sumCountsBest = SUMCOUNTS; + pop_logmlBest = POP_LOGML; + logdiffbest = LOGDIFF; + end + +end + +logml = logmlBest; +npops = npopsBest; +PARTITION = partitionBest; +COUNTS = countsBest; +SUMCOUNTS = sumCountsBest; +POP_LOGML = pop_logmlBest; +LOGDIFF = logdiffbest; + +%-------------------------------------------------------------------------- + +function clearGlobalVars + +global COUNTS; COUNTS = []; +global SUMCOUNTS; SUMCOUNTS = []; +global PARTITION; PARTITION = []; +global POP_LOGML; POP_LOGML = []; +global LOGDIFF; LOGDIFF = []; + +%-------------------------------------------------------------------------- + + +function Z = linkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 | m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end + + +%----------------------------------------------------------------------- + + +function [sumcounts, counts, logml] = ... + initialPopCounts(data, npops, rows, noalle, adjprior) + +nloci=size(data,2); +counts = zeros(max(noalle),nloci,npops); +sumcounts = zeros(npops,nloci); + +for i=1:npops + for j=1:nloci + i_rivit = rows(i,1):rows(i,2); + havainnotLokuksessa = find(data(i_rivit,j)>=0); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(i_rivit,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + +logml = laskeLoggis(counts, sumcounts, adjprior); + +%----------------------------------------------------------------------- + + +function loggis = laskeLoggis(counts, sumcounts, adjprior) +npops = size(counts,3); + +logml2 = sum(sum(sum(gammaln(counts+repmat(adjprior,[1 1 npops]))))) ... + - npops*sum(sum(gammaln(adjprior))) - ... + sum(sum(gammaln(1+sumcounts))); +loggis = logml2; + + +%------------------------------------------------------------------------------------ + + +function popLogml = computePopulationLogml(pops, adjprior, priorTerm) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + +global COUNTS; +global SUMCOUNTS; +x = size(COUNTS,1); +y = size(COUNTS,2); +z = length(pops); + +popLogml = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior,[1 1 length(pops)]) + COUNTS(:,:,pops)) ... + ,[x y z]),1),2)) - sum(gammaln(1+SUMCOUNTS(pops,:)),2) - priorTerm; +%-------------------------------------------------------------------------- + + +function [muutokset, diffInCounts] = ... + laskeMuutokset(ind, globalRows, data, adjprior, priorTerm) +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mikä olisi +% muutos logml:ssä, mikäli yksilö ind siirretään koriin i. +% diffInCounts on poistettava COUNTS:in siivusta i1 ja lisättävä +% COUNTS:in siivuun i2, mikäli muutos toteutetaan. +% +% Lisäys 25.9.2007: +% Otettu käyttöön globaali muuttuja LOGDIFF, johon on tallennettu muutokset +% logml:ssä siirrettäessä yksilöitä toisiin populaatioihin. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +global LOGDIFF; + +npops = size(COUNTS,3); +muutokset = LOGDIFF(ind,:); + +i1 = PARTITION(ind); +i1_logml = POP_LOGML(i1); +muutokset(i1) = 0; + +rows = globalRows(ind,1):globalRows(ind,2); +diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +new_i1_logml = computePopulationLogml(i1, adjprior, priorTerm); +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + +i2 = find(muutokset==-Inf); % Etsitään populaatiot jotka muuttuneet viime kerran jälkeen. +i2 = setdiff(i2,i1); +i2_logml = POP_LOGML(i2); + +ni2 = length(i2); + +COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 ni2]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[ni2 1]); +new_i2_logml = computePopulationLogml(i2, adjprior, priorTerm); +COUNTS(:,:,i2) = COUNTS(:,:,i2)-repmat(diffInCounts, [1 1 ni2]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)-repmat(diffInSumCounts,[ni2 1]); + +muutokset(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; +LOGDIFF(ind,:) = muutokset; + + +%---------------------------------------------------------------------- + + +function diffInCounts = computeDiffInCounts(rows, max_noalle, nloci, data) +% Muodostaa max_noalle*nloci taulukon, jossa on niiden alleelien +% lukumäärät (vastaavasti kuin COUNTS:issa), jotka ovat data:n +% riveillä rows. rows pitää olla vaakavektori. + +diffInCounts = zeros(max_noalle, nloci); +for i=rows + row = data(i,:); + notEmpty = find(row>=0); + + if length(notEmpty)>0 + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) = ... + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) + 1; + end +end + +%------------------------------------------------------------------------ + + +%------------------------------------------------------------------------------------- + + +function updateGlobalVariables(ind, i2, diffInCounts, ... + adjprior, priorTerm) +% Suorittaa globaalien muuttujien muutokset, kun yksilö ind +% on siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; +global LOGDIFF; + +i1 = PARTITION(ind); +PARTITION(ind)=i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + +LOGDIFF(:,[i1 i2]) = -Inf; +inx = [find(PARTITION==i1); find(PARTITION==i2)]; +LOGDIFF(inx,:) = -Inf; + + +%-------------------------------------------------------------------------- +%-- + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = laskeMuutokset2( ... + i1, globalRows, data, adjprior, priorTerm); +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mikä olisi +% muutos logml:ssä, mikäli korin i1 kaikki yksilöt siirretään +% koriin i. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +npops = size(COUNTS,3); +muutokset = zeros(npops,1); + +i1_logml = POP_LOGML(i1); + +inds = find(PARTITION==i1); +ninds = length(inds); + +if ninds==0 + diffInCounts = zeros(size(COUNTS,1), size(COUNTS,2)); + return; +end + +rows = []; +for i = 1:ninds + ind = inds(i); + lisa = globalRows(ind,1):globalRows(ind,2); + rows = [rows; lisa']; + %rows = [rows; globalRows{ind}']; +end + +diffInCounts = computeDiffInCounts(rows', size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +new_i1_logml = computePopulationLogml(i1, adjprior, priorTerm); +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = POP_LOGML(i2); + +COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[npops-1 1]); +new_i2_logml = computePopulationLogml(i2, adjprior, priorTerm); +COUNTS(:,:,i2) = COUNTS(:,:,i2)-repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)-repmat(diffInSumCounts,[npops-1 1]); + +muutokset(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + + +%--------------------------------------------------------------------------------- + + +function updateGlobalVariables2( ... + i1, i2, diffInCounts, adjprior, priorTerm); +% Suorittaa globaalien muuttujien muutokset, kun kaikki +% korissa i1 olevat yksilöt siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; +global LOGDIFF; + +inds = find(PARTITION==i1); +PARTITION(inds) = i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML(i1) = 0; +POP_LOGML(i2) = computePopulationLogml(i2, adjprior, priorTerm); + +LOGDIFF(:,[i1 i2]) = -Inf; +inx = [find(PARTITION==i1); find(PARTITION==i2)]; +LOGDIFF(inx,:) = -Inf; + + +%-------------------------------------------------------------------------- +%---- + +function muutokset = laskeMuutokset3(T2, inds2, globalRows, ... + data, adjprior, priorTerm, i1) +% Palauttaa length(unique(T2))*npops taulun, jossa (i,j):s alkio +% kertoo, mikä olisi muutos logml:ssä, jos populaation i1 osapopulaatio +% inds2(find(T2==i)) siirretään koriin j. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +npops = size(COUNTS,3); +npops2 = length(unique(T2)); +muutokset = zeros(npops2, npops); + +i1_logml = POP_LOGML(i1); +for pop2 = 1:npops2 + inds = inds2(find(T2==pop2)); + ninds = length(inds); + if ninds>0 + rows = []; + for i = 1:ninds + ind = inds(i); + lisa = globalRows(ind,1):globalRows(ind,2); + rows = [rows; lisa']; + %rows = [rows; globalRows{ind}']; + end + diffInCounts = computeDiffInCounts(rows', size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; + new_i1_logml = computePopulationLogml(i1, adjprior, priorTerm); + COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + + i2 = [1:i1-1 , i1+1:npops]; + i2_logml = POP_LOGML(i2)'; + + COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 npops-1]); + SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[npops-1 1]); + new_i2_logml = computePopulationLogml(i2, adjprior, priorTerm)'; + COUNTS(:,:,i2) = COUNTS(:,:,i2)-repmat(diffInCounts, [1 1 npops-1]); + SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)-repmat(diffInSumCounts,[npops-1 1]); + + muutokset(pop2,i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + end +end + +%------------------------------------------------------------------------------------ + +function muutokset = laskeMuutokset5(inds, globalRows, data, adjprior, ... + priorTerm, i1, i2) + +% Palauttaa length(inds)*1 taulun, jossa i:s alkio kertoo, mikä olisi +% muutos logml:ssä, mikäli yksilö i vaihtaisi koria i1:n ja i2:n välillä. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; + +ninds = length(inds); +muutokset = zeros(ninds,1); + +i1_logml = POP_LOGML(i1); +i2_logml = POP_LOGML(i2); + +for i = 1:ninds + ind = inds(i); + if PARTITION(ind)==i1 + pop1 = i1; %mistä + pop2 = i2; %mihin + else + pop1 = i2; + pop2 = i1; + end + rows = globalRows(ind,1):globalRows(ind,2); + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)-diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)-diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)+diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)+diffInSumCounts; + + new_logmls = computePopulationLogml([i1 i2], adjprior, priorTerm); + muutokset(i) = sum(new_logmls); + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)+diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)+diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)-diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)-diffInSumCounts; +end + +muutokset = muutokset - i1_logml - i2_logml; + +%------------------------------------------------------------------------------------ + + +function updateGlobalVariables3(muuttuvat, diffInCounts, ... + adjprior, priorTerm, i2); +% Suorittaa globaalien muuttujien päivitykset, kun yksilöt 'muuttuvat' +% siirretään koriin i2. Ennen siirtoa yksilöiden on kuuluttava samaan +% koriin. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; +global LOGDIFF; + +i1 = PARTITION(muuttuvat(1)); +PARTITION(muuttuvat) = i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + +LOGDIFF(:,[i1 i2]) = -Inf; +inx = [find(PARTITION==i1); find(PARTITION==i2)]; +LOGDIFF(inx,:) = -Inf; + + +%---------------------------------------------------------------------------- + + +function dist2 = laskeOsaDist(inds2, dist, ninds) +% Muodostaa dist vektorista osavektorin, joka sisältää yksilöiden inds2 +% väliset etäisyydet. ninds=kaikkien yksilöiden lukumäärä. + +ninds2 = length(inds2); +apu = zeros(nchoosek(ninds2,2),2); +rivi = 1; +for i=1:ninds2-1 + for j=i+1:ninds2 + apu(rivi, 1) = inds2(i); + apu(rivi, 2) = inds2(j); + rivi = rivi+1; + end +end +apu = (apu(:,1)-1).*ninds - apu(:,1) ./ 2 .* (apu(:,1)-1) + (apu(:,2)-apu(:,1)); +dist2 = dist(apu); + + +%----------------------------------------------------------------------------------- + + +function npops = poistaTyhjatPopulaatiot(npops) +% Poistaa tyhjentyneet populaatiot COUNTS:ista ja +% SUMCOUNTS:ista. Päivittää npops:in ja PARTITION:in. + +global COUNTS; +global SUMCOUNTS; +global PARTITION; +global LOGDIFF; + +notEmpty = find(any(SUMCOUNTS,2)); +COUNTS = COUNTS(:,:,notEmpty); +SUMCOUNTS = SUMCOUNTS(notEmpty,:); +LOGDIFF = LOGDIFF(:,notEmpty); + +for n=1:length(notEmpty) + apu = find(PARTITION==notEmpty(n)); + PARTITION(apu)=n; +end +npops = length(notEmpty); + + +%--------------------------------------------------------------- + + +function dispLine; +disp('---------------------------------------------------'); + +%-------------------------------------------------------------- + +function num2 = omaRound(num) +% Pyöristää luvun num 1 desimaalin tarkkuuteen +num = num*10; +num = round(num); +num2 = num/10; + +%--------------------------------------------------------- +function mjono = logml2String(logml) +% Palauttaa logml:n string-esityksen. + +mjono = ' '; +if abs(logml)<10000 + %Ei tarvita e-muotoa + mjono(7) = palautaYks(abs(logml),-1); + mjono(6) = '.'; + mjono(5) = palautaYks(abs(logml),0); + mjono(4) = palautaYks(abs(logml),1); + mjono(3) = palautaYks(abs(logml),2); + mjono(2) = palautaYks(abs(logml),3); + pointer = 2; + while mjono(pointer)=='0' & pointer<7 + mjono(pointer) = ' '; + pointer=pointer+1; + end + if logml<0 + mjono(pointer-1) = '-'; + end +else + suurinYks = 4; + while abs(logml)/(10^(suurinYks+1)) >= 1 + suurinYks = suurinYks+1; + end + if suurinYks<10 + mjono(7) = num2str(suurinYks); + mjono(6) = 'e'; + mjono(5) = palautaYks(abs(logml),suurinYks-1); + mjono(4) = '.'; + mjono(3) = palautaYks(abs(logml),suurinYks); + if logml<0 + mjono(2) = '-'; + end + elseif suurinYks>=10 + mjono(6:7) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(logml),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(logml),suurinYks); + if logml<0 + mjono(1) = '-'; + end + end +end + +function digit = palautaYks(num,yks) +% palauttaa luvun num 10^yks termin kertoimen +% string:inä +% yks täytyy olla kokonaisluku, joka on +% vähintään -1:n suuruinen. Pienemmillä +% luvuilla tapahtuu jokin pyöristysvirhe. + +if yks>=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); + + +function mjono = kldiv2str(div) +mjono = ' '; +if abs(div)<100 + %Ei tarvita e-muotoa + mjono(6) = num2str(rem(floor(div*1000),10)); + mjono(5) = num2str(rem(floor(div*100),10)); + mjono(4) = num2str(rem(floor(div*10),10)); + mjono(3) = '.'; + mjono(2) = num2str(rem(floor(div),10)); + arvo = rem(floor(div/10),10); + if arvo>0 + mjono(1) = num2str(arvo); + end + +else + suurinYks = floor(log10(div)); + mjono(6) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(div),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(div),suurinYks); +end + + +%-------------------------------------------------------------------------- +%Seuraavat kolme funktiota liittyvat alkupartition muodostamiseen. + +function initial_partition=admixture_initialization(data_matrix,nclusters, Z) + +size_data=size(data_matrix); +nloci=size_data(2)-1; +n=max(data_matrix(:,end)); +T=cluster_own(Z,nclusters); +initial_partition=zeros(size_data(1),1); +for i=1:n + kori=T(i); + here=find(data_matrix(:,end)==i); + for j=1:length(here) + initial_partition(here(j),1)=kori; + end +end + +function T = cluster_own(Z,nclust) +true=logical(1); +false=logical(0); +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); + % maximum number of clusters based on inconsistency + if m <= maxclust + T = (1:m)'; + elseif maxclust==1 + T = ones(m,1); + else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end + end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + +%-------------------------------------------------------------------------- + +function [sumcounts, counts, logml] = ... + initialCounts(partition, data, npops, rows, noalle, adjprior) + +nloci=size(data,2); +ninds = size(rows, 1); + +koot = rows(:,1) - rows(:,2) + 1; +maxSize = max(koot); + +counts = zeros(max(noalle),nloci,npops); +sumcounts = zeros(npops,nloci); +for i=1:npops + for j=1:nloci + havainnotLokuksessa = find(partition==i & data(:,j)>=0); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(havainnotLokuksessa,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + +%initializeGammaln(ninds, maxSize, max(noalle)); + +logml = laskeLoggis(counts, sumcounts, adjprior); + +%-------------------------------------------------------------------------- + + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex) +% Tiedetään, että annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ssä ei vielä ole +% annettua logml arvoa, niin lisätään worstIndex:in kohtaan uusi logml ja +% nykyistä partitiota vastaava nclusters:in arvo. Muutoin ei tehdä mitään. + +apu = find(abs(partitionSummary(:,2)-logml)<1e-5); +if isempty(apu) + % Nyt löydetty partitio ei ole vielä kirjattuna summaryyn. + global PARTITION; + npops = length(unique(PARTITION)); + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end + +%-------------------------------------------------------------------------- + +function inds = returnInOrder(inds, pop, globalRows, data, ... + adjprior, priorTerm) +% Palauttaa yksilöt järjestyksessä siten, että ensimmäisenä on +% se, jonka poistaminen populaatiosta pop nostaisi logml:n +% arvoa eniten. + +global COUNTS; global SUMCOUNTS; +ninds = length(inds); +apuTaulu = [inds, zeros(ninds,1)]; + +for i=1:ninds + ind =inds(i); + rows = globalRows(i,1):globalRows(i,2); + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,pop) = COUNTS(:,:,pop)-diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)-diffInSumCounts; + apuTaulu(i, 2) = computePopulationLogml(pop, adjprior, priorTerm); + COUNTS(:,:,pop) = COUNTS(:,:,pop)+diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)+diffInSumCounts; +end +apuTaulu = sortrows(apuTaulu,2); +inds = apuTaulu(ninds:-1:1,1); + +%-------------------------------------------------------------------------- + +function [emptyPop, pops] = findEmptyPop(npops) +% Palauttaa ensimmäisen tyhjän populaation indeksin. Jos tyhjiä +% populaatioita ei ole, palauttaa -1:n. + +global PARTITION; +pops = unique(PARTITION)'; +if (length(pops) ==npops) + emptyPop = -1; +else + popDiff = diff([0 pops npops+1]); + emptyPop = min(find(popDiff > 1)); +end diff --git a/matlab/independent/indMix_fixK.m b/matlab/independent/indMix_fixK.m new file mode 100644 index 0000000..568c741 --- /dev/null +++ b/matlab/independent/indMix_fixK.m @@ -0,0 +1,1337 @@ +function [logml, npops, partitionSummary]=indMix_fixK(c,npops,nruns,dispText) +% Greedy search algorithm with fixed number of classes for regular +% clustering. +% Input npops is only used if result is needed for spatialMixture. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; +global LOGDIFF; +clearGlobalVars; + +noalle = c.noalle; rows = c.rows; data = c.data; +adjprior = c.adjprior; priorTerm = c.priorTerm; rowsFromInd = c.rowsFromInd; + +if isfield(c,'dist') + dist = c.dist; Z = c.Z; +end + +clear c; + +if nargin < 2 + dispText = 1; + npopstext = []; + teksti = {'Number of populations:', ... + 'Number of runs:'}; + def = {'20', '1'}; + + npopstextExtra = inputdlg(teksti ,... + 'Input parameters for the computation algorithm',1,def); + + if isempty(npopstextExtra) % cancel has been pressed + dispCancel + logml = 1; partitionSummary=1; npops=1; + return + end + npopstext = npopstextExtra{1}; + nrunstext = npopstextExtra{2}; + + clear teksti npopstextExtra; + if isempty(npopstext) + return + else + npopsTable = str2num(npopstext); + npops = npopsTable(1); + + if npops==1 + logml = 1; partitionSummary=1; npops=1; + return + end + nrunsTable = str2num(nrunstext); + nruns = nrunsTable(1); + end +else + dispText = 0; + npopsTaulu = npops; + nruns = 1; +end + +initData = data; +data = data(:,1:end-1); + +logmlBest = -1e50; +partitionSummary = -1e50*ones(30,2); % Tiedot 30 parhaasta partitiosta (npops ja logml) +partitionSummary(:,1) = zeros(30,1); +worstLogml = -1e50; worstIndex = 1; + +logml = 1; + +%summary = cell(nruns,5); +for run = 1:nruns + if dispText + dispLine; + disp(['Run ' num2str(run) '/' num2str(nruns) ... + ', maximum number of populations ' num2str(npops) '.']); + end + ninds = size(rows,1); + + initialPartition = admixture_initialization(initData, npops, Z); + [sumcounts, counts, logml] = ... + initialCounts(initialPartition, data, npops, rows, noalle, adjprior); + PARTITION = zeros(ninds, 1); + for i=1:ninds + apu = rows(i); + PARTITION(i) = initialPartition(apu(1)); + end + COUNTS = counts; SUMCOUNTS = sumcounts; + POP_LOGML = computePopulationLogml(1:npops, adjprior, priorTerm); + LOGDIFF = repmat(-Inf,ninds,npops); + + clear initialPartition; clear counts; clear sumcounts; + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) [worstLogml, worstIndex] = min(partitionSummary(:,2)); end + end + + % PARHAAN MIXTURE-PARTITION ETSIMINEN + nRoundTypes = 7; + kokeiltu = zeros(nRoundTypes, 1); + roundTypes = [1 1]; %Ykkösvaiheen sykli kahteen kertaan. + ready = 0; vaihe = 1; + + if dispText + disp(' '); + disp(['Mixture analysis started with initial ' num2str(npops) ' populations.']); + end + + while ready ~= 1 + muutoksia = 0; + if dispText + disp(['Performing steps: ' num2str(roundTypes)]); + end + for n = 1:length(roundTypes) + + round = roundTypes(n); + + if kokeiltu(round) == 1 %Askelta kokeiltu viime muutoksen jälkeen + + elseif round==0 | round==1 %Yksilön siirtäminen toiseen populaatioon. + inds = randperm(ninds); + muutosNyt = 0; + + for ind = inds + i1 = PARTITION(ind); + if length(find(PARTITION==i1))>1 + [muutokset, diffInCounts] = laskeMuutokset(ind, rows, ... + data, adjprior, priorTerm); + [maxMuutos, i2] = max(muutokset); + + if (i1~=i2 & maxMuutos>1e-5) + % Tapahtui muutos + muutoksia = 1; + if muutosNyt == 0 + if dispText + disp('action 1'); + end + muutosNyt = 1; + kokeiltu = zeros(nRoundTypes,1); + end + + updateGlobalVariables(ind, i2, diffInCounts,... + adjprior, priorTerm); + + logml = logml+maxMuutos; + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) [worstLogml, worstIndex] = min(partitionSummary(:,2)); end + end + end + end + end + if muutosNyt == 0 + kokeiltu(round) = 1; + end + + elseif round==2 %Populaatioiden yhdistäminen ja jakaminen. + maxMuutos = -1e50; + poplogml = POP_LOGML; + partition = PARTITION; + counts = COUNTS; + sumcounts = SUMCOUNTS; + logdiff = LOGDIFF; + + % Yhdistetään ensin kaksi populaatiota + for pop = 1:npops + [muutokset, diffInCounts] = laskeMuutokset2(pop, rows, ... + data, adjprior, priorTerm); + muutokset(pop)=-1e50; + [isoin, indeksi] = max(muutokset); + if isoin>maxMuutos + maxMuutos = isoin; + i1 = pop; + i2 = indeksi; + diffInCountsBest = diffInCounts; + end + end + + totalMuutos = maxMuutos; + updateGlobalVariables2(i1,i2, diffInCountsBest, ... + adjprior, priorTerm); + + % Siirretään osa jostain populaatiosta tyhjentyneen + % populaation tilalle. + + emptyPop = i1; + maxMuutos = -1e50; + ninds = size(rows,1); + muuttuvat = []; % added by Lu Cheng, 11.11.2012 + for pop = 1:npops + inds2 = find(PARTITION==pop); + ninds2 = length(inds2); + if ninds2>1 + dist2 = laskeOsaDist(inds2, dist, ninds); + Z2 = linkage(dist2'); + npops2 = 2; %Moneenko osaan jaetaan + T2 = cluster_own(Z2, npops2); + muutokset = laskeMuutokset3(T2, inds2, rows, data, ... + adjprior, priorTerm, pop); + isoin = muutokset(1,emptyPop); + if isoin>maxMuutos + maxMuutos = isoin; + muuttuvat = inds2(find(T2==1)); + end + end + end + + % added by Lu Cheng, 11.11.2012 + if isempty(muuttuvat) + PARTITION = partition; + SUMCOUNTS = sumcounts; + POP_LOGML = poplogml; + COUNTS = counts; + LOGDIFF = logdiff; + kokeiltu(round) = 1; + continue; + end + + + rivit = []; + for i = 1:length(muuttuvat) + ind = muuttuvat(i); + lisa = rows(ind,1):rows(ind,2); + rivit = [rivit; lisa']; + %rivit = [rivit; rows(ind)']; + end + diffInCounts = computeDiffInCounts(rivit', size(COUNTS,1), ... + size(COUNTS,2), data); + pop = PARTITION(muuttuvat(1)); + inds2 = find(PARTITION == pop); + + updateGlobalVariables3(muuttuvat, diffInCounts, ... + adjprior, priorTerm, emptyPop); + totalMuutos = totalMuutos + maxMuutos; + + % Siirretään yksilöit?populaatioiden välill? + inds = randperm(ninds); + for ind = inds + i1 = PARTITION(ind); + if length(find(PARTITION==i1))>1 + [muutokset, diffInCounts] = laskeMuutokset(ind, rows, ... + data, adjprior, priorTerm); + + [maxMuutos, i2] = max(muutokset); + if (i1~=i2 & maxMuutos>1e-5) + updateGlobalVariables(ind, i2, diffInCounts,... + adjprior, priorTerm); + + totalMuutos = totalMuutos+maxMuutos; + end + end + end + + if totalMuutos>1e-5 + if dispText + disp('Action 2'); + end + kokeiltu = zeros(nRoundTypes,1); + logml = logml + totalMuutos; + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) [worstLogml, worstIndex] = min(partitionSummary(:,2)); end + end + + else + PARTITION = partition; + SUMCOUNTS = sumcounts; + POP_LOGML = poplogml; + COUNTS = counts; + LOGDIFF = logdiff; + kokeiltu(round) = 1; + end + + + elseif round==3 || round==4 %Populaation jakaminen osiin. + maxMuutos = 0; + ninds = size(rows,1); + for pop = 1:npops + inds2 = find(PARTITION==pop); + ninds2 = length(inds2); + if ninds2>2 + dist2 = laskeOsaDist(inds2, dist, ninds); + Z2 = linkage(dist2'); + if round==3 + npops2 = max(min(20, floor(ninds2/2)),2); + elseif round==4 + npops2 = 2; %Moneenko osaan jaetaan + end + T2 = cluster_own(Z2, npops2); + muutokset = laskeMuutokset3(T2, inds2, rows, data, ... + adjprior, priorTerm, pop); + [isoin, indeksi] = max(muutokset(1:end)); + if isoin>maxMuutos + maxMuutos = isoin; + muuttuvaPop2 = rem(indeksi,npops2); + if muuttuvaPop2==0, muuttuvaPop2 = npops2; end + muuttuvat = inds2(find(T2==muuttuvaPop2)); + i2 = ceil(indeksi/npops2); + end + end + end + if maxMuutos>1e-5 + muutoksia = 1; + kokeiltu = zeros(nRoundTypes,1); + %rows = computeRows(rowsFromInd, muuttuvat, length(muuttuvat)); + rivit = []; + for i = 1:length(muuttuvat) + ind = muuttuvat(i); + lisa = rows(ind,1):rows(ind,2); + rivit = [rivit; lisa']; + %rivit = [rivit; rows(ind)']; + end + diffInCounts = computeDiffInCounts(rivit', size(COUNTS,1), ... + size(COUNTS,2), data); + i1 = PARTITION(muuttuvat(1)); + updateGlobalVariables3(muuttuvat, diffInCounts, ... + adjprior, priorTerm, i2); + logml = logml + maxMuutos; + if dispText + if round==3 + disp('action 3'); + else + disp('action 4'); + end + end + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) [worstLogml, worstIndex] = min(partitionSummary(:,2)); end + end + else + kokeiltu(round)=1; + end + + elseif round == 5 || round == 6 + j=0; + muutettu = 0; + poplogml = POP_LOGML; + partition = PARTITION; + counts = COUNTS; + sumcounts = SUMCOUNTS; + logdiff = LOGDIFF; + + pops = randperm(npops); + while (j < npops & muutettu == 0) + j = j+1; + pop = pops(j); + totalMuutos = 0; + inds = find(PARTITION==pop); + if round == 5 + aputaulu = [inds rand(length(inds),1)]; + aputaulu = sortrows(aputaulu,2); + inds = aputaulu(:,1)'; + elseif round == 6 + inds = returnInOrder(inds, pop, ... + rows, data, adjprior, priorTerm); + end + + i = 0; + + while (length(inds) > 0 & i < length(inds) - 1) + i = i+1; + ind =inds(i); + + [muutokset, diffInCounts] = laskeMuutokset(ind, rows, ... + data, adjprior, priorTerm); + muutokset(pop) = -1e50; % Varmasti ei suurin!!! + [maxMuutos, i2] = max(muutokset); + + updateGlobalVariables(ind, i2, diffInCounts,... + adjprior, priorTerm); + + totalMuutos = totalMuutos+maxMuutos; + logml = logml+maxMuutos; + if round == 6 + % Lopetetaan heti kun muutos on positiivinen. + if totalMuutos > 1e-5 + i=length(inds); + end + end + end + + if totalMuutos>1e-5 + kokeiltu = zeros(nRoundTypes,1); + if dispText + disp('Action 6'); + end + muutettu=1; + muutoksia = 1; % Ulompi kirjanpito. + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) [worstLogml, worstIndex] = min(partitionSummary(:,2)); end + end + + else + % Missään vaiheessa tila ei parantunut. + % Perutaan kaikki muutokset. + PARTITION = partition; + SUMCOUNTS = sumcounts; + POP_LOGML = poplogml; + COUNTS = counts; + logml = logml - totalMuutos; + LOGDIFF = logdiff; + kokeiltu(round)=1; + end + end + clear partition; clear sumcounts; clear counts; clear poplogml; + + elseif round == 7 % Jaetaan populaatio kahtia ja yhdistetään + emptyPop = npops + 1; + j = 0; + pops = randperm(npops); + muutoksiaNyt = 0; + while (j < npops) + j = j +1; + pop = pops(j); + inds2 = find(PARTITION == pop); + ninds2 = length(inds2); + if ninds2 > 5 + partition = PARTITION; + sumcounts = SUMCOUNTS; + counts = COUNTS; + poplogml = POP_LOGML; + logdiff = LOGDIFF; + + % Luodaan väliaikaisesti uusi populaatio + npops = npops + 1; + POP_LOGML(npops) = 0; + COUNTS(:,:,npops) = zeros(size(COUNTS(:,:,1))); + SUMCOUNTS(npops,:) = zeros(size(SUMCOUNTS(1,:))); + + dist2 = laskeOsaDist(inds2, dist, ninds); + Z2 = linkage(dist2'); + T2 = cluster_own(Z2, 2); + muuttuvat = inds2(find(T2 == 1)); + + muutokset = laskeMuutokset3(T2, inds2, rows, data, ... + adjprior, priorTerm, pop); + totalMuutos = muutokset(1, emptyPop); + + rivit = []; + for i = 1:length(muuttuvat) + ind = muuttuvat(i); + lisa = rows(ind,1):rows(ind,2); + rivit = [rivit lisa]; + end + diffInCounts = computeDiffInCounts(rivit, size(COUNTS,1), ... + size(COUNTS,2), data); + + updateGlobalVariables3(muuttuvat, diffInCounts, ... + adjprior, priorTerm, emptyPop); + + % Siirretään yksilöit?populaatioiden välill? + inds = randperm(ninds); + for ind = inds + i1 = PARTITION(ind); + if length(find(PARTITION==i1))>1 + [muutokset, diffInCounts] = laskeMuutokset(ind, rows, ... + data, adjprior, priorTerm); + + [maxMuutos, i2] = max(muutokset); + if (i1~=i2 & maxMuutos>1e-5) + updateGlobalVariables(ind, i2, diffInCounts,... + adjprior, priorTerm); + totalMuutos = totalMuutos+maxMuutos; + end + end + end + + % Yhdistetään kaksi populaatiota, jos populaatioita + % liikaa + if length(find(any(SUMCOUNTS,2))) == npops + maxMuutos = -1e50; + for pop = 1:npops + [muutokset, diffInCounts] = laskeMuutokset2(pop, rows, ... + data, adjprior, priorTerm); + muutokset(pop)=-1e50; + [isoin, indeksi] = max(muutokset); + if isoin>maxMuutos + maxMuutos = isoin; + i1 = pop; + i2 = indeksi; + diffInCountsBest = diffInCounts; + end + end + updateGlobalVariables2(i1,i2, diffInCountsBest, ... + adjprior, priorTerm); + totalMuutos = totalMuutos + maxMuutos; + end + + if totalMuutos > 1e-5 + if dispText + disp('Action 7'); + end + muutoksia = 1; + % Poistetaan tyhj?populaatio + npops = poistaTyhjatPopulaatiot(npops); + POP_LOGML(abs(POP_LOGML)<1e-5) = []; + logml = logml + totalMuutos; + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) [worstLogml, worstIndex] = min(partitionSummary(:,2)); end + end + if muutoksiaNyt == 0 + muutoksiaNyt = 1; + end + kokeiltu = zeros(nRoundTypes,1); + j = npops; + else + %palutetaan vanhat arvot + PARTITION = partition; + SUMCOUNTS = sumcounts; + COUNTS = counts; + POP_LOGML = poplogml; + LOGDIFF = logdiff; + npops = npops - 1; + end + + end + + end + + if muutoksiaNyt == 0 + kokeiltu(round)=1; + end + + end + end + + if muutoksia == 0 + if vaihe==1 + vaihe = 2; + elseif vaihe==2 + vaihe = 3; + elseif vaihe==3 + vaihe = 4; + elseif vaihe==4; + vaihe = 5; + elseif vaihe==5 + ready = 1; + end + else + muutoksia = 0; + end + + if ready==0 + if vaihe==1 + roundTypes=[1]; + elseif vaihe==2 + roundTypes=[2]; + elseif vaihe==3 + roundTypes=[6 7]; + elseif vaihe==4 + roundTypes=[4 3 1]; + elseif vaihe==5 + roundTypes=[6 7 2 3 4 1]; + end + end + end + + % TALLENNETAAN + + npops = poistaTyhjatPopulaatiot(npops); + POP_LOGML = computePopulationLogml(1:npops, adjprior, priorTerm); + if dispText + disp(['Found partition with ' num2str(npops) ' populations.']); + disp(['Log(ml) = ' num2str(logml)]); + disp(' '); + end + + %summary{run,1} =npops; + %summary{run,2} =POP_LOGML; + %summary{run,3} =logml; + %summary{run,4} =PARTITION; + %summary{run,5} = COUNTS; + + if logml>logmlBest + % Päivitetään parasta löydetty?partitiota. + logmlBest = logml; + npopsBest = npops; + partitionBest = PARTITION; + countsBest = COUNTS; + sumCountsBest = SUMCOUNTS; + pop_logmlBest = POP_LOGML; + logdiffbest = LOGDIFF; + end + +end + +logml = logmlBest; +npops = npopsBest; +PARTITION = partitionBest; +COUNTS = countsBest; +SUMCOUNTS = sumCountsBest; +POP_LOGML = pop_logmlBest; +LOGDIFF = logdiffbest; + +%-------------------------------------------------------------------------- + +function clearGlobalVars + +global COUNTS; COUNTS = []; +global SUMCOUNTS; SUMCOUNTS = []; +global PARTITION; PARTITION = []; +global POP_LOGML; POP_LOGML = []; +global LOGDIFF; LOGDIFF = []; + +%-------------------------------------------------------------------------- + + +function Z = linkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 | m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end + + +%----------------------------------------------------------------------- + + +function [sumcounts, counts, logml] = ... + initialPopCounts(data, npops, rows, noalle, adjprior) + +nloci=size(data,2); +counts = zeros(max(noalle),nloci,npops); +sumcounts = zeros(npops,nloci); + +for i=1:npops + for j=1:nloci + i_rivit = rows(i,1):rows(i,2); + havainnotLokuksessa = find(data(i_rivit,j)>=0); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(i_rivit,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + +logml = laskeLoggis(counts, sumcounts, adjprior); + +%----------------------------------------------------------------------- + + +function loggis = laskeLoggis(counts, sumcounts, adjprior) +npops = size(counts,3); + +logml2 = sum(sum(sum(gammaln(counts+repmat(adjprior,[1 1 npops]))))) ... + - npops*sum(sum(gammaln(adjprior))) - ... + sum(sum(gammaln(1+sumcounts))); +loggis = logml2; + + +%------------------------------------------------------------------------------------ + + +function popLogml = computePopulationLogml(pops, adjprior, priorTerm) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + +global COUNTS; +global SUMCOUNTS; +x = size(COUNTS,1); +y = size(COUNTS,2); +z = length(pops); + +popLogml = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior,[1 1 length(pops)]) + COUNTS(:,:,pops)) ... + ,[x y z]),1),2)) - sum(gammaln(1+SUMCOUNTS(pops,:)),2) - priorTerm; + +%-------------------------------------------------------------------------- + + +function [muutokset, diffInCounts] = ... + laskeMuutokset(ind, globalRows, data, adjprior, priorTerm) +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli yksil?ind siirretään koriin i. +% diffInCounts on poistettava COUNTS:in siivusta i1 ja lisättäv? +% COUNTS:in siivuun i2, mikäli muutos toteutetaan. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +global LOGDIFF; + +npops = size(COUNTS,3); +muutokset = LOGDIFF(ind,:); + +i1 = PARTITION(ind); +i1_logml = POP_LOGML(i1); +muutokset(i1) = 0; + +rows = globalRows(ind,1):globalRows(ind,2); +diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +new_i1_logml = computePopulationLogml(i1, adjprior, priorTerm); +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + +i2 = find(muutokset==-Inf); % Etsitään populaatiot jotka muuttuneet viime kerran jälkeen. +i2 = setdiff(i2,i1); +i2_logml = POP_LOGML(i2); + +ni2 = length(i2); + +COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 ni2]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[ni2 1]); +new_i2_logml = computePopulationLogml(i2, adjprior, priorTerm); +COUNTS(:,:,i2) = COUNTS(:,:,i2)-repmat(diffInCounts, [1 1 ni2]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)-repmat(diffInSumCounts,[ni2 1]); + +muutokset(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; +if length(muutokset)>npops + keyboard +end + +LOGDIFF(ind,1:npops) = muutokset(1:npops); + + + +%---------------------------------------------------------------------- + + +function diffInCounts = computeDiffInCounts(rows, max_noalle, nloci, data) +% Muodostaa max_noalle*nloci taulukon, jossa on niiden alleelien +% lukumäärät (vastaavasti kuin COUNTS:issa), jotka ovat data:n +% riveill?rows. rows pitää olla vaakavektori. + +diffInCounts = zeros(max_noalle, nloci); +for i=rows + row = data(i,:); + notEmpty = find(row>=0); + + if length(notEmpty)>0 + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) = ... + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) + 1; + end +end + +%------------------------------------------------------------------------ + + +%------------------------------------------------------------------------------------- + + +function updateGlobalVariables(ind, i2, diffInCounts, ... + adjprior, priorTerm) +% Suorittaa globaalien muuttujien muutokset, kun yksil?ind +% on siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; +global LOGDIFF; + +i1 = PARTITION(ind); +PARTITION(ind)=i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + +LOGDIFF(:,[i1 i2]) = -Inf; +inx = [find(PARTITION==i1); find(PARTITION==i2)]; +LOGDIFF(inx,:) = -Inf; + + +%-------------------------------------------------------------------------- +%-- + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = laskeMuutokset2( ... + i1, globalRows, data, adjprior, priorTerm); +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli korin i1 kaikki yksilöt siirretään +% koriin i. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +npops = size(COUNTS,3); +% muutokset = zeros(npops,1); +muutokset = ones(npops,1)*-Inf; + +i1_logml = POP_LOGML(i1); + +inds = find(PARTITION==i1); +ninds = length(inds); + +if ninds==0 + diffInCounts = zeros(size(COUNTS,1), size(COUNTS,2)); + return; +end + +rows = []; +for i = 1:ninds + ind = inds(i); + lisa = globalRows(ind,1):globalRows(ind,2); + rows = [rows; lisa']; + %rows = [rows; globalRows{ind}']; +end + +diffInCounts = computeDiffInCounts(rows', size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +new_i1_logml = computePopulationLogml(i1, adjprior, priorTerm); +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = POP_LOGML(i2); + +COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[npops-1 1]); +new_i2_logml = computePopulationLogml(i2, adjprior, priorTerm); +COUNTS(:,:,i2) = COUNTS(:,:,i2)-repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)-repmat(diffInSumCounts,[npops-1 1]); + +muutokset(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + + +%--------------------------------------------------------------------------------- + + +function updateGlobalVariables2( ... + i1, i2, diffInCounts, adjprior, priorTerm); +% Suorittaa globaalien muuttujien muutokset, kun kaikki +% korissa i1 olevat yksilöt siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; +global LOGDIFF; + +inds = find(PARTITION==i1); +PARTITION(inds) = i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML(i1) = 0; +POP_LOGML(i2) = computePopulationLogml(i2, adjprior, priorTerm); + +LOGDIFF(:,[i1 i2]) = -Inf; +inx = [find(PARTITION==i1); find(PARTITION==i2)]; +LOGDIFF(inx,:) = -Inf; + + +%-------------------------------------------------------------------------- +%---- + +function muutokset = laskeMuutokset3(T2, inds2, globalRows, ... + data, adjprior, priorTerm, i1) +% Palauttaa length(unique(T2))*npops taulun, jossa (i,j):s alkio +% kertoo, mik?olisi muutos logml:ss? jos populaation i1 osapopulaatio +% inds2(find(T2==i)) siirretään koriin j. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +npops = size(COUNTS,3); +npops2 = length(unique(T2)); +muutokset = zeros(npops2, npops); + +i1_logml = POP_LOGML(i1); +for pop2 = 1:npops2 + inds = inds2(find(T2==pop2)); + ninds = length(inds); + if ninds>0 + rows = []; + for i = 1:ninds + ind = inds(i); + lisa = globalRows(ind,1):globalRows(ind,2); + rows = [rows; lisa']; + %rows = [rows; globalRows{ind}']; + end + diffInCounts = computeDiffInCounts(rows', size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; + new_i1_logml = computePopulationLogml(i1, adjprior, priorTerm); + COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + + i2 = [1:i1-1 , i1+1:npops]; + i2_logml = POP_LOGML(i2)'; + + COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 npops-1]); + SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[npops-1 1]); + new_i2_logml = computePopulationLogml(i2, adjprior, priorTerm)'; + COUNTS(:,:,i2) = COUNTS(:,:,i2)-repmat(diffInCounts, [1 1 npops-1]); + SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)-repmat(diffInSumCounts,[npops-1 1]); + + muutokset(pop2,i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + end +end + +%------------------------------------------------------------------------------------ + +function muutokset = laskeMuutokset5(inds, globalRows, data, adjprior, ... + priorTerm, i1, i2) + +% Palauttaa length(inds)*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli yksil?i vaihtaisi koria i1:n ja i2:n välill? + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; + +ninds = length(inds); +muutokset = zeros(ninds,1); + +i1_logml = POP_LOGML(i1); +i2_logml = POP_LOGML(i2); + +for i = 1:ninds + ind = inds(i); + if PARTITION(ind)==i1 + pop1 = i1; %mist? + pop2 = i2; %mihin + else + pop1 = i2; + pop2 = i1; + end + rows = globalRows(ind,1):globalRows(ind,2); + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)-diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)-diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)+diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)+diffInSumCounts; + + new_logmls = computePopulationLogml([i1 i2], adjprior, priorTerm); + muutokset(i) = sum(new_logmls); + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)+diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)+diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)-diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)-diffInSumCounts; +end + +muutokset = muutokset - i1_logml - i2_logml; + +%------------------------------------------------------------------------------------ + + +function updateGlobalVariables3(muuttuvat, diffInCounts, ... + adjprior, priorTerm, i2); +% Suorittaa globaalien muuttujien päivitykset, kun yksilöt 'muuttuvat' +% siirretään koriin i2. Ennen siirtoa yksilöiden on kuuluttava samaan +% koriin. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; +global LOGDIFF; + +i1 = PARTITION(muuttuvat(1)); +PARTITION(muuttuvat) = i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + +LOGDIFF(:,[i1 i2]) = -Inf; +inx = [find(PARTITION==i1); find(PARTITION==i2)]; +LOGDIFF(inx,:) = -Inf; + +%---------------------------------------------------------------------------- + + +function dist2 = laskeOsaDist(inds2, dist, ninds) +% Muodostaa dist vektorista osavektorin, joka sisältää yksilöiden inds2 +% väliset etäisyydet. ninds=kaikkien yksilöiden lukumäär? + +ninds2 = length(inds2); +apu = zeros(nchoosek(ninds2,2),2); +rivi = 1; +for i=1:ninds2-1 + for j=i+1:ninds2 + apu(rivi, 1) = inds2(i); + apu(rivi, 2) = inds2(j); + rivi = rivi+1; + end +end +apu = (apu(:,1)-1).*ninds - apu(:,1) ./ 2 .* (apu(:,1)-1) + (apu(:,2)-apu(:,1)); +dist2 = dist(apu); + + +%----------------------------------------------------------------------------------- + + +function npops = poistaTyhjatPopulaatiot(npops) +% Poistaa tyhjentyneet populaatiot COUNTS:ista ja +% SUMCOUNTS:ista. Päivittää npops:in ja PARTITION:in. + +global COUNTS; +global SUMCOUNTS; +global PARTITION; +global LOGDIFF; + +notEmpty = find(any(SUMCOUNTS,2)); +COUNTS = COUNTS(:,:,notEmpty); +SUMCOUNTS = SUMCOUNTS(notEmpty,:); +LOGDIFF = LOGDIFF(:,notEmpty); + +for n=1:length(notEmpty) + apu = find(PARTITION==notEmpty(n)); + PARTITION(apu)=n; +end +npops = length(notEmpty); + + +%--------------------------------------------------------------- + + +function dispLine; +disp('---------------------------------------------------'); + +%-------------------------------------------------------------- + +function num2 = omaRound(num) +% Pyöristää luvun num 1 desimaalin tarkkuuteen +num = num*10; +num = round(num); +num2 = num/10; + +%--------------------------------------------------------- +function mjono = logml2String(logml) +% Palauttaa logml:n string-esityksen. + +mjono = ' '; +if abs(logml)<10000 + %Ei tarvita e-muotoa + mjono(7) = palautaYks(abs(logml),-1); + mjono(6) = '.'; + mjono(5) = palautaYks(abs(logml),0); + mjono(4) = palautaYks(abs(logml),1); + mjono(3) = palautaYks(abs(logml),2); + mjono(2) = palautaYks(abs(logml),3); + pointer = 2; + while mjono(pointer)=='0' & pointer<7 + mjono(pointer) = ' '; + pointer=pointer+1; + end + if logml<0 + mjono(pointer-1) = '-'; + end +else + suurinYks = 4; + while abs(logml)/(10^(suurinYks+1)) >= 1 + suurinYks = suurinYks+1; + end + if suurinYks<10 + mjono(7) = num2str(suurinYks); + mjono(6) = 'e'; + mjono(5) = palautaYks(abs(logml),suurinYks-1); + mjono(4) = '.'; + mjono(3) = palautaYks(abs(logml),suurinYks); + if logml<0 + mjono(2) = '-'; + end + elseif suurinYks>=10 + mjono(6:7) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(logml),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(logml),suurinYks); + if logml<0 + mjono(1) = '-'; + end + end +end + +function digit = palautaYks(num,yks) +% palauttaa luvun num 10^yks termin kertoimen +% string:in? +% yks täytyy olla kokonaisluku, joka on +% vähintään -1:n suuruinen. Pienemmill? +% luvuilla tapahtuu jokin pyöristysvirhe. + +if yks>=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); + + +function mjono = kldiv2str(div) +mjono = ' '; +if abs(div)<100 + %Ei tarvita e-muotoa + mjono(6) = num2str(rem(floor(div*1000),10)); + mjono(5) = num2str(rem(floor(div*100),10)); + mjono(4) = num2str(rem(floor(div*10),10)); + mjono(3) = '.'; + mjono(2) = num2str(rem(floor(div),10)); + arvo = rem(floor(div/10),10); + if arvo>0 + mjono(1) = num2str(arvo); + end + +else + suurinYks = floor(log10(div)); + mjono(6) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(div),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(div),suurinYks); +end + + +%-------------------------------------------------------------------------- +%Seuraavat kolme funktiota liittyvat alkupartition muodostamiseen. + +function initial_partition=admixture_initialization(data_matrix,nclusters, Z) + +size_data=size(data_matrix); +nloci=size_data(2)-1; +n=max(data_matrix(:,end)); +T=cluster_own(Z,nclusters); +initial_partition=zeros(size_data(1),1); +for i=1:n + kori=T(i); + here=find(data_matrix(:,end)==i); + for j=1:length(here) + initial_partition(here(j),1)=kori; + end +end + +function T = cluster_own(Z,nclust) +true=logical(1); +false=logical(0); +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); + % maximum number of clusters based on inconsistency + if m <= maxclust + T = (1:m)'; + elseif maxclust==1 + T = ones(m,1); + else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end + end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + +%-------------------------------------------------------------------------- + +function [sumcounts, counts, logml] = ... + initialCounts(partition, data, npops, rows, noalle, adjprior) + +nloci=size(data,2); +ninds = size(rows, 1); + +%koot = rows(:,1) - rows(:,2) + 1; +%maxSize = max(koot); + +counts = zeros(max(noalle),nloci,npops); +sumcounts = zeros(npops,nloci); +for i=1:npops + for j=1:nloci + havainnotLokuksessa = find(partition==i & data(:,j)>=0); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(havainnotLokuksessa,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + +%initializeGammaln(ninds, maxSize, max(noalle)); + +logml = laskeLoggis(counts, sumcounts, adjprior); + +%-------------------------------------------------------------------------- + + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex) +% Tiedetään, ett?annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ss?ei viel?ole +% annettua logml arvoa, niin lisätään worstIndex:in kohtaan uusi logml ja +% nykyist?partitiota vastaava nclusters:in arvo. Muutoin ei tehd?mitään. + +apu = find(abs(partitionSummary(:,2)-logml)<1e-5); +if isempty(apu) + % Nyt löydetty partitio ei ole viel?kirjattuna summaryyn. + global PARTITION; + npops = length(unique(PARTITION)); + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end + +%-------------------------------------------------------------------------- + +function inds = returnInOrder(inds, pop, globalRows, data, ... + adjprior, priorTerm) +% Palauttaa yksilöt järjestyksess?siten, ett?ensimmäisen?on +% se, jonka poistaminen populaatiosta pop nostaisi logml:n +% arvoa eniten. + +global COUNTS; global SUMCOUNTS; +ninds = length(inds); +apuTaulu = [inds, zeros(ninds,1)]; + +for i=1:ninds + ind =inds(i); + rows = globalRows(i,1):globalRows(i,2); + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,pop) = COUNTS(:,:,pop)-diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)-diffInSumCounts; + apuTaulu(i, 2) = computePopulationLogml(pop, adjprior, priorTerm); + COUNTS(:,:,pop) = COUNTS(:,:,pop)+diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)+diffInSumCounts; +end +apuTaulu = sortrows(apuTaulu,2); +inds = apuTaulu(ninds:-1:1,1); + +%-------------------------------------------------------------------------- + +function [emptyPop, pops] = findEmptyPop(npops) +% Palauttaa ensimmäisen tyhjän populaation indeksin. Jos tyhji? +% populaatioita ei ole, palauttaa -1:n. + +global PARTITION; +pops = unique(PARTITION)'; +if (length(pops) ==npops) + emptyPop = -1; +else + popDiff = diff([0 pops npops+1]); + emptyPop = min(find(popDiff > 1)); +end diff --git a/matlab/independent/myxlsread.m b/matlab/independent/myxlsread.m new file mode 100644 index 0000000..06040ea --- /dev/null +++ b/matlab/independent/myxlsread.m @@ -0,0 +1,49 @@ +function [A B] = myxlsread(file) +% This function read a tab ('\t') seperated txt file +% input file structure: +% first row: title +% sencond to end row: first column, sample ID +% second column, cluster label +% other columns, gene sequences +% Lu Cheng +% 26.06.2010 + +% there can be multiple numeric columns in the input file +% Lu Cheng, 25.11.2010 + +delimiter = '\t'; + +if exist(file,'file')~=2 + error('The input file %s does not exist!', file); +end + +lines = textread(file,'%s','delimiter','\n'); + +title = strread(lines{1},'%s','delimiter',delimiter); +nRow = length(lines); +nCol = length(title); + +% determine numeric Columns +tmp = strread(lines{2},'%s','delimiter',delimiter); +numCols = []; +for i = 1:length(tmp) + if ~isnan(str2double(tmp{i})) + numCols(end+1) = i; %#ok + end +end + +A = cell(nRow-1, length(numCols)); +B = cell(nRow, nCol); + +B(1,:) = title; +for i=2:nRow + if isempty(lines{i}) + B(i,:) = []; + A(i-1,:) = []; + else + B(i,:) = strread(lines{i},'%s','delimiter',delimiter); + A(i-1,:) = B(i,numCols); + end +end + +A = cellfun(@str2double,A); \ No newline at end of file diff --git a/matlab/independent/myxlswrite.m b/matlab/independent/myxlswrite.m new file mode 100644 index 0000000..34aa408 --- /dev/null +++ b/matlab/independent/myxlswrite.m @@ -0,0 +1,19 @@ +function myxlswrite(file, A) +% A is a cell matrix, each element is a string +% Lu Cheng, 25.11.2010 + +h = fopen(file,'w+'); +[nRow nCol] = size(A); + +for i=1:nRow + %tmpLine = ''; + for j=1:nCol-1 + if isnumeric(A{i,j}) + A{i,j} = num2str(A{i,j}); + end + fprintf(h,'%s\t',A{i,j}); + end + fprintf(h,'%s\n',A{i,nCol}); +end + +fclose(h); \ No newline at end of file diff --git a/matlab/independent/preprocessXLS.m b/matlab/independent/preprocessXLS.m new file mode 100644 index 0000000..8b094ef --- /dev/null +++ b/matlab/independent/preprocessXLS.m @@ -0,0 +1,177 @@ +function c = preprocessXLS(xlsfile,varargin) +% This function preprocesses the input xlsfile +% File structure: first line - title +% else - first column, name of individuals +% - second to end column, sequences of the given genes + +% Lu Cheng, 16.02.2010 + +%% check file names + +file_suff = '.xls'; % endings of the input file +if ~(exist(xlsfile,'file')==2) + fprintf('Input file %s does not exists, quit!\n',xlsfile); + return; +end + +if ~strcmp(xlsfile(end-length(file_suff)+1:end),file_suff) + fprintf('Input file %s does not end with %s, quit!\n',xlsfile,file_suff); + return; +end + +%% process the xls file + +% Here we assume there is no missing values, if so, the missing values are +% indicated by 0 +[data, component_mat, popnames] = processxls(xlsfile); + +% missing data 0 is transformed to |alphabet|+1, in the [ACGT] case, '-' is 5 +[data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(data); + +c.data = data; c.rowsFromInd = rowsFromInd; +c.alleleCodes = alleleCodes; c.noalle=noalle; +c.adjprior = adjprior; +% c.priorTerm = c.priorTerm; + +c.component_mat = component_mat; +c.popnames = popnames; + +%% count the cliques and separators + +index = data(:,end); + +if isempty(varargin) + [data_clique, data_separator, noalle_clique, noalle_separator, codes_cq, codes_sp, info_cq_loci, info_sp_loci] = ... + transform5(data, component_mat); +else + c_train = varargin{1}; + + if ~all(all(c_train.component_mat == component_mat)) + disp('The gene lengths are different between the training data and the test data!'); + return; + end + + [data_clique, data_separator, noalle_clique, noalle_separator, codes_cq, codes_sp, info_cq_loci, info_sp_loci] = ... + transform5(data, component_mat, c_train.info_cq_loci,c_train.info_sp_loci); +end +data_clique = [data_clique index]; +data_separator = [data_separator index]; + +% Count the data, note that the order of the alphabets keeps the same +[counts_cq, nalleles_cq, prior_cq, adjprior_cq, genotypes_cq] ... + = allfreqsnew2(data_clique, double(noalle_clique)); +[counts_sp, nalleles_sp, prior_sp, adjprior_sp, genotypes_sp] ... + = allfreqsnew2(data_separator, double(noalle_separator)); + +clear prior_cq prior_sp nalleles_cq nalleles_sp genotypes_cq genotypes_sp; + +counts_cq = uint16(counts_cq); +counts_sp = uint16(counts_sp); + +c.counts_cq = counts_cq; +c.counts_sp = counts_sp; + +c.adjprior_cq = adjprior_cq; +c.adjprior_sp = adjprior_sp; + +c.codes_cq = codes_cq; +c.codes_sp = codes_sp; + +c.info_cq_loci = info_cq_loci; +c.info_sp_loci = info_sp_loci; + +%-------------------------------------------------------------------------- +function [newData, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(raw_data) +% Alkuperäisen datan viimeinen sarake kertoo, miltä yksilöltä +% kyseinen rivi on peräisin. Funktio tutkii ensin, että montako +% riviä maksimissaan on peräisin yhdeltä yksilöltä, jolloin saadaan +% tietää onko kyseessä haploidi, diploidi jne... Tämän jälkeen funktio +% lisää tyhjiä rivejä niille yksilöille, joilta on peräisin vähemmän +% rivejä kuin maksimimäärä. +% Mikäli jonkin alleelin koodi on =0, funktio muuttaa tämän alleelin +% koodi pienimmäksi koodiksi, joka isompi kuin mikään käytössä oleva koodi. +% Tämän jälkeen funktio muuttaa alleelikoodit siten, että yhden lokuksen j +% koodit saavat arvoja välillä 1,...,noalle(j). + +% English Comments added +% Small modifications have been added +% Lu Cheng, 17.02.2010 + +% Last column are the indexes of the samples, the raw_data is supposed to +% be unit16 type, 0 indicates missing value +data = raw_data; +nloci=size(raw_data,2)-1; + +% Replace missing value with the |alphabet|+1, thus 0 is replaced by 5 for +% DNA dataset +dataApu = data(:,1:nloci); +nollat = find(dataApu==0); +if ~isempty(nollat) + isoinAlleeli = max(max(dataApu)); + dataApu(nollat) = isoinAlleeli+1; + data(:,1:nloci) = dataApu; +end + +% stores all different alleles at each loci, construct the allle codes matrix +noalle=zeros(1,nloci); +alleelitLokuksessa = cell(nloci,1); +for i=1:nloci + alleelitLokuksessaI = unique(data(:,i)); + alleelitLokuksessa{i,1} = alleelitLokuksessaI(logical(alleelitLokuksessaI>=0)); + noalle(i) = length(alleelitLokuksessa{i,1}); +end +alleleCodes = zeros(max(noalle),nloci); +for i=1:nloci + alleelitLokuksessaI = alleelitLokuksessa{i,1}; + puuttuvia = max(noalle)-length(alleelitLokuksessaI); + alleleCodes(:,i) = [alleelitLokuksessaI; zeros(puuttuvia,1)]; +end + +%-----------------modified by Lu Cheng 17.02.2010--------------------------% +% NOTE: Here we do not want to change the alpahbets, thus the following +% lines are commented + +% replace the index of an allele to replace the allele +% for loc = 1:nloci +% for all = 1:noalle(loc) +% data(logical(data(:,loc)==alleleCodes(all,loc)), loc)=all; +% end; +% end; +%-----------------modified end.....----------------------------------------% + +% handle diploid situation +nind = max(data(:,end)); +nrows = size(data,1); +ncols = size(data,2); +rowsFromInd = zeros(nind,1); +for i=1:nind + rowsFromInd(i) = length(find(data(:,end)==i)); +end +maxRowsFromInd = max(rowsFromInd); +a = -999; +emptyRow = repmat(a, 1, ncols); +lessThanMax = find(rowsFromInd < maxRowsFromInd); +missingRows = maxRowsFromInd*nind - nrows; +data = [data; zeros(missingRows, ncols)]; +pointer = 1; +for ind=lessThanMax' %Käy läpi ne yksilöt, joilta puuttuu rivejä + miss = maxRowsFromInd-rowsFromInd(ind); % Tältä yksilöltä puuttuvien lkm. + for j=1:miss + rowToBeAdded = emptyRow; + rowToBeAdded(end) = ind; + data(nrows+pointer, :) = rowToBeAdded; + pointer = pointer+1; + end +end +data = sortrows(data, ncols); % Sorttaa yksilöiden mukaisesti +newData = data; +rowsFromInd = maxRowsFromInd; + +% calculate the prior for each loci, priorTerm is a constant term in the +% formula, which is precalclulateed for speeding up the program +adjprior = zeros(max(noalle),nloci); +priorTerm = 0; +for j=1:nloci + adjprior(:,j) = [repmat(1/noalle(j), [noalle(j),1]) ; ones(max(noalle)-noalle(j),1)]; + priorTerm = priorTerm + noalle(j)*gammaln(1/noalle(j)); +end \ No newline at end of file diff --git a/matlab/independent/processxls.m b/matlab/independent/processxls.m new file mode 100644 index 0000000..be7a375 --- /dev/null +++ b/matlab/independent/processxls.m @@ -0,0 +1,81 @@ +function [data, component_mat, popnames] = processxls(filename) +% +% a bug in line 64-68 was fixed +data = []; +component_mat = []; +popnames = []; +try + if ispc + [A,B] = xlsread(filename); + else + [A,B] = myxlsread(filename); + end +catch + display('*** ERROR: Wrong Excel format'); + return +end + +if size(A,2)~=1 % more than one columns containing numeric ST values + display('*** ERROR: multiple columns of numeric values'); + data = []; component_mat = []; popnames = []; + return +end + +if size(A,1)~=size(B,1)-1 + display('*** ERROR: Wrong format'); + data = []; component_mat = []; popnames = []; + return +end + +B = deblank(B); % remove any trailing blanks +nstrains = size(B,1)-1; +nheader = size(B,2); +for i = 1:nheader + if strcmpi('ST',B{1,i}) ix_ST = i; end + if strcmpi('Strain', B{1,i}) || strcmpi('Isolate',B{1,i}) + ix_Strain = i; + end +end +if ~exist('ix_ST') + display('*** ERROR: ST column needed'); + data = []; component_mat = []; popnames = []; + return +end + +if ~exist('ix_Strain') + ix_gene = setdiff([1:nheader],ix_ST); +else + ix_gene = setdiff([1:nheader],[ix_ST ix_Strain]); +end + +ngenes = length(ix_gene); + +C = cell(nstrains,ngenes); +if ~isempty(A) + for i=1:nstrains + B{i+1,ix_ST}=num2str(A(i)); + for j=1:ngenes + C{i,j}=uint16(i_encode_n(B{i+1,ix_gene(j)})); % save the memory. + end + end +end +genesize=cellfun('size',C(1,:),2); +data=cell2mat(C); +data=[data uint16([1:nstrains]')]; +component_mat = zeros(ngenes,max(genesize)); +cum = cumsum(genesize); +component_mat(1,[1:genesize(1)]) = [1:cum(1)]; +for i=2:ngenes + component_mat(i,[1:genesize(i)]) = [(cum(i-1)+1):cum(i)]; +end + +if ~exist('ix_Strain') + popnames = num2cell(B([2:end],ix_ST)); +else % store the strain names only + popnames = num2cell(B([2:end],ix_Strain)); +end +popnames(:,2)=num2cell([1:nstrains]'); + +display('---------------------------------------------------'); +display(['# of strains: ', num2str(nstrains)]); +display(['# of genes: ', num2str(ngenes)]); \ No newline at end of file diff --git a/matlab/independent/semiReadScript.m b/matlab/independent/semiReadScript.m new file mode 100644 index 0000000..29ae812 --- /dev/null +++ b/matlab/independent/semiReadScript.m @@ -0,0 +1,128 @@ +function paras = semiReadScript(script_file) +% This function extracts parameter information from the script file +% Script Command Table +% datafile('train|test','c:\BAPS5\DATA.xls'); Here only .xls and .mat file +% input file is supported. +% savePreproFile('train|test','c:\BAPS5\predata.mat'); +% setK('16 17 18'); +% outputmat('c:\BAPS5\output.mat') +% Lu Cheng, 11.03.2010 + +paras.train_file_format = []; +paras.train_file_name = []; + +paras.save_prepro_train_data = []; paras.save_prepro_train_data = 'No'; +paras.train_prepro_file = []; + +paras.test_file_format = []; +paras.test_file_name = []; + +paras.save_prepro_test_data = []; paras.save_prepro_test_data = 'No'; +paras.test_prepro_file = []; + +paras.cluster_num_upperbounds = []; + +paras.save_results = []; paras.save_results = 'No'; +paras.result_file = []; + +T = readfile(script_file); + +n = length(T); +for i=1:n + %line = regexprep(T{i},'\s+',''); + line = T{i}; + [res toks] = regexp(line,'(.+)\((.+)\)','once','match','tokens'); + + if isempty(res) + continue; + else + %toks + paras = parseCmd(toks{1}, toks{2}, paras); + end +end + +% ------------------------------------------------------------------------- +function prog_paras = parseCmd(cmd, paras, prog_paras) +% cmd is the script command +% paras are the parameters of the script command +% prog_paras is a stucture of the global parameters + +switch cmd + case 'datafile' + paras = regexprep(paras,'\s+',''); + toks = regexp(paras,'''([^,]+)''','tokens'); + option = toks{1}{:}; + filename = toks{2}{:}; + if exist(filename,'file')~=2 + error(cat(2,'File not exist! File: ',filename)); + end + filetype = getFileType(filename); + if isequal(option,'train') + prog_paras.train_file_format = filetype; + prog_paras.train_file_name = filename; + elseif isequal(option,'test') + prog_paras.test_file_format = filetype; + prog_paras.test_file_name = filename; + else + error(cat(2,'Unkown option: ',option,'! Expect train or test.')); + end + + case 'savePreprocFile' + paras = regexprep(paras,'\s+',''); + toks = regexp(paras,'''([^,]+)''','tokens'); + option = toks{1}{:}; + filename = toks{2}{:}; + + filetype = getFileType(filename); + if ~isequal(filetype,'.mat') + error(cat(2,'The saved file should end with .mat! ',filename)); + end + + if isequal(option,'train') + prog_paras.save_prepro_train_data = 'Yes'; + prog_paras.train_prepro_file = filename; + elseif isequal(option,'test') + prog_paras.save_prepro_test_data = 'Yes'; + prog_paras.test_prepro_file = filename; + else + error(cat(2,'Unkown option: ',option,'! Expect train or test.')); + end + case 'setK' + prog_paras.cluster_num_upperbounds = paras(2:end-1); + case 'outputmat' + filename = paras(2:end-1); + filetype = getFileType(filename); + if ~isequal(filetype,'.mat') + error(cat(2,'The saved file should end with .mat! ',filename)); + end + prog_paras.save_results = 'Yes'; + prog_paras.result_file = filename; + otherwise + error('Can not parse the cmd: %s in the script!', cmd); +end + +% ------------------------------------------------------------------------- +function filetype = getFileType(filename) +filetype = filename(end-3:end); +if ~isequal(filetype,'.xls') && ~isequal(filetype,'.mat') + error(cat(2,'Unknown option: ', filename, '! Expect .xls or .mat file')); +end + +% ------------------------------------------------------------------------- +function T = readfile(filename) +f = fopen(filename,'r'); +if f == -1 + error(cat(2,'*** ERROR: invalid input file: ',filename)); + T = []; + return +end + +i = 1; +while 1 + clear line; + line = fgetl(f); + if ~ischar(line), break, end + T{i} = line; + i = i+1; +end +fclose(f); \ No newline at end of file diff --git a/matlab/independent/semi_linkageMix.m b/matlab/independent/semi_linkageMix.m new file mode 100644 index 0000000..639301b --- /dev/null +++ b/matlab/independent/semi_linkageMix.m @@ -0,0 +1,1470 @@ +function c_result = semi_linkageMix(c_train, c_test, npopsTable) +% Greedy search algorithm with unknown number of classes for linkage +% clustering. + +% Modified from linkageMix.m by Lu Cheng, 16.02.2010 +% Input: preprocessed training data and test data, as well as the +% population number, npopsTable is a vector +% NOTE, here we assume that c_train contains all the correct priors + +% IS_SINGLE_SAMPLE is introduced to handle the case of only one test sample +% Lu Cheng, 07.03.2011 + +global POP_LOGML; global PARTITION; +global CQ_COUNTS; global SP_COUNTS; %These counts are for populations +global CQ_SUMCOUNTS; global SP_SUMCOUNTS; %not for individuals +global LOGDIFF; + +global TRAIN_CQ_COUNTS; +global TRAIN_SP_COUNTS; +global TRAIN_CQ_SUMCOUNTS; +global TRAIN_SP_SUMCOUNTS; + +clearGlobalVars; + +% noalle = c_train.noalle; +% adjprior = c_train.adjprior; %priorTerm = c.priorTerm; +% rowsFromInd = c_train.rowsFromInd; +adjprior_cq = c_train.adjprior_cq; +adjprior_sp = c_train.adjprior_sp; + +counts_cq = c_test.counts_cq; +counts_sp = c_test.counts_sp; + +% UPDATE the priors by the following constants from the training data +% Lu Cheng, 17.02.2010 ------------------------ % +clusters = unique(c_train.cluster_labels); +n_train_clusters = length(clusters); +if ~all(clusters'==1:n_train_clusters) %labels should cover 1 to n_train_clusters + error(cat(2,'Error! Cluster labels should range from 1 to ',n_train_clusters,'!')); +end +[d1_cq d2_cq d3_cq] = size(c_train.counts_cq); +[d1_sp d2_sp d3_sp] = size(c_train.counts_sp); +TRAIN_CQ_COUNTS = zeros(d1_cq,d2_cq,n_train_clusters); +TRAIN_SP_COUNTS = zeros(d1_sp,d2_sp,n_train_clusters); +for i = 1:n_train_clusters + cluster_inds = (c_train.cluster_labels == clusters(i)); + TRAIN_CQ_COUNTS(:,:,i) = sum(c_train.counts_cq(:,:,cluster_inds),3); + TRAIN_SP_COUNTS(:,:,i) = sum(c_train.counts_sp(:,:,cluster_inds),3); +end + +train_counts_cq = TRAIN_CQ_COUNTS; +train_counts_sp = TRAIN_SP_COUNTS; + +clear clusters cluster_inds i +clear d1_cq d2_cq d3_cq d1_sp d2_sp d3_sp cluster_inds +% --------------------------------------------- % + +IS_SINGLE_SAMPLE = false; +if isfield(c_test,'dist') + dist = c_test.dist; Z = c_test.Z; +else + IS_SINGLE_SAMPLE = true; +end +clear c_test c_train; + +nruns = length(npopsTable); +logmlBest = -1e50; +partitionSummary = -1e50*ones(100,2); % 100 best partitions (npops and logml) +partitionSummary(:,1) = zeros(100,1); +worstLogml = -1e50; worstIndex = 1; + +for run = 1:nruns + if IS_SINGLE_SAMPLE && (npopsTable(run)>n_train_clusters+1) + npops = n_train_clusters + 1; + else + npops = npopsTable(run); + end + + dispLine; + disp(['Run ' num2str(run) '/' num2str(nruns) ... + ', maximum number of populations ' num2str(npops) '.']); + + %----- added by Lu Cheng, 23.02.2010------% + % modify the training data so it fits the dimension of given population + % number + %n_train_clusters = size(train_counts_cq,3); + if npopsworstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex, n_train_clusters); + if (added==1) + [worstLogml, worstIndex] = min(partitionSummary(:,2)); + end + end + + %%%% + % Finding the best partition with the greedy search algorithm + %%%% + nRoundTypes = 7; + tested = zeros(nRoundTypes,1); + roundTypes = [1 1]; + ready = 0; phase = 1; + ninds = length(PARTITION); % number of individuals + LOGDIFF = repmat(-Inf,ninds,npops); + + disp(' '); + disp(['Mixture analysis started with initial ' num2str(npops) ' populations.']); + + while ready ~= 1 + changesMade = 0; + + disp(['Performing steps: ' num2str(roundTypes)]); + + for n = 1:length(roundTypes) + round = roundTypes(n); + % pack; + if tested(round) == 1 + + elseif round==1 % Moving one individual to another population + inds = randperm(ninds); % random order + changesMadeNow = 0; + for ind = inds + i1 = PARTITION(ind); + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + changesInLogml = computeChanges(ind, adjprior_cq, ... + adjprior_sp, indCqCounts, indSpCounts); + + [maxChange, i2] = max(changesInLogml); + + if (i1~=i2 && maxChange>1e-5) + + %disp(changesInLogml); + + % Individual is moved + changesMade = 1; + if changesMadeNow == 0 + disp('action 1'); + changesMadeNow = 1; + tested = zeros(nRoundTypes,1); + end + updateGlobalVariables(ind, i2, indCqCounts, ... + indSpCounts, adjprior_cq, adjprior_sp); + logml = logml+maxChange; + + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex, n_train_clusters); + if (added==1) + [worstLogml, worstIndex] = min(partitionSummary(:,2)); + end + end + end + end + + if changesMadeNow == 0 + tested(round) = 1; + end + + + %disp(PARTITION'); % for test + + elseif round==2 % Combining two populations + maxChange = 0; + + %n_train_clusters % see line 32, Lu Cheng, 04.03.2010 + +% for pop = 1:npops +% changesInLogml = computeChanges2(pop, adjprior_cq, adjprior_sp); % all inds in 'pop' is moved to other clusters +% [biggest, index] = max(changesInLogml); +% if biggest>maxChange +% maxChange = biggest; +% i1 = pop; +% i2 = index; +% fprintf('moving population %d to population %d.\n',i1,i2); +% end +% disp(changesInLogml'); +% end + + % modified by Lu Cheng, 04.03.2010 + % here we only combine the outer clusters with other + % outerclusters or training clusters. In case there is only + % one outer cluster in the PARTITION, it will be combined with + % some training cluster, depending on the logml + if npops>n_train_clusters + for pop = n_train_clusters+1:npops + if any(PARTITION==pop) + changesInLogml = computeChanges2(pop, adjprior_cq, adjprior_sp); % all inds in 'pop' is moved to other clusters + [biggest, index] = max(changesInLogml); + if biggest>maxChange + maxChange = biggest; + i1 = pop; + i2 = index; + %fprintf('moving population %d to population %d.\n',i1,i2); + end + end + end + + %fprintf('maxChange: %d\n',maxChange); + end + + + if maxChange>1e-5 + disp('action 2'); + changesMade = 1; + tested = zeros(nRoundTypes,1); + + updateGlobalVariables2(i1, i2, adjprior_cq, adjprior_sp); %all inds in i1 are moved to i2 + logml = logml + maxChange; + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex, n_train_clusters); + if (added==1) + [worstLogml, worstIndex] = min(partitionSummary(:,2)); + end + end + + %disp(PARTITION'); % for test + + else + tested(round) = 1; + end + + + elseif (round==3 || round==4) && ~IS_SINGLE_SAMPLE % Splitting population into smaller groups + maxChange = 0; + + for pop = 1:npops + inds2 = find(PARTITION==pop); + ninds2 = length(inds2); + if ninds2>5 + % Computing the distance between individuals inds2 + dist2 = laskeOsaDist(inds2, dist, ninds); + Z2 = computeLinkage(dist2'); + + % Number of groups: + if round==3 + npops2 = min(20, floor(ninds2 / 5)); + elseif round==4 + npops2 = 2; + end + T2 = cluster_own(Z2, npops2); + + changesInLogml = computeChanges3(T2, inds2, pop, ... + counts_cq, counts_sp, adjprior_cq, adjprior_sp); % npops2 * npops matrix + [biggest, index] = max(changesInLogml(1:end)); + if biggest > maxChange + maxChange = biggest; + movingGroup = rem(index,npops2); % The group, which is moved + if movingGroup==0, movingGroup = npops2; end + movingInds = inds2(logical(T2==movingGroup)); + i2 = ceil(index/npops2); % pop where movingGroup would be moved + end + end + end + if maxChange>1e-5 + changesMade = 1; + tested = zeros(nRoundTypes,1); + if round==3 + disp('action 3'); + else + disp('action 4'); + end + indCqCounts = uint16(sum(counts_cq(:,:,movingInds),3)); + indSpCounts = uint16(sum(counts_sp(:,:,movingInds),3)); + updateGlobalVariables3(movingInds, i2,indCqCounts, ... + indSpCounts, adjprior_cq, adjprior_sp); + logml = logml + maxChange; + + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex, n_train_clusters); + if (added==1) + [worstLogml, worstIndex] = min(partitionSummary(:,2)); + end + end + else + tested(round) = 1; + end + + elseif (round == 5 || round == 6) && ~IS_SINGLE_SAMPLE + %Moving individuals out of population until positive change + %in logml has occured + pop=0; + changesMadeNow = 0; + %Saving old values + poplogml = POP_LOGML; + partition = PARTITION; + cq_counts = CQ_COUNTS; + sp_counts = SP_COUNTS; + cq_sumcounts = CQ_SUMCOUNTS; + sp_sumcounts = SP_SUMCOUNTS; + logdiff = LOGDIFF; + + while (pop < npops && changesMadeNow == 0) + pop = pop+1; + totalChangeInLogml = 0; + inds = find(PARTITION==pop); + if round == 5 + %Random order + aputaulu = [inds rand(length(inds),1)]; + aputaulu = sortrows(aputaulu,2); + inds = aputaulu(:,1)'; + elseif round == 6 + inds = returnInOrder(inds, pop, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp); + end + + i=0; + + while (length(inds)>0 && i 1e-5 + i=length(inds); + end + end + end + + if totalChangeInLogml>1e-5 + if round == 5 + disp('action 5'); + elseif round == 6 + disp('action 6'); + end + tested = zeros(nRoundTypes,1); + changesMadeNow=1; + changesMade = 1; + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex, n_train_clusters); + if (added==1) + [worstLogml, worstIndex] = min(partitionSummary(:,2)); + end + end + else + % No better partition was found, restoring the old + % values + PARTITION = partition; + POP_LOGML = poplogml; + CQ_COUNTS = cq_counts; + SP_COUNTS = sp_counts; + CQ_SUMCOUNTS = cq_sumcounts; + SP_SUMCOUNTS = sp_sumcounts; + LOGDIFF = logdiff; + logml = logml - totalChangeInLogml; + end + end + clear partition; clear poplogml; + if changesMadeNow == 0 + tested(round) = 1; + end + + elseif round == 7 && ~IS_SINGLE_SAMPLE + emptyPop = findEmptyPop(npops); + j = 0; + pops = randperm(npops); + % totalChangeInLogml = 0; + if emptyPop == -1 + j = npops; + end + changesMadeNow = 0; + while (j < npops) + j = j +1; + pop = pops(j); + inds2 = find(PARTITION == pop); + ninds2 = length(inds2); + if ninds2 > 5 + partition = PARTITION; + cq_sumcounts = CQ_SUMCOUNTS; + cq_counts = CQ_COUNTS; + sp_sumcounts = SP_SUMCOUNTS; + sp_counts = SP_COUNTS; + poplogml = POP_LOGML; + logdiff = LOGDIFF; + % pack; + + dist2 = laskeOsaDist(inds2, dist, ninds); + Z2 = computeLinkage(dist2'); + T2 = cluster_own(Z2, 2); + % movingInds = inds2(find(T2 == 1)); + movingInds = inds2(logical(T2 == 1)); + changesInLogml = computeChanges3(T2, inds2, pop, ... + counts_cq, counts_sp, adjprior_cq, adjprior_sp); + totalChangeInLogml = changesInLogml(1, emptyPop); + + indCqCounts = uint16(sum(counts_cq(:,:,movingInds),3)); + indSpCounts = uint16(sum(counts_sp(:,:,movingInds),3)); + updateGlobalVariables3(movingInds, emptyPop,indCqCounts, ... + indSpCounts, adjprior_cq, adjprior_sp); + + changed = 1; + + while (changed == 1) + changed = 0; + + changesInLogml = computeChanges5(inds2, pop, emptyPop, ... + counts_cq, counts_sp, adjprior_cq, adjprior_sp); + [maxChange, index] = max(changesInLogml); + moving = inds2(index); + if (PARTITION(moving) == pop) + i2 = emptyPop; + else + i2 = pop; + end + + if maxChange > 1e-5 + indCqCounts = uint16(counts_cq(:,:,moving)); + indSpCounts = uint16(counts_sp(:,:,moving)); + updateGlobalVariables3(moving, i2,indCqCounts, ... + indSpCounts, adjprior_cq, adjprior_sp); + changed = 1; + totalChangeInLogml = totalChangeInLogml + maxChange; + end + end + + if totalChangeInLogml > 1e-5 + changesMade = 1; + changesMadeNow = 1; + logml = logml + totalChangeInLogml; + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex, n_train_clusters); + if (added==1) + [worstLogml, worstIndex] = min(partitionSummary(:,2)); + end + end + disp('action 7'); + tested = zeros(nRoundTypes, 1); + j = npops; + else + % No better partition was found, restoring the old + % values + PARTITION = partition; + POP_LOGML = poplogml; + CQ_COUNTS = cq_counts; + SP_COUNTS = sp_counts; + CQ_SUMCOUNTS = cq_sumcounts; + SP_SUMCOUNTS = sp_sumcounts; + LOGDIFF = logdiff; + %logml = logml - totalChangeInLogml; + end + end + end + if changesMadeNow == 0 + tested(round) = 1; + end + end + + end + + + if changesMade == 0 + if phase==1 + phase = 2; + elseif phase==2 + phase = 3; + elseif phase==3 + phase = 4; + elseif phase==4; + phase = 5; + elseif phase==5 + ready = 1; + end + else + changesMade = 0; + end + + if ready==0 + if phase==1 + roundTypes=[1]; + elseif phase==2 + roundTypes=[2]; + elseif phase==3 + roundTypes=[5 5 7]; + elseif phase==4 + roundTypes=[4 3 1 1]; + elseif phase==5 + roundTypes=[6 2 7 3 4 1]; + end + end + + end + % Saving results + + npops = removeEmptyPops(n_train_clusters,npops); + POP_LOGML = computePopulationLogml(1:npops, adjprior_cq, adjprior_sp); + + disp(['Found partition with ' num2str(npops) ' populations.']); + disp(['Log(ml) = ' num2str(logml)]); + disp(' '); + + if logml>logmlBest + % Updating the best found partition + logmlBest = logml; + npopsBest = npops; + partitionBest = PARTITION; + cq_countsBest = CQ_COUNTS; + sp_countsBest = SP_COUNTS; + cq_sumcountsBest = CQ_SUMCOUNTS; + sp_sumcountsBest = SP_SUMCOUNTS; + pop_logmlBest = POP_LOGML; + logdiffbest = LOGDIFF; + end +end + +c_result.logml = logmlBest; +c_result.npops = npopsBest; +c_result.PARTITION = partitionBest; +c_result.CQ_COUNTS = cq_countsBest; +c_result.SP_COUNTS = sp_countsBest; +c_result.CQ_SUMCOUNTS = cq_sumcountsBest; +c_result.SP_SUMCOUNTS = sp_sumcountsBest; +c_result.POP_LOGML = pop_logmlBest; +c_result.LOGDIFF = logdiffbest; +c_result.partitionSummary = partitionSummary; + +% Calculate the posterior probabilities if a sample i is moved to cluster +% j, from LOGDIFF (nsample*npops matrix) +% Each row of LOGDIFF represents logml changes if the sample is moved to +% the corresponding cluster +% See page 24 in BAPS manual +% added by Lu Cheng, 29.03.2010 +[ninds npops] = size(c_result.LOGDIFF); +clusterProbTable = zeros(ninds, npops); +for i=1:ninds + tmpRow = exp(c_result.LOGDIFF(i,:)); + clusterProbTable(i,:) = tmpRow/sum(tmpRow); +end +c_result.clusterProbTable = clusterProbTable; +% ------------ + +%-------------------------------------------------------------------------- +% The next three functions are for computing the initial partition +% according to the distance between the individuals + +function initial_partition = admixture_initialization(nclusters,Z) +initial_partition = cluster_own(Z,nclusters); + +%-------------------------------------------------------------------------- +function T = cluster_own(Z,nclust) +% true=logical(1); +% false=logical(0); + +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); +% maximum number of clusters based on inconsistency +if m <= maxclust + T = (1:m)'; +elseif maxclust==1 + T = ones(m,1); +else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end +end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + +%-------------------------------------------------------------------------- + +function dist2 = laskeOsaDist(inds2, dist, ninds) +% Muodostaa dist vektorista osavektorin, joka sisältää yksilöiden inds2 +% väliset etäisyydet. ninds=kaikkien yksilöiden lukumäärä. + +ninds2 = length(inds2); +apu = zeros(nchoosek(ninds2,2),2); +rivi = 1; +for i=1:ninds2-1 + for j=i+1:ninds2 + apu(rivi, 1) = inds2(i); + apu(rivi, 2) = inds2(j); + rivi = rivi+1; + end +end +apu = (apu(:,1)-1).*ninds - apu(:,1) ./ 2 .* (apu(:,1)-1) + (apu(:,2)-apu(:,1)); +dist2 = dist(apu); + +%-------------------------------------------------------------------------- + +function Z = computeLinkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 || m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +% monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end + +%-------------------------------------------------------------------------- + +function changes = computeChanges(ind, adjprior_cq, adjprior_sp, ... + indCqCounts, indSpCounts) +% Computes changes in log-marginal likelihood if individual ind is +% moved to another population +% +% Input: +% ind - the individual to be moved +% adjprior_cq & _sp - adjpriors for cliques and separators +% indCqCounts, indSpCounts - counts for individual ind +% +% Output: +% changes - table of size 1*npops. changes(i) = difference in logml if +% ind is move to population i. + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; +global LOGDIFF; + +%npops = size(CQ_COUNTS,3); deleted by Lu Cheng, 24.20.2010 +changes = LOGDIFF(ind,:); + +i1 = PARTITION(ind); +i1_logml = POP_LOGML(i1); +changes(i1) = 0; + +sumCq = uint16(sum(indCqCounts,1)); %n_loci * 1 vector +sumSp = uint16(sum(indSpCounts,1)); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + +new_i1_logml = computePopulationLogml(i1, adjprior_cq, adjprior_sp); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)+indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)+sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)+indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)+sumSp; + +i2 = find(changes==-Inf); +% i2 = setdiff(i2,i1); % deleted by Lu Cheng, 24.02.2010, since changes(i1) = 0 +i2_logml = POP_LOGML(i2); + +ni2 = length(i2); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+repmat(indCqCounts, [1 1 ni2]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+repmat(sumCq,[ni2 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+repmat(indSpCounts, [1 1 ni2]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:) + repmat(sumSp,[ni2 1]); + +new_i2_logml = computePopulationLogml(i2, adjprior_cq, adjprior_sp); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)-repmat(indCqCounts, [1 1 ni2]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)-repmat(sumCq,[ni2 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)-repmat(indSpCounts, [1 1 ni2]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:) - repmat(sumSp,[ni2 1]); + +changes(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; +LOGDIFF(ind,:) = changes; + +%------------------------------------------------------------------------------------ + +function changes = computeChanges2(i1, adjprior_cq, adjprior_sp) +% Computes changes in log marginal likelihood if population i1 is combined +% with another population +% +% Input: +% i1 - the population to be combined +% adjprior_cq & _sp - adjpriors for cliques and separators +% +% Output: +% changes - table of size 1*npops. changes(i) = difference in logml if +% i1 is combined with population i. + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global POP_LOGML; + +npops = size(CQ_COUNTS,3); +changes = zeros(npops,1); + +i1_logml = POP_LOGML(i1); +indCqCounts = CQ_COUNTS(:,:,i1); +indSpCounts = SP_COUNTS(:,:,i1); +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +new_i1_logml = 0; + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = POP_LOGML(i2); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+repmat(indCqCounts, [1 1 npops-1]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+repmat(sumCq,[npops-1 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+repmat(indSpCounts, [1 1 npops-1]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+ repmat(sumSp,[npops-1 1]); +% a = repmat(sumSp,[npops-1 1]); +% if ~any(sumSp) +% a(:,[1:size(a,2)])=[]; +% end +% SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+ a ; + + +new_i2_logml = computePopulationLogml(i2, adjprior_cq, adjprior_sp); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)-repmat(indCqCounts, [1 1 npops-1]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)-repmat(sumCq,[npops-1 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)-repmat(indSpCounts, [1 1 npops-1]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)- repmat(sumSp,[npops-1 1]); + +changes(i2) = new_i1_logml - i1_logml + new_i2_logml - i2_logml; + +%------------------------------------------------------------------------------------ + + +function changes = computeChanges3(T2, inds2, i1, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp) +% Computes changes in log marginal likelihood if subpopulation of i2 is +% moved to another population +% +% Input: +% T2 - partition of inds2 to subpopulations +% inds2 - individuals in population i1 +% i2 +% counts_cq, counts_sp - counts for individuals +% +% Output: +% changes - table of size length(unique(T2))*npops. +% changes(i,j) = difference in logml if subpopulation inds2(find(T2==i)) of +% i2 is moved to population j + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global POP_LOGML; + +npops = size(CQ_COUNTS,3); +npops2 = length(unique(T2)); +changes = zeros(npops2,npops); + +%cq_counts = CQ_COUNTS; +%sp_counts = SP_COUNTS; +%cq_sumcounts = CQ_SUMCOUNTS; +%sp_sumcounts = SP_SUMCOUNTS; + + +i1_logml = POP_LOGML(i1); + +for pop2 = 1:npops2 + % inds = inds2(find(T2==pop2)); + inds = inds2(T2==pop2); + ninds = length(inds); + if ninds>0 + indCqCounts = uint16(sum(counts_cq(:,:,inds),3)); + indSpCounts = uint16(sum(counts_sp(:,:,inds),3)); + sumCq = uint16(sum(indCqCounts,1)); + sumSp = uint16(sum(indSpCounts,1)); + + CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; + CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; + SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; + SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + + new_i1_logml = computePopulationLogml(i1, adjprior_cq, adjprior_sp); + + CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)+indCqCounts; + CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)+sumCq; + SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)+indSpCounts; + SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)+sumSp; + + i2 = [1:i1-1 , i1+1:npops]; + i2_logml = POP_LOGML(i2)'; + + CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+repmat(indCqCounts, [1 1 npops-1]); + CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+repmat(sumCq,[npops-1 1]); + SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+repmat(indSpCounts, [1 1 npops-1]); + SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+ repmat(sumSp,[npops-1 1]); + + new_i2_logml = computePopulationLogml(i2, adjprior_cq, adjprior_sp)'; + + CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)-repmat(indCqCounts, [1 1 npops-1]); + CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)-repmat(sumCq,[npops-1 1]); + SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)-repmat(indSpCounts, [1 1 npops-1]); + SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)- repmat(sumSp,[npops-1 1]); + + changes(pop2,i2) = new_i1_logml - i1_logml + new_i2_logml - i2_logml; + end +end + +%-------------------------------------------------------------------------- + +function changes = computeChanges5(inds, i1, i2, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp) +% Computes change in logml if individual of inds is moved between +% populations i1 and i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global POP_LOGML; global PARTITION; + +ninds = length(inds); +changes = zeros(ninds,1); + +i1_logml = POP_LOGML(i1); +i2_logml = POP_LOGML(i2); + +for i = 1:ninds + ind = inds(i); + if PARTITION(ind)==i1 + pop1 = i1; %from + pop2 = i2; %to + else + pop1 = i2; + pop2 = i1; + end + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + sumCq = uint16(sum(indCqCounts,1)); + sumSp = uint16(sum(indSpCounts,1)); + + CQ_COUNTS(:,:,pop1) = CQ_COUNTS(:,:,pop1)-indCqCounts; + CQ_SUMCOUNTS(pop1,:) = CQ_SUMCOUNTS(pop1,:)-sumCq; + SP_COUNTS(:,:,pop1) = SP_COUNTS(:,:,pop1)-indSpCounts; + SP_SUMCOUNTS(pop1,:) = SP_SUMCOUNTS(pop1,:) - sumSp; + + CQ_COUNTS(:,:,pop2) = CQ_COUNTS(:,:,pop2)+indCqCounts; + CQ_SUMCOUNTS(pop2,:) = CQ_SUMCOUNTS(pop2,:)+sumCq; + SP_COUNTS(:,:,pop2) = SP_COUNTS(:,:,pop2)+indSpCounts; + SP_SUMCOUNTS(pop2,:) = SP_SUMCOUNTS(pop2,:) + sumSp; + + new_logmls = computePopulationLogml([i1 i2], adjprior_cq, adjprior_sp); + changes(i) = sum(new_logmls); + + CQ_COUNTS(:,:,pop1) = CQ_COUNTS(:,:,pop1)+indCqCounts; + CQ_SUMCOUNTS(pop1,:) = CQ_SUMCOUNTS(pop1,:)+sumCq; + SP_COUNTS(:,:,pop1) = SP_COUNTS(:,:,pop1)+indSpCounts; + SP_SUMCOUNTS(pop1,:) = SP_SUMCOUNTS(pop1,:)+sumSp; + CQ_COUNTS(:,:,pop2) = CQ_COUNTS(:,:,pop2)-indCqCounts; + CQ_SUMCOUNTS(pop2,:) = CQ_SUMCOUNTS(pop2,:)-sumCq; + SP_COUNTS(:,:,pop2) = SP_COUNTS(:,:,pop2)-indSpCounts; + SP_SUMCOUNTS(pop2,:) = SP_SUMCOUNTS(pop2,:)-sumSp; +end + +changes = changes - i1_logml - i2_logml; + + +%------------------------------------------------------------------------------------- + + +function updateGlobalVariables(ind, i2, indCqCounts, indSpCounts, ... + adjprior_cq, adjprior_sp) +% Updates global variables when individual ind is moved to population i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; +global LOGDIFF; + +i1 = PARTITION(ind); +PARTITION(ind)=i2; + +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+indCqCounts; +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+sumCq; +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+indSpCounts; +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+sumSp; + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior_cq, adjprior_sp); + +LOGDIFF(:,[i1 i2]) = -Inf; +%inx = [find(PARTITION==i1); find(PARTITION==i2)]; +inx = (PARTITION==i1 | PARTITION==i2); % modified by Lu Cheng, 24.02.2010 +LOGDIFF(inx,:) = -Inf; + + +%--------------------------------------------------------------------------------- + + +function updateGlobalVariables2(i1, i2, adjprior_cq, adjprior_sp) +% Updates global variables when all individuals from population i1 are moved +% to population i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; +global LOGDIFF; + +% inds = find(PARTITION==i1); +% PARTITION(inds) = i2; +PARTITION(PARTITION==i1) = i2; + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+CQ_COUNTS(:,:,i1); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+CQ_SUMCOUNTS(i1,:); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+SP_COUNTS(:,:,i1); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+SP_SUMCOUNTS(i1,:); + +CQ_COUNTS(:,:,i1) = 0; +CQ_SUMCOUNTS(i1,:) = 0; +SP_COUNTS(:,:,i1) = 0; +SP_SUMCOUNTS(i1,:) = 0; + +POP_LOGML(i1) = 0; +POP_LOGML(i2) = computePopulationLogml(i2, adjprior_cq, adjprior_sp); + +LOGDIFF(:,[i1 i2]) = -Inf; +%inx = [find(PARTITION==i1); find(PARTITION==i2)]; +inx = (PARTITION==i1 | PARTITION==i2); % modified by Lu Cheng, 24.02.2010 +LOGDIFF(inx,:) = -Inf; + +%------------------------------------------------------------------------------------ + + +function updateGlobalVariables3(muuttuvat, i2, indCqCounts, indSpCounts, ... + adjprior_cq, adjprior_sp) +% Updates global variables when individuals muuttuvat are moved to +% population i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; +global LOGDIFF; + +i1 = PARTITION(muuttuvat(1)); %Why only one individual is moved? Lu Cheng, 24.02.2010 +PARTITION(muuttuvat) = i2; + +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+indCqCounts; +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+sumCq; +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+indSpCounts; +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+sumSp; + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior_cq, adjprior_sp); + +LOGDIFF(:,[i1 i2]) = -Inf; +inx = [find(PARTITION==i1); find(PARTITION==i2)]; +LOGDIFF(inx,:) = -Inf; + +%---------------------------------------------------------------------- + + +function inds = returnInOrder(inds, pop, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp) +% Returns individuals inds in order according to the change in the logml if +% they are moved out of the population pop + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; + +ninds = length(inds); +apuTaulu = [inds, zeros(ninds,1)]; + +for i=1:ninds + ind = inds(i); + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + sumCq = uint16(sum(indCqCounts,1)); + sumSp = uint16(sum(indSpCounts,1)); + + CQ_COUNTS(:,:,pop) = CQ_COUNTS(:,:,pop)-indCqCounts; + CQ_SUMCOUNTS(pop,:) = CQ_SUMCOUNTS(pop,:)-sumCq; + SP_COUNTS(:,:,pop) = SP_COUNTS(:,:,pop)-indSpCounts; + SP_SUMCOUNTS(pop,:) = SP_SUMCOUNTS(pop,:)-sumSp; + + apuTaulu(i, 2) = computePopulationLogml(pop, adjprior_cq, adjprior_sp); + + CQ_COUNTS(:,:,pop) = CQ_COUNTS(:,:,pop)+indCqCounts; + CQ_SUMCOUNTS(pop,:) = CQ_SUMCOUNTS(pop,:)+sumCq; + SP_COUNTS(:,:,pop) = SP_COUNTS(:,:,pop)+indSpCounts; + SP_SUMCOUNTS(pop,:) = SP_SUMCOUNTS(pop,:)+sumSp; +end +apuTaulu = sortrows(apuTaulu,2); +inds = apuTaulu(ninds:-1:1,1); + + +%-------------------------------------------------------------------------- + +function clearGlobalVars + +global CQ_COUNTS; CQ_COUNTS = []; +global CQ_SUMCOUNTS; CQ_SUMCOUNTS = []; +global SP_COUNTS; SP_COUNTS = []; +global SP_SUMCOUNTS; SP_SUMCOUNTS = []; +global PARTITION; PARTITION = []; +global POP_LOGML; POP_LOGML = []; +global LOGDIFF; LOGDIFF = []; + +global TRAIN_CQ_COUNTS; TRAIN_CQ_COUNTS=[]; +global TRAIN_CQ_SUMCOUNTS; TRAIN_CQ_SUMCOUNTS = []; +global TRAIN_SP_COUNTS; TRAIN_SP_COUNTS = []; +global TRAIN_SP_SUMCOUNTS; TRAIN_SP_SUMCOUNTS = []; + + +%-------------------------------------------------------------------------- + +function new_npops = removeEmptyPops(n_train_clusters, npops) +% Removes empty pops from all global COUNTS variables. +% Updates PARTITION and npops + +% This function is modified by Lu Cheng, 08.03.2010 +% n_train_clusters: number of clusters in the training data +% npops: the user defined max potential population number + +global CQ_COUNTS; +global CQ_SUMCOUNTS; +global SP_COUNTS; +global SP_SUMCOUNTS; +global PARTITION; +global LOGDIFF; + +global TRAIN_CQ_COUNTS; +global TRAIN_CQ_SUMCOUNTS; +global TRAIN_SP_COUNTS; +global TRAIN_SP_SUMCOUNTS; + +%---------ORIGINAL CODES START------------------% +% notEmpty = find(any(CQ_SUMCOUNTS,2)); +% CQ_COUNTS = CQ_COUNTS(:,:,notEmpty); +% CQ_SUMCOUNTS = CQ_SUMCOUNTS(notEmpty,:); +% SP_COUNTS = SP_COUNTS(:,:,notEmpty); +% SP_SUMCOUNTS = SP_SUMCOUNTS(notEmpty,:); +% LOGDIFF = LOGDIFF(:,notEmpty); +% +% for n=1:length(notEmpty) +% PARTITION(PARTITION==notEmpty(n)) = n; +% end +% npops = length(notEmpty); +%---------ORIGINAL CODES END---------------------% + +pops = unique(PARTITION); +pops = pops(pops>n_train_clusters); +if isempty(pops) && npops>n_train_clusters + CQ_COUNTS(:,:,n_train_clusters+1:npops) = []; + CQ_SUMCOUNTS(n_train_clusters+1:npops,:) = []; + SP_COUNTS(:,:,n_train_clusters+1:npops) = []; + SP_SUMCOUNTS(n_train_clusters+1:npops,:) = []; + + TRAIN_CQ_COUNTS(:,:,n_train_clusters+1:npops) = []; + TRAIN_CQ_SUMCOUNTS(n_train_clusters+1:npops,:) = []; + TRAIN_SP_COUNTS(:,:,n_train_clusters+1:npops) = []; + TRAIN_SP_SUMCOUNTS(n_train_clusters+1:npops,:) = []; + + LOGDIFF(:,n_train_clusters+1:npops) = []; + new_npops = n_train_clusters; + return +elseif isempty(pops) + % do nothing + + new_npops = npops; + return +else + n_nonempty = length(pops); %the pops should be in ascending order + for i=1:n_nonempty + PARTITION(PARTITION==pops(i))= n_train_clusters+i; + CQ_COUNTS(:,:,n_train_clusters+i) = CQ_COUNTS(:,:,pops(i)); + CQ_SUMCOUNTS(n_train_clusters+i,:) = CQ_SUMCOUNTS(pops(i),:); + SP_COUNTS(:,:,n_train_clusters+i) = SP_COUNTS(:,:,pops(i)); + SP_SUMCOUNTS(n_train_clusters+i,:) = SP_SUMCOUNTS(pops(i),:); + + LOGDIFF(:,n_train_clusters+i) = LOGDIFF(:,pops(i)); + end + + CQ_COUNTS(:,:,n_train_clusters+n_nonempty+1:npops) = []; + CQ_SUMCOUNTS(n_train_clusters+n_nonempty+1:npops,:) = []; + SP_COUNTS(:,:,n_train_clusters+n_nonempty+1:npops) = []; + SP_SUMCOUNTS(n_train_clusters+n_nonempty+1:npops,:) = []; + + TRAIN_CQ_COUNTS(:,:,n_train_clusters+n_nonempty+1:npops) = []; + TRAIN_CQ_SUMCOUNTS(n_train_clusters+n_nonempty+1:npops,:) = []; + TRAIN_SP_COUNTS(:,:,n_train_clusters+n_nonempty+1:npops) = []; + TRAIN_SP_SUMCOUNTS(n_train_clusters+n_nonempty+1:npops,:) = []; + + LOGDIFF(:,n_train_clusters+n_nonempty+1:npops) = []; + + new_npops = n_train_clusters+n_nonempty; +end + + + +%-------------------------------------------------------------------------- + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex, n_train_pops) +% Tiedetään, että annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ssä ei vielä ole +% annettua logml arvoa, niin lisätään worstIndex:in kohtaan uusi logml ja +% nykyistä partitiota vastaava nclusters:in arvo. Muutoin ei tehdä mitään. + +% global PARTITION; +% apu = isempty(find(abs(partitionSummary(:,2)-logml)<1e-5,1)); +% if apu +% % Nyt löydetty partitio ei ole vielä kirjattuna summaryyn. +% npops = length(unique(PARTITION)); +% partitionSummary(worstIndex,1) = npops; +% partitionSummary(worstIndex,2) = logml; +% added = 1; +% else +% added = 0; +% end + +% Modified by Lu Cheng, 11.03.2010 +% The population number should be greater than or equal to the training +% population number +global PARTITION; +apu = isempty(find(abs(partitionSummary(:,2)-logml)<1e-5,1)); +if apu + npops = sum(unique(PARTITION)>n_train_pops)+n_train_pops; + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end + +%-------------------------------------------------------------------------- + +function [counts, sumcounts] = initialCounts(ind_counts) + +global PARTITION; + +pops = unique(PARTITION); +npops = max(pops); + +counts = zeros(size(ind_counts,1), size(ind_counts,2), npops,'uint16'); +sumcounts = zeros(npops, size(ind_counts,2),'uint16'); + +for i = 1:npops + inds = (PARTITION == i); + counts(:,:,i) = sum(ind_counts(:,:,inds), 3); + sumcounts(i,:) = sum(counts(:,:,i),1); +end + +%-------------------------------------------------------------------------- + +function [counts, sumcounts] = calCounts(ind_counts, npops) +% This function calculates the counts for each cluster from the sample data +% 'ind_counts', which are the counts for each individual +% The input 'npops' is a user defined value, which means the max number of +% potential populations in the data. Usually it is bigger than the number +% of populations given by 'PARTITION'. + +% modified by Lu Cheng, 08.03.2010 + +global PARTITION; % n_samples * 1 vector, indicate the cluster_id for each sample + +pops = unique(PARTITION); +pops_len = length(pops); + +[n_alle n_loci n_inds] = size(ind_counts); +counts = zeros(n_alle, n_loci, npops,'uint16'); +sumcounts = zeros(npops, n_loci,'uint16'); + +for i = 1:pops_len + cluster = pops(i); + inds = (PARTITION == cluster); + counts(:,:,cluster) = sum(ind_counts(:,:,inds), 3); + sumcounts(cluster,:) = squeeze(sum(counts(:,:,cluster),1)); +end + +%-------------------------------------------------------------------------- + +function logml = computeLogml(adjprior_cq, adjprior_sp) + +% calculate the sum logml for all the populations + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; + +global TRAIN_CQ_COUNTS; +global TRAIN_SP_COUNTS; +global TRAIN_CQ_SUMCOUNTS; +global TRAIN_SP_SUMCOUNTS; + +npops = size(TRAIN_CQ_COUNTS,3); % Modified by Lu Cheng, 08.03.2010 +cq_counts = TRAIN_CQ_COUNTS + double(CQ_COUNTS); +cq_sumcounts = TRAIN_CQ_SUMCOUNTS + double(CQ_SUMCOUNTS); +sp_counts = TRAIN_SP_COUNTS + double(SP_COUNTS); +sp_sumcounts = TRAIN_SP_SUMCOUNTS + double(SP_SUMCOUNTS); + +cq_logml = sum(sum(sum(gammaln(cq_counts+repmat(adjprior_cq,[1 1 npops]))))) ... + - npops*sum(sum(gammaln(adjprior_cq))) - ... + sum(sum(gammaln(1+cq_sumcounts))); + +sp_logml = sum(sum(sum(gammaln(sp_counts+repmat(adjprior_sp,[1 1 npops]))))) ... + - npops*sum(sum(gammaln(adjprior_sp))) - ... + sum(sum(gammaln(1+sp_sumcounts))); + +logml = cq_logml - sp_logml; +clear cq_counts cq_sumcounts sp_counts sp_sumcounts; + +%-------------------------------------------------------------------------- + +function popLogml = computePopulationLogml(pops, adjprior_cq, adjprior_sp) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + +% calculate logml for each population given by pops +% returns a length(pops)*1 vector + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; + +global TRAIN_CQ_COUNTS; +global TRAIN_SP_COUNTS; +global TRAIN_CQ_SUMCOUNTS; +global TRAIN_SP_SUMCOUNTS; + +%npops = size(TRAIN_CQ_COUNTS,3); % Modified by Lu Cheng, 08.03.2010 +cq_counts = TRAIN_CQ_COUNTS + double(CQ_COUNTS); +cq_sumcounts = TRAIN_CQ_SUMCOUNTS + double(CQ_SUMCOUNTS); +sp_counts = TRAIN_SP_COUNTS + double(SP_COUNTS); +sp_sumcounts = TRAIN_SP_SUMCOUNTS + double(SP_SUMCOUNTS); + +nall_cq = size(CQ_COUNTS,1); +nall_sp = size(SP_COUNTS,1); +ncliq = size(CQ_COUNTS,2); +nsep = size(SP_COUNTS, 2); + +z = length(pops); + +popLogml_cq = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior_cq,[1 1 z]) + cq_counts(:,:,pops)) ... + ,[nall_cq ncliq z]),1),2)) - sum(gammaln(1+cq_sumcounts(pops,:)),2) - ... + sum(sum(gammaln(adjprior_cq))); + +popLogml_sp = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior_sp,[1 1 z]) + sp_counts(:,:,pops)) ... + ,[nall_sp nsep z]),1),2)) - sum(gammaln(1+sp_sumcounts(pops,:)),2) - ... + sum(sum(gammaln(adjprior_sp))); + +popLogml = popLogml_cq - popLogml_sp; +clear cq_counts cq_sumcounts sp_counts sp_sumcounts; + +%------------------------------------------------------------------- + + + + +%-------------------------------------------------------------- +function newline = takeLine(description,width) +%Returns one line from the description: line ends to the first +%space after width:th mark. +% newLine = description(1:width); +n = width+1; +while ~isspace(description(n)) && n= 1 + suurinYks = suurinYks+1; + end + if suurinYks<10 + mjono(7) = num2str(suurinYks); + mjono(6) = 'e'; + mjono(5) = palautaYks(abs(logml),suurinYks-1); + mjono(4) = '.'; + mjono(3) = palautaYks(abs(logml),suurinYks); + if logml<0 + mjono(2) = '-'; + end + elseif suurinYks>=10 + mjono(6:7) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(logml),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(logml),suurinYks); + if logml<0 + mjono(1) = '-'; + end + end +end + +function digit = palautaYks(num,yks) +% palauttaa luvun num 10^yks termin kertoimen +% string:inä +% yks täytyy olla kokonaisluku, joka on +% vähintään -1:n suuruinen. Pienemmillä +% luvuilla tapahtuu jokin pyöristysvirhe. + +if yks>=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); + + +%-------------------------------------------------------------------------- + +function [emptyPop, pops] = findEmptyPop(npops) +% Palauttaa ensimmäisen tyhjän populaation indeksin. Jos tyhjiä +% populaatioita ei ole, palauttaa -1:n. + +global PARTITION; +pops = unique(PARTITION)'; +if (length(pops) ==npops) + emptyPop = -1; +else + popDiff = diff([0 pops npops+1]); + emptyPop = min(find(popDiff > 1)); +end + +%-------------------------------------------------------------------------- + + diff --git a/matlab/independent/semi_linkageMixture_speed.m b/matlab/independent/semi_linkageMixture_speed.m new file mode 100644 index 0000000..07083d7 --- /dev/null +++ b/matlab/independent/semi_linkageMixture_speed.m @@ -0,0 +1,651 @@ +function semi_linkageMixture_speed(c_train, c_test) +% This function process adjusts the priors of the training data accoring to +% the test data. Based on the adjusted priors, the test data is clustered. + +% modified from linkageMixture_speed.m by Lu Cheng, 16.02.2010 + +% Update by Lu Cheng, 07.03.2011 +% case of only 1 sample in the test data has been handled + +% added by Lu Cheng, 11.03.2010 +global SCRIPT_MODE; +global PARAMETERS; +if isempty(SCRIPT_MODE) + SCRIPT_MODE = false; +end +% ----------------- + + +%% compare the training data and test data, adjust priors + +%1% Compare the training data and test data to adjust the prior +if ~all(all(c_train.component_mat == c_test.component_mat)) + disp('The gene lengths are different between the training data and the test data!'); + return; +end + +flag = false; % whether the trained priors should be adjusted +n_loci = size(c_train.alleleCodes,2); + +if c_train.rowsFromInd ~= c_test.rowsFromInd + error('Inconsistant rows from each individual. Train: %d Test: %d. Quit! \n', ... + c_train.rowsFromInd, c_test.rowsFromInd); + return; +elseif c_train.rowsFromInd > 1 + error('Data must be haploid. Quit! rowsFromInd: %d.\n', c_train.rowsFromInd); + return; +end + +for i=1:n_loci + if flag; break; end + a = setdiff(c_test.alleleCodes(:,i),c_train.alleleCodes(:,i)); + a = a(a~=0); + if ~isempty(a) + flag = true; + fprintf('New alleles are detected in the test data at loci: %d\n',i); + fprintf('The processing time will be much longer than usual. \n'); + end +end + +%2% reprocess the data for clustering +if flag + + % combing the training data and test data, adjust the priors + combine_data = [c_train.data; c_test.data]; + n_train = size(c_train.data,1); + n_samples = size(combine_data,1); + combine_data(:,end) = (1:n_samples)'; + [data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(combine_data); + + index = data(:,end); + [data_clique, data_separator, noalle_clique, noalle_separator] = ... + transform4(data, c_train.component_mat,'codon'); + data_clique = [data_clique index]; + data_separator = [data_separator index]; + + % Count the data + [counts_cq, nalleles_cq, prior_cq, adjprior_cq, genotypes_cq]... + = allfreqsnew2(data_clique, double(noalle_clique)); + [counts_sp, nalleles_sp, prior_sp, adjprior_sp, genotypes_sp]... + = allfreqsnew2(data_separator, double(noalle_separator)); + + counts_cq = uint16(counts_cq); + counts_sp = uint16(counts_sp); + + c_train.adjprior = adjprior; + + c_train.counts_cq = counts_cq(:,:,1:n_train); + c_train.counts_sp = counts_sp(:,:,1:n_train); + + c_test.counts_cq = counts_cq(:,:,n_train+1:end); + c_test.counts_sp = counts_sp(:,:,n_train+1:end); + + c_train.adjprior_cq = adjprior_cq; + c_train.adjprior_sp = adjprior_sp; + + c_train.alleleCodes = alleleCodes; + c_train.noalle = noalle; + + clear data rowsFromInd alleleCodes noalle adjprior priorTerm index + clear data_clique data_separator noalle_clique noalle_separator + clear counts_cq counts_sp nalleles_cq nalleles_sp prior_cq prior_sp adjprior_cq adjprior_sp + +else + + % KEY: adjust the test data to fit the configuration of the training data + % the 'codes_cq' and 'codes_sp' are directly translated from DNA sequence + % SEE 'i_encode_n.m' under the linkage folder + + num_cq = size(c_train.counts_cq, 1); num_sp = size(c_train.counts_sp, 1); + n_loci_cq = size(c_train.counts_cq, 2); n_loci_sp = size(c_train.counts_sp, 2); + n_inds = size(c_test.counts_cq, 3); + + counts_cq = zeros(num_cq, n_loci_cq, n_inds); + counts_sp = zeros(num_sp, n_loci_sp, n_inds); + + % mapping the indexes of cliques and separators of the test data to the + % indexes of the training data + for k = 1:n_inds + for j = 1:n_loci_cq + [c, ia, ib] = intersect(c_test.codes_cq{j}, c_train.codes_cq{j},'rows'); + counts_cq(ib,j,k) = c_test.counts_cq(ia,j,k); + end + + for j = 1:n_loci_sp + [c, ia, ib] = intersect(c_test.codes_sp{j}, c_train.codes_sp{j},'rows'); + counts_sp(ib,j,k) = c_test.counts_sp(ia,j,k); + end + end + + c_test.counts_cq = counts_cq; + c_test.counts_sp = counts_sp; + + clear c ia ib k j i; + +end + +%% cluster the test data + +% case of only 1 sample in the test data, added by Lu Cheng, 07.03.2011 +if size(c_test.data,1)~=1 + [Z,dist] = newGetDistances(c_test.data, c_test.rowsFromInd); + c_test.Z = Z; + c_test.dist = dist; +end + +clear Z dist; + +message = cat(2,'There are currently ',num2str(length(unique(c_train.cluster_labels))),' clusters in the training data, please input upper bounds of cluster numbers in the test data.'); + +if SCRIPT_MODE + cluster_nums = str2num(PARAMETERS.cluster_num_upperbounds); +else + cluster_nums = inputdlg(message); + if isempty(cluster_nums) == 1 + return; + else + cluster_nums = str2num(cluster_nums{:}); + end +end + +% % Test purpose, Check the input data, there should be 1 allele in each loci (column) +% % Lu Cheng, 25.02.2010 +% if ~all(all(squeeze(sum(c_train.counts_cq,1)))) +% disp('Missing cq value of some sample in counts_cq of the training data'); +% return; +% elseif ~all(all(squeeze(sum(c_train.counts_sp,1)))) +% disp('Missing sp value of some sample in counts_sp of the training data'); +% return; +% elseif ~all(all(squeeze(sum(c_test.counts_cq,1)))) +% disp('Missing cq value of some sample in counts_cq of the test data'); +% return; +% elseif ~all(all(squeeze(sum(c_test.counts_sp,1)))) +% disp('Missing sp value of some sample in counts_sp of the test data'); +% return; +% end + +tic +semi_res = semi_linkageMix(c_train, c_test, cluster_nums); +toc + +semi_res.popnames = c_test.popnames; + +writeMixtureInfo(semi_res); + +% save the results +if SCRIPT_MODE + save_results = PARAMETERS.save_results; +else + save_results = questdlg('Do you wish to save the results?',... + 'Save Results','Yes','No','Yes'); +end + +if isequal(save_results,'Yes') + if SCRIPT_MODE + save(PARAMETERS.result_file,'semi_res','-v7.3'); + else + [filename, pathname] = uiputfile('*.mat','Save the results as'); + if (sum(filename)==0) || (sum(pathname)==0) + % do nothing + else + save(strcat(pathname,filename),'semi_res','-v7.3'); + end + end +end; + +% ----------------------------------------------------------------------- + + + +%-------------------------------------------------------------------------- +%% The next three functions are for computing the initial partition +% according to the distance between the individuals + +function initial_partition=admixture_initialization(nclusters,Z) +T=cluster_own(Z,nclusters); +initial_partition=T; + +function T = cluster_own(Z,nclust) +% true=logical(1); +% false=logical(0); + +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); +% maximum number of clusters based on inconsistency +if m <= maxclust + T = (1:m)'; +elseif maxclust==1 + T = ones(m,1); + +else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end +end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + +%-------------------------------------------------------------------------- + +function Z = computeLinkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 || m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +% monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end + +%-------------------------------------------------------------------------- + +%------------------------------------------------------------------------- + +function [Z, dist] = newGetDistances(data, rowsFromInd) + +ninds = max(data(:,end)); +nloci = size(data,2)-1; +riviLkm = nchoosek(double(ninds),2); + +% empties = find(data<0); +% data(empties)=0; +data(logical(data<0)) = 0; +data = uint16(data); + +pariTaulu = zeros(riviLkm,2); +aPointer=1; + +for a=1:ninds-1 + pariTaulu(aPointer:aPointer+double(ninds-1-a),1) = ones(ninds-a,1,'uint16')*a; + pariTaulu(aPointer:aPointer+double(ninds-1-a),2) = uint16((a+1:ninds)'); + aPointer = aPointer+double(ninds-a); +end + +eka = pariTaulu(:,ones(1,rowsFromInd)); +eka = eka * rowsFromInd; +miinus = repmat(rowsFromInd-1 : -1 : 0, [riviLkm 1]); +eka = eka - miinus; + +toka = pariTaulu(:,ones(1,rowsFromInd)*2); +toka = toka * rowsFromInd; +toka = toka - miinus; + +eka = uint16(eka); +toka = uint16(toka); + +clear pariTaulu; clear miinus; + +summa = uint16(zeros(riviLkm,1)); +vertailuja = uint16(zeros(riviLkm,1)); + +x = zeros(size(eka)); x = uint16(x); +y = zeros(size(toka)); y = uint16(y); +% fprintf(1,'%%10'); +for j=1:nloci; + + for k=1:rowsFromInd + x(:,k) = data(eka(:,k),j); + y(:,k) = data(toka(:,k),j); + end + + for a=1:rowsFromInd + for b=1:rowsFromInd + vertailutNyt = uint16(x(:,a)>0 & y(:,b)>0); + vertailuja = vertailuja + vertailutNyt; + lisays = (x(:,a)~=y(:,b) & vertailutNyt); + summa = summa + uint16(lisays); + end + end + % fprintf(1,'\b\b'); + % fprintf(1,'%d',floor(10+80*j/nloci)); +end + +clear x; clear y; clear vertailutNyt; +clear eka; clear toka; clear data; clear lisays; +dist = zeros(length(vertailuja),1); +% nollat = find(vertailuja==0); +% dist(nollat) = 1; +dist(logical(vertailuja==0)) = 1; +muut = find(vertailuja>0); +dist(muut) = double(summa(muut))./double(vertailuja(muut)); +clear summa; clear vertailuja; clear muut; + +Z = computeLinkage(dist'); +% fprintf(1,'\b\b'); +% fprintf(1,'%d\n',100); +%-------------------------------------------------------------------------- + +function writeMixtureInfo(c) + +outputFile = 'baps5_semi_output.txt'; + +% output the semi-supervised clustering results to the outputFile +% modified by Lu Cheng, 28.03.2010 + +ninds = length(c.PARTITION); +npops = c.npops; +popnames = c.popnames; +logml = c.logml; +partition = c.PARTITION; +partitionSummary = c.partitionSummary; + +if ~isempty(outputFile) + fid = fopen(outputFile,'w'); +else + fid = -1; + %diary('baps5_semi_output.baps'); % save in text anyway. +end + +dispLine; +disp('RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:'); +disp(['Number of clustered individuals: ' ownNum2Str(ninds)]); +disp(['Number of groups in optimal partition: ' ownNum2Str(npops)]); +disp(['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); +disp(' '); +if (fid ~= -1) + fprintf(fid,'%10s\n', ['RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:']); + fprintf(fid,'%20s\n', ['Number of clustered individuals: ' ownNum2Str(ninds)]); + fprintf(fid,'%20s\n', ['Number of groups in optimal partition: ' ownNum2Str(npops)]); + fprintf(fid,'%20s\n\n', ['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); +end + +disp('Best Partition: '); +if (fid ~= -1) + fprintf(fid,'%s \n','Best Partition: '); +end +for m=1:npops + indsInM = find(partition==m); + + if isempty(indsInM) + continue; + end + + length_of_beginning = 11 + floor(log10(m)); + cluster_size = length(indsInM); + + text = ['Cluster ' num2str(m) ': {' char(popnames{indsInM(1)})]; + for k = 2:cluster_size + text = [text ', ' char(popnames{indsInM(k)})]; + end; + text = [text '}']; + + while length(text)>58 + %Take one line and display it. + new_line = takeLine(text,58); + text = text(length(new_line)+1:end); + disp(new_line); + if (fid ~= -1) + fprintf(fid,'%s \n',new_line); + end + if length(text)>0 + text = [blanks(length_of_beginning) text]; + else + text = []; + end; + end; + + if ~isempty(text) + disp(text); + if (fid ~= -1) + fprintf(fid,'%s \n',text); + end + end; +end + +names = true; + +clusterProbTable = c.clusterProbTable; +if npops == 1 + clusterProbTable = []; +else + disp(''); + disp('Posterior probability of assignment into clusters:'); + + if (fid ~= -1) + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', 'Posterior probability of assignment into clusters: '); fprintf(fid, '\n'); + end + + text = sprintf('%10s','ind'); + for ii = 1:npops + tmpstr = sprintf('%10s',num2str(ii)); + text = [text tmpstr]; + end + + disp(text); + if (fid ~= -1) + fprintf(fid, '%s \n', text); + end + + for ii = 1:ninds + text = sprintf('%10s',popnames{ii}{:}); + for jj = 1:npops + tmpstr = sprintf('%10s',num2str(clusterProbTable(ii,jj),'%10.6f')); + text = [text tmpstr]; + end + + if ii<100 + disp(text); + elseif ii==101 + disp('.......................................'); + disp('..........see output file..............'); + end + if (fid ~= -1) + fprintf(fid, '%s \n', text); + end + text = []; + end +end + +disp(' '); +disp(' '); +disp('List of sizes of 10 best visited partitions and corresponding log(ml) values'); + +if (fid ~= -1) + fprintf(fid, '%s \n\n', ' '); + fprintf(fid, '%s \n', 'List of sizes of 10 best visited partitions and corresponding log(ml) values'); fprintf(fid, '\n'); +end + +partitionSummary = sortrows(partitionSummary,2); +partitionSummary = partitionSummary(size(partitionSummary,1):-1:1 , :); +partitionSummary = partitionSummary(logical(partitionSummary(:,2)>-1e49),:); +if size(partitionSummary,1)>10 + vikaPartitio = 10; +else + vikaPartitio = size(partitionSummary,1); +end +for part = 1:vikaPartitio + line = [num2str(partitionSummary(part,1),'%20d') ' ' num2str(partitionSummary(part,2),'%20.6f')]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', line); + end +end + +if (fid ~= -1) + fclose(fid); +else + diary off +end + +%-------------------------------------------------------------- +function newline = takeLine(description,width) +%Returns one line from the description: line ends to the first +%space after width:th mark. +% newLine = description(1:width); +n = width+1; +while ~isspace(description(n)) && n=0)); + alleelitLokuksessa{i,1} = alleelitLokuksessaI(logical(alleelitLokuksessaI>=0)); + noalle(i) = length(alleelitLokuksessa{i,1}); +end +alleleCodes = zeros(max(noalle),nloci); +for i=1:nloci + alleelitLokuksessaI = alleelitLokuksessa{i,1}; + puuttuvia = max(noalle)-length(alleelitLokuksessaI); + alleleCodes(:,i) = [alleelitLokuksessaI; zeros(puuttuvia,1)]; +end + +for loc = 1:nloci + for all = 1:noalle(loc) + % data(find(data(:,loc)==alleleCodes(all,loc)), loc)=all; + data(logical(data(:,loc)==alleleCodes(all,loc)), loc)=all; + end; +end; + +nind = max(data(:,end)); +nrows = size(data,1); +ncols = size(data,2); +rowsFromInd = zeros(nind,1); +for i=1:nind + rowsFromInd(i) = length(find(data(:,end)==i)); +end +maxRowsFromInd = max(rowsFromInd); +a = -999; +emptyRow = repmat(a, 1, ncols); +lessThanMax = find(rowsFromInd < maxRowsFromInd); +missingRows = maxRowsFromInd*nind - nrows; +data = [data; zeros(missingRows, ncols)]; +pointer = 1; +for ind=lessThanMax' %K�y l�pi ne yksil�t, joilta puuttuu rivejï¿? + miss = maxRowsFromInd-rowsFromInd(ind); % T�ltï¿?yksil�ltï¿?puuttuvien lkm. + for j=1:miss + rowToBeAdded = emptyRow; + rowToBeAdded(end) = ind; + data(nrows+pointer, :) = rowToBeAdded; + pointer = pointer+1; + end +end +data = sortrows(data, ncols); % Sorttaa yksil�iden mukaisesti +newData = data; +rowsFromInd = maxRowsFromInd; + +adjprior = zeros(max(noalle),nloci); +priorTerm = 0; +for j=1:nloci + adjprior(:,j) = [repmat(1/noalle(j), [noalle(j),1]) ; ones(max(noalle)-noalle(j),1)]; + priorTerm = priorTerm + noalle(j)*gammaln(1/noalle(j)); +end + +%-------------------------------------------------------------------------- diff --git a/matlab/independent/tmpscript.txt b/matlab/independent/tmpscript.txt new file mode 100644 index 0000000..9937a8e --- /dev/null +++ b/matlab/independent/tmpscript.txt @@ -0,0 +1,5 @@ +datafile('train','C:\BAPS5\burk_test\train_data.mat') +datafile('test','C:\BAPS5\burk_test\testdata\testdata_51_5.xls') +savePreproFile('test','C:\BAPS5\burk_test\testpreproc\testpreproc_51_5.mat') +setK('16') +outputmat('C:\BAPS5\burk_test\testres\testres_51_5.mat') diff --git a/matlab/independent/trainedMix.m b/matlab/independent/trainedMix.m new file mode 100644 index 0000000..0869457 --- /dev/null +++ b/matlab/independent/trainedMix.m @@ -0,0 +1,2293 @@ +function trainedMix + +% LASKENNAN ALKUARVOJEN M��RITT�MINEN + +global SCRIPT_MODE; + +if isempty(SCRIPT_MODE) + SCRIPT_MODE = false; +end + +if SCRIPT_MODE + input_type = 'MLST-format'; +else + input_type = questdlg('Specify the format of your data: ',... + 'Specify Data Format', ... + 'MLST-format', 'GenePop-format','MLST-format'); +end + +switch input_type + case 'MLST-format' + disp('MLST-format'); + processMLST + case 'GenePop-format' + disp('GenePop-format'); + processGenePop +end + +%-------------------------------------------------------------------------- + +function processMLST +% note that this version only works for windows with Excel installed +% Lu Cheng, 02.02.2010 +% lu.cheng@helsinki.fi + +% added by Lu Cheng, 11.03.2010 +global SCRIPT_MODE; +global PARAMETERS; +if isempty(SCRIPT_MODE) + SCRIPT_MODE = false; +end +% ---------- + +tmp_train_file = 'tmp8972_train.xls'; +if exist(tmp_train_file,'file')==2 + delete(tmp_train_file); +end + +%% process both the training data and test data + +% Format of the training excel file +% column 1: sample ID +% column 2: cluster label of each sample, an integer from 1 to K +% column 3-n: sequences of each gene +format1 = 'MS EXCEL FORMAT'; +format2 = 'PREPROCESSED FORMAT'; + +if SCRIPT_MODE + if isequal(PARAMETERS.train_file_format,'.xls') + input_type = format1; + elseif isequal(PARAMETERS.train_file_format,'.mat') + input_type = format2; + end +else + input_type = questdlg('Specify the format of your training data: ',... + 'Specify Data Format', format1, format2, format1); +end + +switch input_type + case format1 + + if SCRIPT_MODE + trained_file = PARAMETERS.train_file_name; + else + [filename, pathname] = uigetfile('*.xls', strcat('Load training data in',' ',format1)); + if filename==0 + return; + end + trained_file = strcat(pathname,filename); + end + + [A B] = xlsread(trained_file); + if size(B,1) == length(A)+1 + B(2:end,1) = num2cell(A(:,1)); + else + B(:,1) = num2cell(A(:,1)); + end + + train_xls = B(:,[1 3:end]); + cluster_labels = A(:,2); + + % the unique labels should be tightly from 1 to K + % added by Lu Cheng, 22.06.2010 + unique_labels = unique(cluster_labels); + if max(unique_labels)~=length(unique_labels) + error('The cluster labels are wrong, should be from 1 to %s !', num2str(length(unique_labels))); + end + + xlswrite(tmp_train_file,train_xls); + clear A B trained_file unique_labels + + c_train = preprocessXLS(tmp_train_file); + c_train.cluster_labels = cluster_labels; + delete(tmp_train_file); + + if SCRIPT_MODE + save_preproc = PARAMETERS.save_prepro_train_data; + else + save_preproc = questdlg('Do you wish to save the pre-processed training data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + end + + if isequal(save_preproc,'Yes'); + if SCRIPT_MODE +% save(PARAMETERS.train_prepro_file,'c_train'); + save(PARAMETERS.train_prepro_file,'c_train', '-v7.3'); % added by Lu Cheng, 08.06.2012 + else + [filename, pathname] = uiputfile('*.mat','Save pre-processed training data as'); + if (sum(filename)==0) || (sum(pathname)==0) + % do nothing + else +% save(strcat(pathname,filename,'.mat'),'c_train'); + save(strcat(pathname,filename,'.mat'),'c_train','-v7.3'); % added by Lu Cheng, 08.06.2012 + end + end + end; + + case format2 + disp(format2); + + if SCRIPT_MODE + trained_file = PARAMETERS.train_file_name; + else + [filename, pathname] = uigetfile('*.mat', strcat('Load training data in',' ',format2)); + if filename==0 + return; + end + trained_file = strcat(pathname,filename); + end + + clear c_train + load('-mat',trained_file); + + otherwise + return; +end + +%% process with test data + +if SCRIPT_MODE + if isequal(PARAMETERS.test_file_format,'.xls') + input_type = format1; + elseif isequal(PARAMETERS.test_file_format,'.mat') + input_type = format2; + end +else + input_type = questdlg('Specify the format of your test data: ',... + 'Specify Data Format', format1, format2, format1); +end + +switch input_type + case format1 + if SCRIPT_MODE + test_file = PARAMETERS.test_file_name; + else + [filename, pathname] = uigetfile('*.xls', 'Load test data (unlabeled) in MLST-format'); + if filename==0 + return; + end + test_file = strcat(pathname,filename); + end + c_test = preprocessXLS(test_file,c_train); + + if SCRIPT_MODE + save_preproc = PARAMETERS.save_prepro_test_data; + else + save_preproc = questdlg('Do you wish to save the pre-processed test data?',... + 'Save pre-processed data?','Yes','No','Yes'); + end + + if isequal(save_preproc,'Yes'); + if SCRIPT_MODE +% save(PARAMETERS.test_prepro_file,'c_test'); + save(PARAMETERS.test_prepro_file,'c_test','-v7.3'); % added by Lu Cheng, 08.06.2012 + else + [filename, pathname] = uiputfile('*.mat','Save pre-processed test data as'); + if (sum(filename)==0) || (sum(pathname)==0) + % do nothing + else +% save(strcat(pathname,filename,'.mat'),'c_test'); + save(strcat(pathname,filename,'.mat'),'c_test','-v7.3'); % added by Lu Cheng, 08.06.2012 + end + end + end; + + case format2 + if SCRIPT_MODE + test_file = PARAMETERS.test_file_name; + else + [filename, pathname] = uigetfile('*.mat', cat(2,'Load test data (unlabeled) in ',format2)); + if filename==0 + return; + end + test_file = strcat(pathname,filename); + end + load('-mat',test_file,'c_test'); + + otherwise + return; +end + +%% compare the preprocessed training and test data and further steps + +semi_linkageMixture_speed(c_train, c_test); + +%-------------------------------------------------------------------------- +function processGenePop +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; +global ADJPRIOR; +global PRIORTERM; +global SUMPRIOR; +global LOGDIFF; +clearGlobalVars; + +[filename, pathname] = uigetfile('*.txt', 'Load prior data in GenePop-format'); +if filename==0 + return; +end +kunnossa = testaaGenePopData([pathname filename]); +if kunnossa==0 + return +end + +waitALittle; +[filename2, pathname2] = uigetfile('*.txt', 'Load sampling units in GenePop-format'); +if filename2==0 + return; +end +kunnossa = testaaGenePopData([pathname2 filename2]); +if kunnossa==0 + return +end +clear kunnossa; + +[pData, pNames, pIndNames]=lueGenePopDataPop([pathname filename]); +[suData, suNames, suIndNames] = lueGenePopDataPop([pathname2 filename2]); +if size(pData,2) ~= size(suData,2) + disp('Incorrect input'); + return +end +inp = [filename ' & ' filename2]; +h0 = findobj('Tag','filename1_text'); +set(h0,'String',inp); +clear h0; clear inp; +clear filename; clear filename2; clear pathname; clear pathname2; + +[alleleCodes, noalle, suData, pData] = examineAlleles(suData, pData); + +rows = initializeRows(suData); % Samplin unit:ien rivit kertova muuttuja. +rowsFromInd = 2; %Tiedet��n GenePop:in tapauksessa. +data = suData(:,1:end-1); %Klusteroitavat "yksil�t" +priorLastCol = pData(:,end); +priorPartition = priorLastCol(1:rowsFromInd:end); % Prioriyksil�iden partitio +clear suData; clear priorLastCol; %Ei tarvita. Kai...? + +npopstext = []; +ready = false; +teksti = 'Input upper bound to the number of populations (possibly multiple values): '; +while ready == false + npopstextExtra = inputdlg(teksti ,... + 'Input maximum number of populations',1,{'20'}); + if isempty(npopstextExtra) % Painettu Cancel:ia + return + end + npopstextExtra = npopstextExtra{1}; + if length(npopstextExtra)>=255 + npopstextExtra = npopstextExtra(1:255); + npopstext = [npopstext ' ' npopstextExtra]; + teksti = 'The input field length limit (255 characters) was reached. Input more values: '; + else + npopstext = [npopstext ' ' npopstextExtra]; + ready = true; + end +end +clear ready; clear teksti; +if isempty(npopstext) | length(npopstext)==1 + return +else + npopsTaulu = str2num(npopstext); + clear npopstext; + if length(npopsTaulu)<1 + disp('Incorrect input'); + return + end + if any(npopsTaulu < size(pNames,1)) + disp('Incorrect input'); + return + end +end + +nruns = length(npopsTaulu); + +logmlBest = -1e50; +partitionSummary = -1e50*ones(30,2); % Tiedot 30 parhaasta partitiosta (npops ja logml) +partitionSummary(:,1) = zeros(30,1); +worstLogml = -1e50; worstIndex = 1; +Z = []; +for run = 1:nruns + npops = npopsTaulu(run); + dispLine; + disp(['Run ' num2str(run) '/' num2str(nruns) ... + ', maximum number of populations ' num2str(npops) '.']); + + disp(['Simulation started with ' num2str(npops) ' initial populations.']); + adjprior = computePriors(pData, npops, noalle); %adjprior on yhden populaation, jossa ei havaintoja. + COUNTS = zeros(size(ADJPRIOR)); + SUMCOUNTS = zeros(size(SUMPRIOR)); + POP_LOGML = zeros(npops,1); + + POP_LOGML = computePopulationLogml(1:npops); + logml = initialPopCounts(data, npops, rows, noalle); %Alustetaan COUNTS, PARTITION ... + + if isempty(Z) % Lasketaan vain ensimm�isellï¿?kierroksella. + if size(rows,1)==1 + Z = []; + dist = []; + else + [Z,dist] = getPopDistancesByKL(data, rows, noalle, adjprior); %Lasketaan sampling unit:ien v�liset et�isyydet. + end + end + + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) [worstLogml, worstIndex] = min(partitionSummary(:,2)); end + end + + % PARHAAN MIXTURE-PARTITION ETSIMINEN + nRoundTypes = 7; + kokeiltu = zeros(nRoundTypes, 1); + + roundTypes = [1 1]; %Ykk�svaiheen sykli kahteen kertaan. + ready = 0; vaihe = 1; + ninds = length(PARTITION); % num of sampling units + LOGDIFF = repmat(-Inf,ninds,npops); + + disp(' '); + while ready ~= 1 + muutoksia = 0; + + disp(['Performing steps: ' num2str(roundTypes)]); + + for n = 1:length(roundTypes) + + round = roundTypes(n); + kivaluku=0; + + if kokeiltu(round) == 1 + + elseif round==0 | round==1 %Yksil�n siirt�minen toiseen populaatioon. + inds = 1:ninds; + aputaulu = [inds' rand(ninds,1)]; + aputaulu = sortrows(aputaulu,2); + inds = aputaulu(:,1)'; + + muutosNyt = 0; + for ind = inds + i1 = PARTITION(ind); + [muutokset, diffInCounts] = laskeMuutokset(ind, rows, ... + data); + + if round==1, [maxMuutos, i2] = max(muutokset); + end + + if (i1~=i2 & maxMuutos>1e-5) + % Tapahtui muutos + if muutosNyt == 0 + disp('Action 1'); + muutosNyt = 1; + kokeiltu = zeros(nRoundTypes,1); + end + muutoksia = 1; + kivaluku = kivaluku+1; + updateGlobalVariables(ind, i2, diffInCounts); + logml = logml+maxMuutos; + + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) [worstLogml, worstIndex] = min(partitionSummary(:,2)); end + end + end + end + if muutosNyt == 0 + kokeiltu(round) = 1; + end + + elseif round==2 & ~isempty(dist) %Populaation yhdist�minen toiseen. + maxMuutos = 0; + for pop = 1:npops + [muutokset, diffInCounts] = laskeMuutokset2(pop, rows, ... + data); + [isoin, indeksi] = max(muutokset); + if isoin>maxMuutos + maxMuutos = isoin; + i1 = pop; + i2 = indeksi; + diffInCountsBest = diffInCounts; + end + end + + if maxMuutos>1e-5 + muutoksia = 1; + disp('Action 2'); + kokeiltu = zeros(nRoundTypes,1); + updateGlobalVariables2(i1,i2, diffInCountsBest); + logml = logml + maxMuutos; + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) [worstLogml, worstIndex] = min(partitionSummary(:,2)); end + end + else + kokeiltu(round) = 1; + end + + + elseif (round==3 | round==4) & ~isempty(dist)%Populaation jakaminen osiin. + maxMuutos = 0; + ninds = size(rows,1); + for pop = 1:npops + inds2 = find(PARTITION==pop); + ninds2 = length(inds2); + if ninds2>2 + dist2 = laskeOsaDist(inds2, dist, ninds); + Z2 = linkage(dist2'); + if round==3 + npops2 = min(20, floor(ninds2 / 5)); + elseif round==4 + npops2 = 2; %Moneenko osaan jaetaan + end + T2 = cluster_own(Z2, npops2); + muutokset = laskeMuutokset3(T2, inds2, rows, data, pop); + [isoin, indeksi] = max(muutokset(1:end)); + if isoin>maxMuutos + maxMuutos = isoin; + muuttuvaPop2 = rem(indeksi,npops2); + if muuttuvaPop2==0, muuttuvaPop2 = npops2; end + muuttuvat = inds2(find(T2==muuttuvaPop2)); + i2 = ceil(indeksi/npops2); + end + end + end + if maxMuutos>1e-5 + muutoksia = 1; + disp(['Action ' num2str(round)]); + kokeiltu = zeros(nRoundTypes,1); + %rows = computeRows(rowsFromInd, muuttuvat, length(muuttuvat)); + rivit = []; + for ind = muuttuvat + lisa = rows(ind,1):rows(ind,2); + rivit = [rivit; lisa']; + %rivit = [rivit; rows(ind)']; + end + diffInCounts = computeDiffInCounts(rivit', size(COUNTS,1), ... + size(COUNTS,2), data); + i1 = PARTITION(muuttuvat(1)); + updateGlobalVariables3(muuttuvat, diffInCounts, i2); + logml = logml + maxMuutos; + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) [worstLogml, worstIndex] = min(partitionSummary(:,2)); end + end + else + kokeiltu(round)=1; + end + + + elseif round == 5 & ~isempty(dist) + % K�y l�pi populaatioita. + % Yritï¿?poistaa niistï¿?yksil�itï¿?yksi + % kerrallaan. Lopeta heti, kun jonkin + % yksil�iden joukon poistaminen jostain + % populaatiosta aiheuttaa positiivisen + % muutoksen logml:��n. + + pop=0; + muutettu = 0; + poplogml = POP_LOGML; partition = PARTITION; + counts = COUNTS; sumcounts = SUMCOUNTS; + logdiff = LOGDIFF; + + while (pop < npops & muutettu == 0) + pop = pop+1; + totalMuutos = 0; + inds = find(PARTITION==pop)'; + inds = returnInOrder(inds, pop, rows, data); + + i=0; + while (length(inds)>0 & i 0 + i = i+1; + ind = inds(i); + [muutokset, diffInCounts] = laskeMuutokset(ind, rows, data); + muutokset(pop) = -1e50; % Varmasti ei suurin!!! + [maxMuutos, i2] = max(muutokset); + updateGlobalVariables(ind, i2, diffInCounts); + totalMuutos = totalMuutos+maxMuutos; + logml = logml+maxMuutos; + end + + if totalMuutos>1e-5 + disp('action 5'); + muutettu=1; + kokeiltu = zeros(nRoundTypes,1); + muutoksia = 1; % Ulompi kirjanpito. + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) [worstLogml, worstIndex] = min(partitionSummary(:,2)); end + end + else + % Miss��n vaiheessa tila ei parantunut. + % Perutaan kaikki muutokset. + PARTITION = partition; + SUMCOUNTS = sumcounts; + POP_LOGML = poplogml; + COUNTS = counts; + LOGDIFF = logdiff; + logml = logml - totalMuutos; + kokeiltu(round)=1; + end + end + clear partition; clear sumcounts; clear counts; clear poplogml; + end + end + + if muutoksia == 0 + if vaihe==1 + vaihe = 2; + elseif vaihe==2 + vaihe = 3; + elseif vaihe==3 + ready = 1; + end + else + muutoksia = 0; + end + + if ready==0 + if vaihe==1 + roundTypes=[1]; + elseif vaihe==2 + roundTypes=[2 1]; + elseif vaihe==3 + roundTypes=[5 4 3 1 2]; + end + end + end + + + % TALLENNETAAN + + prioriPopLkm = size(pNames,1); + npops = poistaTyhjatPopulaatiot(prioriPopLkm); + POP_LOGML = computePopulationLogml(1:npops); + + n_clust_with_su = length(unique(PARTITION)); + disp(['Found partition with sampling units in ' num2str(n_clust_with_su) ' clusters.']); + disp(['Log(ml) = ' num2str(logml)]); + disp(' '); + + if logml>logmlBest + % P�ivitet��n parasta l�ydettyï¿?partitiota. + logmlBest = logml; + npopsBest = npops; + partitionBest = PARTITION; + countsBest = COUNTS; + sumCountsBest = SUMCOUNTS; + pop_logmlBest = POP_LOGML; + adjPriorBest = ADJPRIOR; + priorTermBest = PRIORTERM; + sumPriorBest = SUMPRIOR; + logdiffbest = LOGDIFF; + end +end + +logml = logmlBest; +npops = npopsBest; +PARTITION = partitionBest; +COUNTS = countsBest; +SUMCOUNTS = sumCountsBest; +POP_LOGML = pop_logmlBest; +ADJPRIOR = adjPriorBest; +PRIORTERM = priorTermBest; +SUMPRIOR = sumPriorBest; +LOGDIFF = logdiffbest; + +h0 = findobj('Tag','filename1_text'); inp = get(h0,'String'); +h0 = findobj('Tag','filename2_text'); outp = get(h0,'String'); +writeTrainedMixtureInfo(logml, rows, data, outp, inp, ... + suIndNames, suNames, pIndNames, pNames, partitionSummary); + +fiksaaPartitioYksiloTasolle(rows, rowsFromInd); +[data, popnames] = muokkaaMuuttujat(adjprior, rowsFromInd, ... + pNames, suNames, priorPartition, pData, data); +viewMixPartition(PARTITION, popnames); + +talle = questdlg(['Do you want to save the mixture populations ' ... + 'so that you can use them later in admixture analysis?'], ... + 'Save results?','Yes','No','Yes'); +if isequal(talle,'Yes') + waitALittle; + [filename, pathname] = uiputfile('*.mat','Save results as'); + + if (filename == 0) & (pathname == 0) + % Cancel was pressed + return + else % copy 'baps4_output.baps' into the text file with the same name. + if exist('baps4_output.baps','file') + copyfile('baps4_output.baps',[pathname filename '.txt']) + delete('baps4_output.baps') + end + end + + c.PARTITION = PARTITION; c.COUNTS = COUNTS; c.SUMCOUNTS = SUMCOUNTS; + c.alleleCodes = alleleCodes; c.adjprior = adjprior; + c.rowsFromInd = rowsFromInd; c.popnames = popnames; + c.data = data; c.npops = npops; c.noalle = noalle; + c.mixtureType = 'trained'; +% save([pathname filename], 'c'); + save([pathname filename], 'c', '-v7.3'); % added by Lu Cheng, 08.06.2012 +else + if exist('baps4_output.baps','file') + delete('baps4_output.baps') + end +end + +%-------------------------------------------------------------------------- + + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex) +% Tiedet��n, ettï¿?annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ssï¿?ei vielï¿?ole +% annettua logml arvoa, niin lis�t��n worstIndex:in kohtaan uusi logml ja +% nykyistï¿?partitiota vastaava nclusters:in arvo. Muutoin ei tehdï¿?mit��n. + +apu = find(abs(partitionSummary(:,2)-logml)<1e-5); +if isempty(apu) + % Nyt l�ydetty partitio ei ole vielï¿?kirjattuna summaryyn. + global PARTITION; + npops = length(unique(PARTITION)); + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end + + +%-------------------------------------------------------------------------- + + + +function [data, popnames] = muokkaaMuuttujat(adjprior, rowsFromInd, ... + pNames, suNames, priorPartition, pData, data) +% Muokkaa kaikki tarvittavat muuttujat mixture result-file +% muotoisiksi. + +global PARTITION; global COUNTS; +global SUMCOUNTS; global ADJPRIOR; +nloci = size(data,2); +npops = size(COUNTS, 3); + +data = [pData(:,1:nloci) ; data]; +PARTITION = [priorPartition; PARTITION]; +priorCounts = ADJPRIOR-repmat(adjprior, [1 1 npops]); +COUNTS = COUNTS+priorCounts; +SUMCOUNTS = (squeeze(sum(COUNTS)))'; + +priorNinds = length(priorPartition); +for k = 1:size(suNames,1) + suNames{k,2} = suNames{k,2} + priorNinds; +end +popnames = [pNames; suNames]; + + +%------------------------------------------------------------------------- + + +function [alleleCodes, noalle, suData, pData] = examineAlleles(suData, pData) +% Poistetaan nollat molemmista datoista. Selvitet��n noalle ja +% alleleCodes ja muutetaan molemmat datat vastaamaan alleleCodes:ia. +% T�ssï¿?vaiheessa datojen viimeinen sarake kertoo yksik�n, jolle +% rivi kuuluu. +data = [pData; suData]; +nrows_prior = size(pData,1); +nloci = size(suData,2)-1; + +dataApu = data(:,1:nloci); %poistetaan nollat +nollat = find(dataApu==0); +if ~isempty(nollat) + isoinAlleeli = max(max(dataApu)); + dataApu(nollat) = isoinAlleeli+1; + data(:,1:nloci) = dataApu; +end +dataApu = []; nollat = []; isoinAlleeli = []; + +noalle=zeros(1,nloci); %selvitet��n noalle +alleelitLokuksessa = cell(nloci,1); +for i=1:nloci + alleelitLokuksessaI = unique(data(:,i)); + alleelitLokuksessa{i,1} = alleelitLokuksessaI(find(alleelitLokuksessaI>=0)); + noalle(i) = length(alleelitLokuksessa{i,1}); +end + +alleleCodes = zeros(max(noalle),nloci); %selvitet��n alleleCodes +for i=1:nloci + alleelitLokuksessaI = alleelitLokuksessa{i,1}; + puuttuvia = max(noalle)-length(alleelitLokuksessaI); + alleleCodes(:,i) = [alleelitLokuksessaI; zeros(puuttuvia,1)]; +end + +for loc = 1:nloci %muutetaan alleelien koodit vastaamaan alleleCodes:ia + for all = 1:noalle(loc) + data(find(data(:,loc)==alleleCodes(all,loc)), loc)=all; + end; +end; +pData = data(1:nrows_prior , :); +suData = data(nrows_prior+1:end , :); + + +%---------------------------------------------------------------------- + +function adjprior = computePriors(pData, npops, noalle) +global ADJPRIOR; +global SUMPRIOR; +global PRIORTERM; +nloci = size(pData,2)-1; +max_noalle = max(noalle); +ADJPRIOR = zeros(max_noalle, nloci, npops); +PRIORTERM = zeros(npops, 1); +SUMPRIOR = zeros(npops, nloci); + +adjprior = zeros(max_noalle,nloci); +for j=1:nloci + adjprior(:,j) = [repmat(1/noalle(j), [noalle(j),1]) ; ones(max(noalle)-noalle(j),1)]; +end + +data = pData(:, 1:nloci); +for i = 1:npops + rivit = find(pData(:,end) == i)'; %Pit�� olla vaakavektori. + if ~isempty(rivit) + diffInCounts = computeDiffInCounts(rivit, max_noalle, nloci, data); + ADJPRIOR(:,:,i) = diffInCounts; + end + ADJPRIOR(:,:,i) = ADJPRIOR(:,:,i) + adjprior; + for j=1:nloci + SUMPRIOR(i,j) = sum(squeeze(ADJPRIOR(1:noalle(j), j , i))); + PRIORTERM(i) = PRIORTERM(i)+gammaln(SUMPRIOR(i,j)); + PRIORTERM(i) = PRIORTERM(i)-sum(gammaln(squeeze(ADJPRIOR(1:noalle(j),j,i)))); + end +end + + +%-------------------------------------------------------------- + +function rows = initializeRows(data) +% Lasketaan rows-muuttuja. T�ssï¿?vaiheessa datan +% viimeisessï¿?sarakkeessa on vielï¿?yksik�n kertova +% indeksi. +nind = max(data(:,end)); +rows = zeros(nind,2); +for i=1:nind + rivit = find(data(:,end)==i)'; + rows(i,1) = min(rivit); + rows(i,2) = max(rivit); +end + + +%---------------------------------------------------------------- + + +function clearGlobalVars + +global COUNTS; COUNTS = []; +global SUMCOUNTS; SUMCOUNTS = []; +global PARTITION; PARTITION = []; +global POP_LOGML; POP_LOGML = []; +global ADJPRIOR; ADJPRIOR = []; +global PRIORTERM; PRIORTERM = []; +global SUMPRIOR; SUMPRIOR = []; +global LOGDIFF; LOGDIFF = []; + + +%-------------------------------------------------------------------- + +function [Z,distances] = getPopDistancesByKL(data, rows, noalle, adjprior) +% Laskee populaatioille et�isyydet +% k�ytt�en KL-divergenssiï¿? + +npops = size(rows,1); %Samplin unit:tien lkm +nloci=size(data,2); +maxnoalle = max(noalle); +counts = zeros(maxnoalle,nloci,npops); % Tilap�istï¿?k�ytt�� varten +sumcounts = zeros(npops,nloci); + +for i=1:npops + for j=1:nloci + i_rivit = rows(i,1):rows(i,2); + havainnotLokuksessa = find(data(i_rivit,j)>=0); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(i_rivit,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + +distances = zeros(nchoosek(npops,2),1); + +d = zeros(maxnoalle, nloci, npops); +prior = adjprior; +prior(find(prior==1))=0; +nollia = find(all(prior==0)); %Lokukset, joissa oli havaittu vain yhtï¿?alleelia. +prior(1,nollia)=1; + +for pop1 = 1:npops + d(:,:,pop1) = (squeeze(counts(:,:,pop1))+prior) ./ repmat(sum(squeeze(counts(:,:,pop1))+prior),maxnoalle,1); + %dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1); +end +pointer = 1; +for pop1 = 1:npops-1 + for pop2 = pop1+1:npops + dist1 = d(:,:,pop1); dist2 = d(:,:,pop2); + div12 = sum(sum(dist1.*log2((dist1+10^-10) ./ (dist2+10^-10))))/nloci; + div21 = sum(sum(dist2.*log2((dist2+10^-10) ./ (dist1+10^-10))))/nloci; + div = (div12+div21)/2; + distances(pointer) = div; + pointer = pointer+1; + end +end +Z=linkage(distances'); + +%-------------------------------------------------------------------------- + + +function Z = linkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 | m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end + + +%----------------------------------------------------------------------- + + +function logml = initialPopCounts(data, npops, rows, noalle) + +global COUNTS; +global SUMCOUNTS; +global PARTITION; +global POP_LOGML; +global ADJPRIOR; +global SUMPRIOR; +nloci=size(data,2); +ninds = size(rows,1); +COUNTS = zeros(max(noalle),nloci,npops); +SUMCOUNTS = zeros(npops,nloci); +PARTITION = zeros(1,ninds); + +inds = 1:ninds; +aputaulu = [inds' rand(ninds,1)]; +aputaulu = sortrows(aputaulu,2); +inds = aputaulu(:,1)'; +%omaPartitio = 1:6; %POIS!!!!!!!! +%omaPartitio = omaPartitio'; +%omaPartitio = omaPartitio(:,ones(30,1)); +%omaPartitio = omaPartitio'; +%omaPartitio = omaPartitio(:); %POIS +%keyboard; +for ind = inds % Sijoitetaan yksil�t yksi kerrallaan. + [muutokset, diffInCounts] = ... + laskePrioriMuutokset(ind, rows, data); + [maxMuutos, i2] = max(muutokset); + %i2 = omaPartitio(ind) %POIS + PARTITION(ind) = i2; + COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; + SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + POP_LOGML(i2) = computePopulationLogml(i2); + +end + +logml = laskeLoggis(COUNTS, SUMCOUNTS, ADJPRIOR, SUMPRIOR); + +%keyboard; + +%----------------------------------------------------------------------- + + +function loggis = laskeLoggis(counts, sumcounts, adjprior, sumprior) +npops = size(counts,3); + +logml2 = sum(sum(sum(gammaln(counts+adjprior)))) ... + - sum(sum(sum(gammaln(adjprior)))) ... + - sum(sum(gammaln(sumcounts+sumprior))) ... + + sum(sum(gammaln(sumprior))); +loggis = logml2; + + +%-------------------------------------------------------------------- + + +function kunnossa = testaaGenePopData(tiedostonNimi) +% kunnossa == 0, jos data ei ole kelvollinen genePop data. +% Muussa tapauksessa kunnossa == 1. + +kunnossa = 0; +fid = fopen(tiedostonNimi); +line1 = fgetl(fid); %ensimm�inen rivi +line2 = fgetl(fid); %toinen rivi +line3 = fgetl(fid); %kolmas + +if (isequal(line1,-1) | isequal(line2,-1) | isequal(line3,-1)) + disp('Incorrect file format'); fclose(fid); + return +end +if (testaaPop(line1)==1 | testaaPop(line2)==1) + disp('Incorrect file format'); fclose(fid); + return +end +if testaaPop(line3)==1 + %2 rivi t�ll�in lokusrivi + nloci = rivinSisaltamienMjonojenLkm(line2); + line4 = fgetl(fid); + if isequal(line4,-1) + disp('Incorrect file format'); fclose(fid); + return + end + if ~any(line4==',') + % Rivin neljï¿?t�ytyy sis�lt�� pilkku. + disp('Incorrect file format'); fclose(fid); + return + end + pointer = 1; + while ~isequal(line4(pointer),',') %Tiedet��n, ettï¿?pys�htyy + pointer = pointer+1; + end + line4 = line4(pointer+1:end); %pilkun j�lkeinen osa + nloci2 = rivinSisaltamienMjonojenLkm(line4); + if (nloci2~=nloci) + disp('Incorrect file format'); fclose(fid); + return + end +else + line = fgetl(fid); + lineNumb = 4; + while (testaaPop(line)~=1 & ~isequal(line,-1)) + line = fgetl(fid); + lineNumb = lineNumb+1; + end + if isequal(line,-1) + disp('Incorrect file format'); fclose(fid); + return + end + nloci = lineNumb-2; + line4 = fgetl(fid); %Eka rivi pop sanan j�lkeen + if isequal(line4,-1) + disp('Incorrect file format'); fclose(fid); + return + end + if ~any(line4==',') + % Rivin t�ytyy sis�lt�� pilkku. + disp('Incorrect file format'); fclose(fid); + return + end + pointer = 1; + while ~isequal(line4(pointer),',') %Tiedet��n, ettï¿?pys�htyy. + pointer = pointer+1; + end + + line4 = line4(pointer+1:end); %pilkun j�lkeinen osa + nloci2 = rivinSisaltamienMjonojenLkm(line4); + if (nloci2~=nloci) + disp('Incorrect file format'); fclose(fid); + return + end +end +kunnossa = 1; +fclose(fid); + +%-------------------------------------------------------------------- + + +function [data, popnames, indnames] = lueGenePopDataPop(tiedostonNimi) +% Data annetaan muodossa, jossa viimeinen sarake kertoo ryhm�n. +% popnames on kuten ennenkin. + +fid = fopen(tiedostonNimi); +line = fgetl(fid); %ensimm�inen rivi +line = fgetl(fid); %toinen rivi +count = rivinSisaltamienMjonojenLkm(line); + +line = fgetl(fid); +lokusRiveja = 1; +while (testaaPop(line)==0) + lokusRiveja = lokusRiveja+1; + line = fgetl(fid); +end + +if lokusRiveja>1 + nloci = lokusRiveja; +else + nloci = count; +end + +popnames = cell(10,2); +indnames = cell(100,1); +data = zeros(100, nloci+1); +nimienLkm=0; +ninds=0; +poimiNimi=1; +digitFormat = -1; +while line ~= -1 + line = fgetl(fid); + + if poimiNimi==1 + %Edellinen rivi oli 'pop' + nimienLkm = nimienLkm+1; + ninds = ninds+1; + if nimienLkm>size(popnames,1); + popnames = [popnames; cell(10,2)]; + end + nimi = lueNimi(line); + if digitFormat == -1 + digitFormat = selvitaDigitFormat(line); + divider = 10^digitFormat; + end + popnames{nimienLkm, 1} = {nimi}; %N�in se on greedyMix:iss�kin?!? + popnames{nimienLkm, 2} = ninds; + poimiNimi=0; + + data = addAlleles(data, ninds, line, divider); + + if ninds>size(indnames,1) + indnames = [indnames; cell(100,1)]; + end + indnames{ninds} = {nimi}; + + elseif testaaPop(line) + poimiNimi = 1; + + elseif line ~= -1 + ninds = ninds+1; + nimi = lueNimi(line); + data = addAlleles(data, ninds, line, divider); + + if ninds>size(indnames,1) + indnames = [indnames; cell(100,1)]; + end + indnames{ninds} = {nimi}; + end +end + +indnames = indnames(1:ninds); +fclose(fid); +data = data(1:ninds*2,:); +popnames = popnames(1:nimienLkm,:); +npops = size(popnames,1); +ind = 1; +for pop = 1:npops + if pop=0); + + if length(notEmpty)>0 + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) = ... + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) + 1; + end +end + +%------------------------------------------------------------------------ + + +%------------------------------------------------------------------------------------- + + +function updateGlobalVariables(ind, i2, diffInCounts) +% Suorittaa globaalien muuttujien muutokset, kun yksilï¿?ind +% on siirret��n koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; + +i1 = PARTITION(ind); +PARTITION(ind)=i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2]); + + +%-------------------------------------------------------------------------- +%-- + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = laskeMuutokset2( ... + i1, globalRows, data); +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mikï¿?olisi +% muutos logml:ssï¿? mik�li korin i1 kaikki yksil�t siirret��n +% koriin i. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +npops = size(COUNTS,3); +muutokset = zeros(npops,1); + +i1_logml = POP_LOGML(i1); + +inds = find(PARTITION==i1); +ninds = length(inds); + +if ninds==0 + diffInCounts = zeros(size(COUNTS,1), size(COUNTS,2)); + return; +end + +rows = []; +for ind = inds + lisa = globalRows(ind,1):globalRows(ind,2); + rows = [rows; lisa']; + %rows = [rows; globalRows{ind}']; +end + +diffInCounts = computeDiffInCounts(rows', size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +new_i1_logml = computePopulationLogml(i1); +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = POP_LOGML(i2); + +COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[npops-1 1]); +new_i2_logml = computePopulationLogml(i2); +COUNTS(:,:,i2) = COUNTS(:,:,i2)-repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)-repmat(diffInSumCounts,[npops-1 1]); + +muutokset(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + + +%--------------------------------------------------------------------------------- + + +function updateGlobalVariables2( ... + i1, i2, diffInCounts); +% Suorittaa globaalien muuttujien muutokset, kun kaikki +% korissa i1 olevat yksil�t siirret��n koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; + +inds = find(PARTITION==i1); +PARTITION(inds) = i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML(i1) = 0; +POP_LOGML(i2) = computePopulationLogml(i2); + + +%-------------------------------------------------------------------------- +%---- + +function muutokset = laskeMuutokset3(T2, inds2, globalRows, ... + data, i1) +% Palauttaa length(unique(T2))*npops taulun, jossa (i,j):s alkio +% kertoo, mikï¿?olisi muutos logml:ssï¿? jos populaation i1 osapopulaatio +% inds2(find(T2==i)) siirret��n koriin j. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +npops = size(COUNTS,3); +npops2 = length(unique(T2)); +muutokset = zeros(npops2, npops); + +i1_logml = POP_LOGML(i1); + +for pop2 = 1:npops2 + inds = inds2(find(T2==pop2)); + ninds = length(inds); + if ninds>0 + rows = []; + for ind = inds + lisa = globalRows(ind,1):globalRows(ind,2); + rows = [rows; lisa']; + %rows = [rows; globalRows{ind}']; + end + diffInCounts = computeDiffInCounts(rows', size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; + new_i1_logml = computePopulationLogml(i1); + COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + + i2 = [1:i1-1 , i1+1:npops]; + i2_logml = POP_LOGML(i2)'; + + COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 npops-1]); + SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[npops-1 1]); + new_i2_logml = computePopulationLogml(i2)'; + COUNTS(:,:,i2) = COUNTS(:,:,i2)-repmat(diffInCounts, [1 1 npops-1]); + SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)-repmat(diffInSumCounts,[npops-1 1]); + + muutokset(pop2,i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + end +end + + + +%------------------------------------------------------------------------------------ + + +function updateGlobalVariables3(muuttuvat, diffInCounts, i2); +% Suorittaa globaalien muuttujien p�ivitykset, kun yksil�t 'muuttuvat' +% siirret��n koriin i2. Ennen siirtoa yksil�iden on kuuluttava samaan +% koriin. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; + +i1 = PARTITION(muuttuvat(1)); +PARTITION(muuttuvat) = i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2]); + + +%---------------------------------------------------------------------------- + + +function dist2 = laskeOsaDist(inds2, dist, ninds) +% Muodostaa dist vektorista osavektorin, joka sis�lt�� yksil�iden inds2 +% v�liset et�isyydet. ninds=kaikkien yksil�iden lukum��rï¿? + +ninds2 = length(inds2); +apu = zeros(nchoosek(ninds2,2),2); +rivi = 1; +for i=1:ninds2-1 + for j=i+1:ninds2 + apu(rivi, 1) = inds2(i); + apu(rivi, 2) = inds2(j); + rivi = rivi+1; + end +end +apu = (apu(:,1)-1).*ninds - apu(:,1) ./ 2 .* (apu(:,1)-1) + (apu(:,2)-apu(:,1)); +dist2 = dist(apu); + +%-------------------------------------------------------------------------- + + +function T = cluster_own(Z,nclust) +true=logical(1); +false=logical(0); +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); + % maximum number of clusters based on inconsistency + if m <= maxclust + T = (1:m)'; + elseif maxclust==1 + T = ones(m,1); + else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end + end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + + +%----------------------------------------------------------------------------------- + + +function npops = poistaTyhjatPopulaatiot(prioriPopLkm) +% Poistaa tyhjentyneet populaatiot COUNTS:ista ja +% SUMCOUNTS:ista, ADJPRIOR:ista ja SUMPRIOR:ista. +% P�ivitt�� npops:in ja PARTITION:in. + +global COUNTS; +global SUMCOUNTS; +global PARTITION; +global ADJPRIOR; +global SUMPRIOR; +global LOGDIFF; + +notEmpty = union(find(any(SUMCOUNTS,2)) , 1:prioriPopLkm); +COUNTS = COUNTS(:,:,notEmpty); +SUMCOUNTS = SUMCOUNTS(notEmpty,:); + +ADJPRIOR = ADJPRIOR(:,:,notEmpty); +SUMPRIOR = SUMPRIOR(notEmpty,:); +LOGDIFF = LOGDIFF(:,notEmpty); + +for n=1:length(notEmpty) + apu = find(PARTITION==notEmpty(n)); + PARTITION(apu)=n; +end +npops = length(notEmpty); + + +%----------------------------------------------------------------------------------- + + +function popnames = initPopNames(nameFile) + +fid = fopen(nameFile); +if fid == -1 + %File didn't exist + msgbox('Loading of the population names was unsuccessful', ... + 'Error', 'error'); + return; +end; +line = fgetl(fid); +counter = 1; +while (line ~= -1) & ~isempty(line) + names{counter} = line; + line = fgetl(fid); + counter = counter + 1; +end; +fclose(fid); + +popnames = cell(length(names), 2); +for i = 1:length(names) + popnames{i,1} = names(i); + popnames{i,2} = 0; +end + + +%------------------------------------------------------------------------- + + +function [popnames2, rowsFromInd] = findOutRowsFromInd(popnames, rows) + +ploidisuus = questdlg('Specify the type of individuals in the data: ',... + 'Individual type?', 'Haploid', 'Diploid', 'Tetraploid', ... + 'Diploid'); + +switch ploidisuus +case 'Haploid' + rowsFromInd = 1; +case 'Diploid' + rowsFromInd = 2; +case 'Tetraploid' + rowsFromInd = 4; +end + +if ~isempty(popnames) + for i = 1:size(rows,1) + popnames2{i,1} = popnames{i,1}; + rivi = rows(i,1):rows(i,2); + popnames2{i,2} = (rivi(rowsFromInd))/rowsFromInd; + end +else + popnames2 = []; +end + +%------------------------------------------------------------------ + +function fiksaaPartitioYksiloTasolle(rows, rowsFromInd) + +global PARTITION; +totalRows = 0; +for ind = 1:size(rows,1) + totalRows = totalRows + (rows(ind,2)-rows(ind,1)+1); +end +partitio2 = zeros(totalRows/rowsFromInd,1); + +for ind = 1:size(rows,1) + kaikkiRivit = rows(ind,1):rows(ind,2); + for riviNumero = rowsFromInd:rowsFromInd:length(kaikkiRivit) + %for riviNumero = rowsFromInd:rowsFromInd:length(rows{ind}) + %rivi = rows{ind}(riviNumero); + rivi = kaikkiRivit(riviNumero); + partitio2(rivi/rowsFromInd) = PARTITION(ind); + end +end +PARTITION = partitio2; + +%--------------------------------------------------------------- + + +%-------------------------------------------------------------------- + + +function newline = takeLine(description,width) +%Returns one line from the description: line ends to the first +%space after width:th mark. +newLine = description(1:width); +n = width+1; +while ~isspace(description(n)) & n0 + fid = fopen(outPutFile,'a'); +else + fid = -1; + diary('baps4_output.baps'); % save in text anyway. +end + +dispLine; +disp('RESULTS OF TRAINED MIXTURE ANALYSIS:'); +disp(['Data file: ' inputFile]); +disp(['Number of clustered groups: ' ownNum2Str(ninds)]); +disp(['Number of populations having prior information: ' ownNum2Str(size(pNames,1))]); +disp(['In the optimal partition the samling units were in ' ownNum2Str(n_clust_with_su) ' clusters.']); +disp(['Log(marginal likelihood) of the optimal partition: ' ownNum2Str(logml)]); +disp(' '); +if (fid ~= -1) + fprintf(fid,'%s \n', [' ']); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['RESULTS OF TRAINED MIXTURE ANALYSIS:']); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Data file: ' inputFile]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of clustered groups: ' ownNum2Str(ninds)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of populations having prior information: ' ownNum2Str(size(pNames,1))]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['In the optimal partition the sampling units were in ' ownNum2Str(n_clust_with_su) ' clusters.']); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Log(marginal likelihood) of the optimal partition: ' ownNum2Str(logml)]); fprintf(fid,'\n'); + fprintf(fid,'\n'); +end + +%cluster_count = length(unique(PARTITION)); +cluster_count = size(COUNTS,3); +disp(['Best Partition: ']); +if (fid ~= -1) + fprintf(fid,'%s \n',['Best Partition: ']); fprintf(fid,'\n'); +end +for m=1:cluster_count + susInM = find(PARTITION==m); %Sampling units in pop m. + text = ['Cluster ' num2str(m) ': {']; + length_of_beginning = 11 + floor(log10(m)); + + if m < size(pNames,1) + % populaatiolle on allokoitu prioriyksil�itï¿? + text = [text '[']; + k = pNames{m,2}; + text = [text pIndNames{k}{1}]; + for k = pNames{m,2}+1:pNames{m+1,2}-1 + text = [text ', ' pIndNames{k}{1}]; + end + text = [text '], ']; + elseif m == size(pNames,1) + text = [text '[']; + k = pNames{m,2}; + text = [text pIndNames{k}{1}]; + for k = pNames{m,2}+1:length(pIndNames) + text = [text ', ' pIndNames{k}{1}]; + end + text = [text '], ']; + end + + cluster_size = length(susInM); + + for k = 1:cluster_size % K�y l�pi m:��n kuuluvat samling unit:it + text = [text '[']; + su = susInM(k); % sampling unit su kuuluu populaatioon m. + ekaNimi = suNames{su,2}; + if su58 + %Take one line and display it. + new_line = takeLine(text,58); + text = text(length(new_line)+1:end); + disp(new_line); + if (fid ~= -1) + fprintf(fid,'%s \n',[new_line]); + fprintf(fid,'\n'); + end + if length(text)>0 + text = [blanks(length_of_beginning) text]; + else + text = []; + end; + end; + if ~isempty(text) + disp(text); + if (fid ~= -1) + fprintf(fid,'%s \n',[text]); + fprintf(fid,'\n'); + end + end; +end + +if npops > 1 + + disp(' '); + disp(' '); + disp('Changes in log(marginal likelihood) if sampling unit i is moved to cluster j:'); + if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['Changes in log(marginal likelihood) if sampling unit i is moved to cluster j:']); fprintf(fid, '\n'); + end + + ekarivi = 'group '; + for i = 1:cluster_count + ekarivi = [ekarivi ownNum2Str(i) blanks(8-floor(log10(i)))]; + end + disp(ekarivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [ekarivi]); fprintf(fid, '\n'); + end + + for ind = 1:ninds + [muutokset, diffInCounts] = laskeMuutokset(ind, rows, data); + rivi = [blanks(4-floor(log10(ind))) ownNum2Str(ind) ':']; + for j = 1:npops + rivi = [rivi ' ' logml2String(omaRound(muutokset(j)))]; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [rivi]); fprintf(fid, '\n'); + end + end + + disp(' '); disp(' '); + disp('KL-divergence matrix in PHYLIP format:'); + dist_mat = zeros(npops, npops); + + if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['KL-divergence matrix in PHYLIP format:']); %fprintf(fid, '\n'); + end + + maxnoalle = size(COUNTS,1); + nloci = size(COUNTS,2); + d = zeros(maxnoalle, nloci, npops); + for pop1 = 1:npops + prior = ADJPRIOR(:,:,pop1); + prior(find(prior==1))=0; + nollia = find(all(prior==0)); %Lokukset, joissa oli havaittu vain yhtï¿?alleelia. + prior(1,nollia)=1; + d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))+prior) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))+prior),maxnoalle,1); + %dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1); + end +% ekarivi = blanks(7); +% for pop = 1:npops +% ekarivi = [ekarivi num2str(pop) blanks(7-floor(log10(pop)))]; +% end + ekarivi = num2str(npops); + disp(ekarivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [ekarivi]); %fprintf(fid, '\n'); + end + + for pop1 = 1:npops + rivi = [blanks(2-floor(log10(pop1))) num2str(pop1) ' ']; + for pop2 = 1:pop1-1 + dist1 = d(:,:,pop1); dist2 = d(:,:,pop2); + div12 = sum(sum(dist1.*log2((dist1+10^-10) ./ (dist2+10^-10))))/nloci; + div21 = sum(sum(dist2.*log2((dist2+10^-10) ./ (dist1+10^-10))))/nloci; + div = (div12+div21)/2; + % rivi = [rivi kldiv2str(div) ' ']; + dist_mat(pop1,pop2) = div; + end +% disp(rivi); +% if (fid ~= -1) +% fprintf(fid, '%s \n', [rivi]); %fprintf(fid, '\n'); +% end + end + +end + +dist_mat = dist_mat + dist_mat'; % make it symmetric +for pop1 = 1:npops + rivi = ['Cluster_' num2str(pop1) ' ']; + for pop2 = 1:npops + rivi = [rivi kldiv2str(dist_mat(pop1,pop2)) ' ']; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [rivi]); %fprintf(fid, '\n'); + end +end + +disp(' '); +disp(' '); +disp('List of sizes of 10 best visited partitions and corresponding log(ml) values:'); + +if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['List of sizes of 10 best visited partitions and corresponding log(ml) values:']); fprintf(fid, '\n'); +end + +partitionSummary = sortrows(partitionSummary,2); +partitionSummary = partitionSummary(size(partitionSummary,1):-1:1 , :); +partitionSummary = partitionSummary(find(partitionSummary(:,2)>-1e49),:); +if size(partitionSummary,1)>10 + vikaPartitio = 10; +else + vikaPartitio = size(partitionSummary,1); +end + +for part = 1:vikaPartitio + line = [num2str(partitionSummary(part,1)) ' ' num2str(partitionSummary(part,2))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', [line]); fprintf(fid, '\n'); + end +end + +disp(' '); +disp(' '); +disp('Probabilities for number of clusters: (#clusters: prob)'); + +if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['Probabilities for number of clusters: (#clusters: prob)']); fprintf(fid, '\n'); +end + +npopsTaulu = unique(partitionSummary(:,1)); +len = length(npopsTaulu); +probs = zeros(len,1); +partitionSummary(:,2) = partitionSummary(:,2)-max(partitionSummary(:,2)); +sumtn = sum(exp(partitionSummary(:,2))); +for i=1:len + npopstn = sum(exp(partitionSummary(find(partitionSummary(:,1)==npopsTaulu(i)),2))); + probs(i) = npopstn / sumtn; +end +for i=1:len + if probs(i)>1e-5 + line = [num2str(npopsTaulu(i)) ': ' num2str(probs(i))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', [line]); fprintf(fid, '\n'); + end + end +end + +if (fid ~= -1) + fclose(fid); +else + diary off +end + + +%--------------------------------------------------------------- + + +function dispLine; +disp('---------------------------------------------------'); + +%-------------------------------------------------------------- + +function num2 = omaRound(num) +% Py�rist�� luvun num 1 desimaalin tarkkuuteen +num = num*10; +num = round(num); +num2 = num/10; + +%--------------------------------------------------------- + +function digit = palautaYks(num,yks) +% palauttaa luvun num 10^yks termin kertoimen +% string:inï¿? +% yks t�ytyy olla kokonaisluku, joka on +% v�hint��n -1:n suuruinen. Pienemmillï¿? +% luvuilla tapahtuu jokin py�ristysvirhe. + +if yks>=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); + + +function mjono = kldiv2str(div) +mjono = ' '; +if abs(div)<100 + %Ei tarvita e-muotoa + mjono(6) = num2str(rem(floor(div*1000),10)); + mjono(5) = num2str(rem(floor(div*100),10)); + mjono(4) = num2str(rem(floor(div*10),10)); + mjono(3) = '.'; + mjono(2) = num2str(rem(floor(div),10)); + arvo = rem(floor(div/10),10); + if arvo>0 + mjono(1) = num2str(arvo); + end + +else + suurinYks = floor(log10(div)); + mjono(6) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(div),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(div),suurinYks); +end + + +%----------------------------------------------- + + +function ninds = testaaOnkoKunnollinenBapsData(data) +%Tarkastaa onko viimeisessï¿?sarakkeessa kaikki +%luvut 1,2,...,n johonkin n:��n asti. +%Tarkastaa lis�ksi, ettï¿?on v�hint��n 2 saraketta. +if size(data,1)<2 + ninds = 0; return; +end +lastCol = data(:,end); +ninds = max(lastCol); +if ~isequal((1:ninds)',unique(lastCol)) + ninds = 0; return; +end + + + +%-------------------------------------------------------------------------- +function [newData, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(raw_data) +% Alkuperäisen datan viimeinen sarake kertoo, milt?yksilölt? +% kyseinen rivi on peräisin. Funktio tutkii ensin, ett?montako +% rivi?maksimissaan on peräisin yhdelt?yksilölt? jolloin saadaan +% tietää onko kyseess?haploidi, diploidi jne... Tämän jälkeen funktio +% lisää tyhji?rivej?niille yksilöille, joilta on peräisin vähemmän +% rivej?kuin maksimimäär? +% Mikäli jonkin alleelin koodi on =0, funktio muuttaa tämän alleelin +% koodi pienimmäksi koodiksi, joka isompi kuin mikään käytöss?oleva koodi. +% Tämän jälkeen funktio muuttaa alleelikoodit siten, ett?yhden lokuksen j +% koodit saavat arvoja välill?1,...,noalle(j). + +% added by Lu Cheng, without modification, 16.02.2010 + +data = raw_data; +nloci=size(raw_data,2)-1; + +dataApu = data(:,1:nloci); +nollat = find(dataApu==0); +if ~isempty(nollat) + isoinAlleeli = max(max(dataApu)); + dataApu(nollat) = isoinAlleeli+1; + data(:,1:nloci) = dataApu; +end +% dataApu = []; +% nollat = []; +% isoinAlleeli = []; + +noalle=zeros(1,nloci); +alleelitLokuksessa = cell(nloci,1); +for i=1:nloci + alleelitLokuksessaI = unique(data(:,i)); + %alleelitLokuksessa{i,1} = alleelitLokuksessaI(find(alleelitLokuksessaI>=0)); + alleelitLokuksessa{i,1} = alleelitLokuksessaI(logical(alleelitLokuksessaI>=0)); + noalle(i) = length(alleelitLokuksessa{i,1}); +end +alleleCodes = zeros(max(noalle),nloci); +for i=1:nloci + alleelitLokuksessaI = alleelitLokuksessa{i,1}; + puuttuvia = max(noalle)-length(alleelitLokuksessaI); + alleleCodes(:,i) = [alleelitLokuksessaI; zeros(puuttuvia,1)]; +end + +for loc = 1:nloci + for all = 1:noalle(loc) + % data(find(data(:,loc)==alleleCodes(all,loc)), loc)=all; + data(logical(data(:,loc)==alleleCodes(all,loc)), loc)=all; + end; +end; + +nind = max(data(:,end)); +nrows = size(data,1); +ncols = size(data,2); +rowsFromInd = zeros(nind,1); +for i=1:nind + rowsFromInd(i) = length(find(data(:,end)==i)); +end +maxRowsFromInd = max(rowsFromInd); +a = -999; +emptyRow = repmat(a, 1, ncols); +lessThanMax = find(rowsFromInd < maxRowsFromInd); +missingRows = maxRowsFromInd*nind - nrows; +data = [data; zeros(missingRows, ncols)]; +pointer = 1; +for ind=lessThanMax' %Käy läpi ne yksilöt, joilta puuttuu rivej? + miss = maxRowsFromInd-rowsFromInd(ind); % Tält?yksilölt?puuttuvien lkm. + for j=1:miss + rowToBeAdded = emptyRow; + rowToBeAdded(end) = ind; + data(nrows+pointer, :) = rowToBeAdded; + pointer = pointer+1; + end +end +data = sortrows(data, ncols); % Sorttaa yksilöiden mukaisesti +newData = data; +rowsFromInd = maxRowsFromInd; + +adjprior = zeros(max(noalle),nloci); +priorTerm = 0; +for j=1:nloci + adjprior(:,j) = [repmat(1/noalle(j), [noalle(j),1]) ; ones(max(noalle)-noalle(j),1)]; + priorTerm = priorTerm + noalle(j)*gammaln(1/noalle(j)); +end + + +%------------------------------------------------------------------------- + +function [Z, dist] = newGetDistances(data, rowsFromInd) + +ninds = max(data(:,end)); +nloci = size(data,2)-1; +riviLkm = nchoosek(double(ninds),2); + +% empties = find(data<0); +% data(empties)=0; +data(logical(data<0)) = 0; +data = uint16(data); + +pariTaulu = zeros(riviLkm,2); +aPointer=1; + +for a=1:ninds-1 + pariTaulu(aPointer:aPointer+double(ninds-1-a),1) = ones(ninds-a,1,'uint16')*a; + pariTaulu(aPointer:aPointer+double(ninds-1-a),2) = uint16((a+1:ninds)'); + aPointer = aPointer+double(ninds-a); +end + +eka = pariTaulu(:,ones(1,rowsFromInd)); +eka = eka * rowsFromInd; +miinus = repmat(rowsFromInd-1 : -1 : 0, [riviLkm 1]); +eka = eka - miinus; + +toka = pariTaulu(:,ones(1,rowsFromInd)*2); +toka = toka * rowsFromInd; +toka = toka - miinus; + +eka = uint16(eka); +toka = uint16(toka); + +clear pariTaulu; clear miinus; + +summa = uint16(zeros(riviLkm,1)); +vertailuja = uint16(zeros(riviLkm,1)); + +x = zeros(size(eka)); x = uint16(x); +y = zeros(size(toka)); y = uint16(y); +% fprintf(1,'%%10'); +for j=1:nloci; + + for k=1:rowsFromInd + x(:,k) = data(eka(:,k),j); + y(:,k) = data(toka(:,k),j); + end + + for a=1:rowsFromInd + for b=1:rowsFromInd + vertailutNyt = uint16(x(:,a)>0 & y(:,b)>0); + vertailuja = vertailuja + vertailutNyt; + lisays = (x(:,a)~=y(:,b) & vertailutNyt); + summa = summa + uint16(lisays); + end + end + % fprintf(1,'\b\b'); + % fprintf(1,'%d',floor(10+80*j/nloci)); +end + +clear x; clear y; clear vertailutNyt; +clear eka; clear toka; clear data; clear lisays; +dist = zeros(length(vertailuja),1); +% nollat = find(vertailuja==0); +% dist(nollat) = 1; +dist(logical(vertailuja==0)) = 1; +muut = find(vertailuja>0); +dist(muut) = double(summa(muut))./double(vertailuja(muut)); +clear summa; clear vertailuja; clear muut; + +Z = computeLinkage(dist'); \ No newline at end of file diff --git a/matlab/linkage/CSEFlagDialog.m b/matlab/linkage/CSEFlagDialog.m new file mode 100644 index 0000000..c03b0e6 --- /dev/null +++ b/matlab/linkage/CSEFlagDialog.m @@ -0,0 +1,404 @@ +function out = CSEFlagDialog(items, varargin) +%CSEFLAGDIALOG makes a GUI dialog to select from options of diverse types +% (checkboxes, radiobuttons, text-inputs and popup lists). Some options can +% be linked, i.e. be mutually exclusive or be only enabled according to the +% value of another option. +% It is an extension of CSEOptionDialog. +% +% out = CSEFlagDialog(items) +% out = CSEFlagDialog(items, title) +% out = CSEFlagDialog(items, title, msg) +% out = CSEFlagDialog(items, title, msg, options) +% +% items is an array structure containing the options, in the following fields: +% .name is a string (the name of the option) +% .values is a cell array of possible answer. +% if empty, it is considered to contain "Yes" and "No", i.e. +% the option is a checkbox or a radio-button. +% if it contains only one numeric element, it is considered a +% header, and not an active control. +% if it contains only one text element, it is considered to be a +% text input field, with the .values as default value. +% Otherwise, it is a popup list with the choices given in values. +% .linked [optional field] is an array of index in items indicating which +% options are linked with the option. Linked options +% will be grayed out when the option is set to "No". +% If .linked contains negative elements, those will be grayed +% out when the option is set to "Yes". +% .exclusive [optional field] is an array of index in items indicating which +% options are mutually exclusive with each option. When the option +% is set to "Yes", mutually exclusive options are set to "No". If +% the field does not exist, or is empty, the control is a checkbox +% otherwise it is a radio-button. +% Both .linked and .exclusive behaviour are only implemented for +% "Yes"/"No" fields - but any field can be in "linked" and will +% be grayed out. +% .default [optional field] is an integer indicating the default value for +% each option (0: No, 1: Yes, -1: grayed out). Note that in the +% case of text input field, this field is irrelevant and the default +% is given in the "values" field; In the case of popup lists, the +% default is an index in .values, or -1 (grayed out). +% "Linked" and "Exclusive" are NOT evaluated in the initial layout +% of the dialog, hence the default must be consequent with the rules, +% e.g. linked fields of a "No" field must be grayed out. +% .indent [optional field] allows an indentation (proportional to the value +% of this field) from the position of the control. May be used +% to graphically make "groups" as no "frames" are used. +% .help [optional field] contains tooltips help texts. Can contain +% "\n" to make multi-line help fields. +% title is the window title (string) +% msg is a string appearing on the top of the dialog +% options is an optional structure containing options: +% .center = 0|1 (center text msg) +% .bold = 'light'|'normal'|'demi'|'bold' +% indicates how headers (see .values) must be printed out. +% .fixed = 0|1: FixedWidth font for lists +% [more to come in future versions] +% +% The controls will be display in the order they appear in items. +% +% out contains an array of structure of answers: +% out(i).answer = value of the control. +% values are: +% 1 or 0 for "Yes"/"No" controls, +% the index of the chosen item for list controls. +% the text given for text inputs. +% out is empty if cancel was chosen, negative integer if error. +% +% Written by L.Cavin, 07.12.2003, (c) CSE +% This code is free to use and modify for non-commercial purposes. +% Web address: http://ltcmail.ethz.ch/cavin/CSEDBLib.html#FLAGDIALOG +% +% ===================================================================== +% An example of usage is: (in items, elements not mentionned are empty; +% e.g. items(1).linked is empty). +% items(1).name = 'Contact:'; +% items(1).default = 0; +% items(1).values = {'email@address'}; +% items(1).help = 'Enter your email address.'; +% items(2).name = 'I will be coming!'; +% items(2).default = 1; +% items(2).linked = [3 4 5 6]; +% items(3).name = 'With my family'; +% items(3).default = 1; +% items(3).exclusive = 4; +% items(3).indent = 1; +% items(4).name = 'Alone'; +% items(4).default = 0; +% items(4).exclusive = 3; +% items(4).indent = 1; +% items(5).name = 'Transportation:'; +% items(5).indent = 1; +% items(5).values = {1}; +% items(6).name = 'Coming by'; +% items(6).default = 1; +% items(6).indent = 2; +% items(6).values = {'Train'; 'Bus'; 'Foot'}; +% items(6).help = 'Cars are polluting.\nUse public transportation whenever possible!'; +% items(7).name = 'I''ll sure give a phone call!'; +% items(7).default = 0; +% +% title = 'Birthday party incsription'; +% +% msg = sprintf(['Dear friends,\nAs you all know, I am turning 30 next april.\nThis ' ... +% 'seems a worthy occasion to party a bit!\n\nWill you be able to ' ... +% 'attend?']); +% +% out = CSEFlagDialog(items, title, msg) + + +persistent handles; + +if ischar(items) + % this ain't no cell, so must be a callback + callback_type = items; + out = []; +elseif ~isa(items, 'struct') + % this is an error + warn('CSE:Dialog', 'The first parameter must be a stucture of items.'); + out = -1; + return; +else + % we must generate the dialog + callback_type = 'create_dialog'; +end + +% first we do the callbacks, then we handle the initial call. +switch callback_type + case 'item_click' % this is an action on a control + ctrl_idx = get(gcbo, 'UserData'); + % should we "zero" other controls? + if ~isempty(handles.items(ctrl_idx).exclusive) & get(gcbo, 'Value')==1 + for i = 1:length(handles.items(ctrl_idx).exclusive) + set(handles.items_obj(handles.items(ctrl_idx).exclusive(i)), 'Value', 0); + end + end + % should we disable or enable other elements + if ~isempty(handles.items(ctrl_idx).linked) + if get(gcbo, 'Value')==1 + % enable + en_st = 'on'; + ds_st = 'off'; + else + % disable + en_st = 'off'; + ds_st = 'on'; + end + for i = 1:length(handles.items(ctrl_idx).linked) + tp = handles.items(ctrl_idx).linked(i); + if tp > 0 + set(handles.items_obj(tp), 'Enable', en_st); + if handles.head_obj(tp) > 0 + set(handles.head_obj(tp), 'Enable', en_st); + end + else + tp = -1 * tp; + set(handles.items_obj(tp), 'Enable', ds_st); + if handles.head_obj(tp) > 0 + set(handles.head_obj(tp), 'Enable', ds_st); + end + end + end + end + case 'ok_click' % this is a click on the OK button + for i = 1:length(handles.items_obj) + if strcmp(get(handles.items_obj(i), 'Style'), 'edit') + out(i).answer = cell2mat(get(handles.items_obj(i), 'String')); + else + out(i).answer = get(handles.items_obj(i), 'Value'); + end + end + set(gcbo,'userdata',out); + case 'cancel_click' % click on cancel button + out = []; + if ~isempty(handles) + set(handles.ok_button,'userdata',out); + else handles = get(0,'CurrentFigure'); + delete(handles); + end + case 'create_dialog' % now this is actually the initial call + options = []; + msg = ''; + title = 'CSE Flag Dialog'; + if nargin > 1 + title = varargin{1}; + if nargin > 2 + msg = varargin{2}; + if nargin > 3 + options = varargin{3}; + end + end + end + handles.options = options; + % precompute width and height of dialog: + % width is 10 + the longest chain of characters + 10; + % height is 1 + the number of lines in msg + 2 + number of items*2 + % + 2 + 2 + 2 (sum: ... + 9) + longest_chain = 0; + item_length = 0; + for i = 1:length(items) + if ~isfield(items(i), 'indent') | isempty(items(i).indent) + items(i).indent = 0; + end + if ~isfield(items(i), 'help') + items(i).help = []; + end + if ~isfield(items(i), 'linked') + items(i).linked = []; + end + if ~isfield(items(i), 'default') + items(i).default = []; + end + if ~isfield(items(i), 'exclusive') + items(i).exclusive = []; + end + val_length = 0; + if length(items(i).values) > 0 + for j = 1:length(items(i).values) + if length(items(i).values{j}) > val_length + val_length = length(items(i).values{j}); + end + end + if length(items(i).values) == 1 + val_length = val_length + 10; + end + end + items(i).mxlgt = val_length; + if length(items(i).name)+val_length+items(i).indent*4 > item_length + item_length = length(items(i).name)+val_length+items(i).indent*4; + end + if length(items(i).name)+val_length+items(i).indent*4 > longest_chain-10 + longest_chain = length(items(i).name)+10+val_length+items(i).indent*4; + end + end + handles.items = items; + % a = [regexp(msg, '\n') length(msg)]; % for > R13 + a = [strfind(msg,sprintf('\n')) length(msg)]; % for R12 + for i = 2:length(a) + if a(i)-a(i-1) > longest_chain + longest_chain = a(i)-a(i-1); + end + end + dial_width = max(longest_chain + 20, 50); + if isfield(handles.options, 'fixed') + % much wider on average... + dial_width = dial_width * 1.25; + end + dial_height = 9 + length(a)-1 + length(items)*2; + if length(msg) == 0 + dial_height = dial_height -2; + end + item_length = (dial_width-item_length-10)/2; + + % A) create window: (invisible, for now) + handles.dialog = dialog( 'Visible', 'off', ... + 'Units', 'characters', ... + 'Position', [10 10 dial_width dial_height], ... + 'CloseRequestFcn', 'CSEFlagDialog(''cancel_click'');', ... + 'WindowStyle', 'modal', ... + 'Name', title); + % COMPATIBILITY TOWARDS R12: repeating instructions for safety: + set(handles.dialog, 'Units', 'characters'); + set(handles.dialog, 'Position', [10 10 dial_width dial_height]); + + % B) create buttons: + handles.ok_button = uicontrol( 'Units', 'characters', ... + 'Parent', handles.dialog, ... + 'Position', [dial_width/2-20 2 10 2], ... + 'Callback', 'CSEFlagDialog(''ok_click'');', ... + 'String', 'OK' ); + handles.cancel_button = uicontrol( 'Units', 'characters', ... + 'Parent', handles.dialog, ... + 'Position', [dial_width/2+10 2 10 2], ... + 'Callback', 'CSEFlagDialog(''cancel_click'');', ... + 'String', 'Cancel' ); + + % C) create items: + for i = 1:length(items) + pos = length(items)-i+1; + if isempty(items(i).values) + ftype = 'radiobutton'; + if isempty(items(i).exclusive) + ftype = 'checkbox'; + end + handles.head_obj(i) = -1; + handles.items_obj(i) = uicontrol( 'Style', ftype, ... + 'Parent', handles.dialog, ... + 'Units', 'characters', ... + 'Position', [items(i).indent*4+item_length 4+2*pos length(items(i).name)+10 1], ... + 'Callback', 'CSEFlagDialog(''item_click'');', ... + 'UserData', i, ... + 'String', items(i).name ); + if ~isempty(items(i).default) & items(i).default > 0 + if strcmp(get(handles.items_obj(i), 'Style'), 'popupmenu') + if items(i).default > length(get(handles.items_obj(i), 'String')) + warn('CSE:Dialog', 'Initial value impossible for listbox %s.', items(i).name); + items(i).default = length(get(handles.items_obj(i), 'String')); + end + end + set(handles.items_obj(i), 'Value', items(i).default); + end + else + if length(items(i).values) == 1 + if isnumeric(items(i).values{1}) + ftype = 'none'; + else + ftype = 'edit'; + end + else + ftype = 'popupmenu'; + end + % needs a "header": + handles.head_obj(i) = uicontrol( 'Style', 'text', ... + 'Parent', handles.dialog, ... + 'Units', 'characters', ... + 'Position', [items(i).indent*4+item_length 4+2*pos (length(items(i).name)+2)*1.5 1], ... + 'String', items(i).name, ... + 'HorizontalAlignment', 'left'); + if ~strcmp(ftype, 'none') + handles.items_obj(i) = uicontrol( 'Style', ftype, ... + 'Parent', handles.dialog, ... + 'BackgroundColor', 'white', ... + 'Units', 'characters', ... + 'Position', [items(i).indent*4+item_length+length(items(i).name)+3 4+2*pos-0.2 items(i).mxlgt+10 1.4], ... + 'Callback', 'CSEFlagDialog(''item_click'');', ... + 'UserData', i, ... + 'String', items(i).values, ... + 'HorizontalAlignment', 'left' ); + if ~isempty(items(i).default) & items(i).default > 0 + if strcmp(get(handles.items_obj(i), 'Style'), 'popupmenu') + if items(i).default > length(get(handles.items_obj(i), 'String')) + warn('CSE:Dialog', 'Initial value impossible for listbox %s.', items(i).name); + items(i).default = length(get(handles.items_obj(i), 'String')); + end + end + set(handles.items_obj(i), 'Value', items(i).default); + end + if isfield(handles.options, 'fixed') + set(handles.items_obj(i), 'FontName', 'FixedWidth'); + set(handles.items_obj(i), 'Position', [items(i).indent*4+item_length+length(items(i).name)+3 4+2*pos-0.2 (items(i).mxlgt+10)*1.3 1.4]); + end + else + % was actually the "header" :-) + if isfield(handles.options, 'bold') + bld = handles.options.bold; + else + bld = 'normal'; + end + set(handles.head_obj(i), 'FontWeight', bld); + handles.items_obj(i) = handles.head_obj(i); + end + end + if ~isempty(items(i).default) & items(i).default < 0 + set(handles.head_obj(i), 'Enable', 'off'); % RECENTLY ADDED + set(handles.items_obj(i), 'Enable', 'off'); + end + if ~isempty(items(i).help) + set(handles.items_obj(i), 'Tooltip', sprintf(items(i).help)); + end + end + + % D) create message: + if length(msg) > 0 + if isfield(handles.options, 'center') & handles.options.center == 1 + algn = 'center'; + else + algn = 'left'; + end + handles.text = uicontrol( 'Units', 'characters', ... + 'HorizontalAlignment', algn, ... + 'Style', 'text', ... + 'Parent', handles.dialog, ... + 'Position', [5 dial_height-length(a)-1 longest_chain+10 length(a)], ... + 'String', msg ); + end + + % E) "center" and show dialog: + movegui(handles.dialog, 'center'); + set(handles.dialog, 'Visible', 'on'); + drawnow % NB! for R14 + % F) run dialog: + waitfor(handles.ok_button,'userdata'); + + % G) finish call: + out = get(handles.ok_button,'userdata'); + delete(handles.dialog); + clear handles; + + otherwise + % this is a mistake + warn('CSE:Dialog', 'Unknown Callback. Even if there is only one option, it must be passed as a cell.'); + out = -1; +end + + +%=====HELPER FUNCTION +function warn(tag, msg, varargin) +% patch for the warning function to make R13 calls compatible with R12 + +if str2num(version('-release')) < 13 + warning(sprintf(msg, varargin{:})); +else + warning(tag, msg, varargin{:}); +end + \ No newline at end of file diff --git a/matlab/linkage/CSEFlagDialog1.m b/matlab/linkage/CSEFlagDialog1.m new file mode 100644 index 0000000..f7274c7 --- /dev/null +++ b/matlab/linkage/CSEFlagDialog1.m @@ -0,0 +1,404 @@ +function out = CSEFlagDialog1(items, varargin) +%CSEFLAGDIALOG makes a GUI dialog to select from options of diverse types +% (checkboxes, radiobuttons, text-inputs and popup lists). Some options can +% be linked, i.e. be mutually exclusive or be only enabled according to the +% value of another option. +% It is an extension of CSEOptionDialog. +% +% out = CSEFlagDialog(items) +% out = CSEFlagDialog(items, title) +% out = CSEFlagDialog(items, title, msg) +% out = CSEFlagDialog(items, title, msg, options) +% +% items is an array structure containing the options, in the following fields: +% .name is a string (the name of the option) +% .values is a cell array of possible answer. +% if empty, it is considered to contain "Yes" and "No", i.e. +% the option is a checkbox or a radio-button. +% if it contains only one numeric element, it is considered a +% header, and not an active control. +% if it contains only one text element, it is considered to be a +% text input field, with the .values as default value. +% Otherwise, it is a popup list with the choices given in values. +% .linked [optional field] is an array of index in items indicating which +% options are linked with the option. Linked options +% will be grayed out when the option is set to "No". +% If .linked contains negative elements, those will be grayed +% out when the option is set to "Yes". +% .exclusive [optional field] is an array of index in items indicating which +% options are mutually exclusive with each option. When the option +% is set to "Yes", mutually exclusive options are set to "No". If +% the field does not exist, or is empty, the control is a checkbox +% otherwise it is a radio-button. +% Both .linked and .exclusive behaviour are only implemented for +% "Yes"/"No" fields - but any field can be in "linked" and will +% be grayed out. +% .default [optional field] is an integer indicating the default value for +% each option (0: No, 1: Yes, -1: grayed out). Note that in the +% case of text input field, this field is irrelevant and the default +% is given in the "values" field; In the case of popup lists, the +% default is an index in .values, or -1 (grayed out). +% "Linked" and "Exclusive" are NOT evaluated in the initial layout +% of the dialog, hence the default must be consequent with the rules, +% e.g. linked fields of a "No" field must be grayed out. +% .indent [optional field] allows an indentation (proportional to the value +% of this field) from the position of the control. May be used +% to graphically make "groups" as no "frames" are used. +% .help [optional field] contains tooltips help texts. Can contain +% "\n" to make multi-line help fields. +% title is the window title (string) +% msg is a string appearing on the top of the dialog +% options is an optional structure containing options: +% .center = 0|1 (center text msg) +% .bold = 'light'|'normal'|'demi'|'bold' +% indicates how headers (see .values) must be printed out. +% .fixed = 0|1: FixedWidth font for lists +% [more to come in future versions] +% +% The controls will be display in the order they appear in items. +% +% out contains an array of structure of answers: +% out(i).answer = value of the control. +% values are: +% 1 or 0 for "Yes"/"No" controls, +% the index of the chosen item for list controls. +% the text given for text inputs. +% out is empty if cancel was chosen, negative integer if error. +% +% Written by L.Cavin, 07.12.2003, (c) CSE +% This code is free to use and modify for non-commercial purposes. +% Web address: http://ltcmail.ethz.ch/cavin/CSEDBLib.html#FLAGDIALOG +% +% ===================================================================== +% An example of usage is: (in items, elements not mentionned are empty; +% e.g. items(1).linked is empty). +% items(1).name = 'Contact:'; +% items(1).default = 0; +% items(1).values = {'email@address'}; +% items(1).help = 'Enter your email address.'; +% items(2).name = 'I will be coming!'; +% items(2).default = 1; +% items(2).linked = [3 4 5 6]; +% items(3).name = 'With my family'; +% items(3).default = 1; +% items(3).exclusive = 4; +% items(3).indent = 1; +% items(4).name = 'Alone'; +% items(4).default = 0; +% items(4).exclusive = 3; +% items(4).indent = 1; +% items(5).name = 'Transportation:'; +% items(5).indent = 1; +% items(5).values = {1}; +% items(6).name = 'Coming by'; +% items(6).default = 1; +% items(6).indent = 2; +% items(6).values = {'Train'; 'Bus'; 'Foot'}; +% items(6).help = 'Cars are polluting.\nUse public transportation whenever possible!'; +% items(7).name = 'I''ll sure give a phone call!'; +% items(7).default = 0; +% +% title = 'Birthday party incsription'; +% +% msg = sprintf(['Dear friends,\nAs you all know, I am turning 30 next april.\nThis ' ... +% 'seems a worthy occasion to party a bit!\n\nWill you be able to ' ... +% 'attend?']); +% +% out = CSEFlagDialog(items, title, msg) + + +persistent handles; + +if ischar(items) + % this ain't no cell, so must be a callback + callback_type = items; + out = []; +elseif ~isa(items, 'struct') + % this is an error + warn('CSE:Dialog', 'The first parameter must be a stucture of items.'); + out = -1; + return; +else + % we must generate the dialog + callback_type = 'create_dialog'; +end + +% first we do the callbacks, then we handle the initial call. +switch callback_type + case 'item_click' % this is an action on a control + ctrl_idx = get(gcbo, 'UserData'); + % should we "zero" other controls? + if ~isempty(handles.items(ctrl_idx).exclusive) & get(gcbo, 'Value')==1 + for i = 1:length(handles.items(ctrl_idx).exclusive) + set(handles.items_obj(handles.items(ctrl_idx).exclusive(i)), 'Value', 0); + end + end + % should we disable or enable other elements + if ~isempty(handles.items(ctrl_idx).linked) + if get(gcbo, 'Value')==1 || get(gcbo, 'Value')==2 + % enable + en_st = 'on'; + ds_st = 'off'; + else + % disable + en_st = 'off'; + ds_st = 'on'; + end + for i = 1:length(handles.items(ctrl_idx).linked) + tp = handles.items(ctrl_idx).linked(i); + if tp > 0 + set(handles.items_obj(tp), 'Enable', en_st); + if handles.head_obj(tp) > 0 + set(handles.head_obj(tp), 'Enable', en_st); + end + else + tp = -1 * tp; + set(handles.items_obj(tp), 'Enable', ds_st); + if handles.head_obj(tp) > 0 + set(handles.head_obj(tp), 'Enable', ds_st); + end + end + end + end + case 'ok_click' % this is a click on the OK button + for i = 1:length(handles.items_obj) + if strcmp(get(handles.items_obj(i), 'Style'), 'edit') + out(i).answer = cell2mat(get(handles.items_obj(i), 'String')); + else + out(i).answer = get(handles.items_obj(i), 'Value'); + end + end + set(gcbo,'userdata',out); + case 'cancel_click' % click on cancel button + out = []; + if ~isempty(handles) + set(handles.ok_button,'userdata',out); + else handles = get(0,'CurrentFigure'); + delete(handles); + end + case 'create_dialog' % now this is actually the initial call + options = []; + msg = ''; + title = 'CSE Flag Dialog'; + if nargin > 1 + title = varargin{1}; + if nargin > 2 + msg = varargin{2}; + if nargin > 3 + options = varargin{3}; + end + end + end + handles.options = options; + % precompute width and height of dialog: + % width is 10 + the longest chain of characters + 10; + % height is 1 + the number of lines in msg + 2 + number of items*2 + % + 2 + 2 + 2 (sum: ... + 9) + longest_chain = 0; + item_length = 0; + for i = 1:length(items) + if ~isfield(items(i), 'indent') | isempty(items(i).indent) + items(i).indent = 0; + end + if ~isfield(items(i), 'help') + items(i).help = []; + end + if ~isfield(items(i), 'linked') + items(i).linked = []; + end + if ~isfield(items(i), 'default') + items(i).default = []; + end + if ~isfield(items(i), 'exclusive') + items(i).exclusive = []; + end + val_length = 0; + if length(items(i).values) > 0 + for j = 1:length(items(i).values) + if length(items(i).values{j}) > val_length + val_length = length(items(i).values{j}); + end + end + if length(items(i).values) == 1 + val_length = val_length + 10; + end + end + items(i).mxlgt = val_length; + if length(items(i).name)+val_length+items(i).indent*4 > item_length + item_length = length(items(i).name)+val_length+items(i).indent*4; + end + if length(items(i).name)+val_length+items(i).indent*4 > longest_chain-10 + longest_chain = length(items(i).name)+10+val_length+items(i).indent*4; + end + end + handles.items = items; + % a = [regexp(msg, '\n') length(msg)]; % for > R13 + a = [strfind(msg,sprintf('\n')) length(msg)]; % for R12 + for i = 2:length(a) + if a(i)-a(i-1) > longest_chain + longest_chain = a(i)-a(i-1); + end + end + dial_width = max(longest_chain + 20, 50); + if isfield(handles.options, 'fixed') + % much wider on average... + dial_width = dial_width * 1.25; + end + dial_height = 9 + length(a)-1 + length(items)*2; + if length(msg) == 0 + dial_height = dial_height -2; + end + item_length = (dial_width-item_length-10)/2; + + % A) create window: (invisible, for now) + handles.dialog = dialog( 'Visible', 'off', ... + 'Units', 'characters', ... + 'Position', [10 10 dial_width dial_height], ... + 'CloseRequestFcn', 'CSEFlagDialog(''cancel_click'');', ... + 'WindowStyle', 'modal', ... + 'Name', title); + % COMPATIBILITY TOWARDS R12: repeating instructions for safety: + set(handles.dialog, 'Units', 'characters'); + set(handles.dialog, 'Position', [10 10 dial_width dial_height]); + + % B) create buttons: + handles.ok_button = uicontrol( 'Units', 'characters', ... + 'Parent', handles.dialog, ... + 'Position', [dial_width/2-20 2 10 2], ... + 'Callback', 'CSEFlagDialog1(''ok_click'');', ... + 'String', 'OK' ); + handles.cancel_button = uicontrol( 'Units', 'characters', ... + 'Parent', handles.dialog, ... + 'Position', [dial_width/2+10 2 10 2], ... + 'Callback', 'CSEFlagDialog1(''cancel_click'');', ... + 'String', 'Cancel' ); + + % C) create items: + for i = 1:length(items) + pos = length(items)-i+1; + if isempty(items(i).values) + ftype = 'radiobutton'; + if isempty(items(i).exclusive) + ftype = 'checkbox'; + end + handles.head_obj(i) = -1; + handles.items_obj(i) = uicontrol( 'Style', ftype, ... + 'Parent', handles.dialog, ... + 'Units', 'characters', ... + 'Position', [items(i).indent*4+item_length 4+2*pos length(items(i).name)+10 1], ... + 'Callback', 'CSEFlagDialog1(''item_click'');', ... + 'UserData', i, ... + 'String', items(i).name ); + if ~isempty(items(i).default) & items(i).default > 0 + if strcmp(get(handles.items_obj(i), 'Style'), 'popupmenu') + if items(i).default > length(get(handles.items_obj(i), 'String')) + warn('CSE:Dialog', 'Initial value impossible for listbox %s.', items(i).name); + items(i).default = length(get(handles.items_obj(i), 'String')); + end + end + set(handles.items_obj(i), 'Value', items(i).default); + end + else + if length(items(i).values) == 1 + if isnumeric(items(i).values{1}) + ftype = 'none'; + else + ftype = 'edit'; + end + else + ftype = 'popupmenu'; + end + % needs a "header": + handles.head_obj(i) = uicontrol( 'Style', 'text', ... + 'Parent', handles.dialog, ... + 'Units', 'characters', ... + 'Position', [items(i).indent*4+item_length 4+2*pos (length(items(i).name)+2)*1.5 1], ... + 'String', items(i).name, ... + 'HorizontalAlignment', 'left'); + if ~strcmp(ftype, 'none') + handles.items_obj(i) = uicontrol( 'Style', ftype, ... + 'Parent', handles.dialog, ... + 'BackgroundColor', 'white', ... + 'Units', 'characters', ... + 'Position', [items(i).indent*4+item_length+length(items(i).name)+3 4+2*pos-0.2 items(i).mxlgt+10 1.4], ... + 'Callback', 'CSEFlagDialog1(''item_click'');', ... + 'UserData', i, ... + 'String', items(i).values, ... + 'HorizontalAlignment', 'left' ); + if ~isempty(items(i).default) & items(i).default > 0 + if strcmp(get(handles.items_obj(i), 'Style'), 'popupmenu') + if items(i).default > length(get(handles.items_obj(i), 'String')) + warn('CSE:Dialog', 'Initial value impossible for listbox %s.', items(i).name); + items(i).default = length(get(handles.items_obj(i), 'String')); + end + end + set(handles.items_obj(i), 'Value', items(i).default); + end + if isfield(handles.options, 'fixed') + set(handles.items_obj(i), 'FontName', 'FixedWidth'); + set(handles.items_obj(i), 'Position', [items(i).indent*4+item_length+length(items(i).name)+3 4+2*pos-0.2 (items(i).mxlgt+10)*1.3 1.4]); + end + else + % was actually the "header" :-) + if isfield(handles.options, 'bold') + bld = handles.options.bold; + else + bld = 'normal'; + end + set(handles.head_obj(i), 'FontWeight', bld); + handles.items_obj(i) = handles.head_obj(i); + end + end + if ~isempty(items(i).default) & items(i).default < 0 + set(handles.head_obj(i), 'Enable', 'off'); % RECENTLY ADDED + set(handles.items_obj(i), 'Enable', 'off'); + end + if ~isempty(items(i).help) + set(handles.items_obj(i), 'Tooltip', sprintf(items(i).help)); + end + end + + % D) create message: + if length(msg) > 0 + if isfield(handles.options, 'center') & handles.options.center == 1 + algn = 'center'; + else + algn = 'left'; + end + handles.text = uicontrol( 'Units', 'characters', ... + 'HorizontalAlignment', algn, ... + 'Style', 'text', ... + 'Parent', handles.dialog, ... + 'Position', [5 dial_height-length(a)-1 longest_chain+10 length(a)], ... + 'String', msg ); + end + + % E) "center" and show dialog: + movegui(handles.dialog, 'center'); + set(handles.dialog, 'Visible', 'on'); + drawnow % NB! for R14 + % F) run dialog: + waitfor(handles.ok_button,'userdata'); + + % G) finish call: + out = get(handles.ok_button,'userdata'); + delete(handles.dialog); + clear handles; + + otherwise + % this is a mistake + warn('CSE:Dialog', 'Unknown Callback. Even if there is only one option, it must be passed as a cell.'); + out = -1; +end + + +%=====HELPER FUNCTION +function warn(tag, msg, varargin) +% patch for the warning function to make R13 calls compatible with R12 + +if str2num(version('-release')) < 13 + warning(sprintf(msg, varargin{:})); +else + warning(tag, msg, varargin{:}); +end + \ No newline at end of file diff --git a/matlab/linkage/allfreqsnew.m b/matlab/linkage/allfreqsnew.m new file mode 100644 index 0000000..086b52a --- /dev/null +++ b/matlab/linkage/allfreqsnew.m @@ -0,0 +1,63 @@ +function [counts,noalle,prior,adjprior,rawalleles,rawdata] = allfreqsnew(rawdata) +% Filename: allfreqsnew.m +% [counts,noalle,prior,adjprior,rawalleles] = allfreqsnew(rawdata) +% +% Description: +% rawdata has n rows (2 x #individuals) and n(l) first +% colums are the loci. The last column is the subpopindex +% prior is a created matrix of positive Dirichlet hyperparameters +% missing data is filtered out +% !!!NEW!!!zeros are accepted as allele codes and any negative numbers as missing data. + +% Modified by: Jing Tang + +SCALE = 1; +dime=size(rawdata); +noalle=zeros(dime(2)-1,1); +rawalleles=cell(1,dime(2)-1); +for i=1:dime(2)-1 + noalle(i)=length(unique(rawdata(:,i))); +end +for i=1:dime(2)-1 + if length(find(rawdata(:,i)<=0))>0 % filtering out the missing data + noalle(i)=noalle(i)-1; + end +end +for i=1:dime(2)-1 + rawalles=unique(rawdata(:,i)); + if rawalles(1)<=0 + rawalles(1)=-999; + end + rawalleles{i} = rawalles; % rawalleles!!! + if rawalles(1)<=0 + for j=2:noalle(i)+1 + rawdata(find(rawdata(:,i)==rawalles(j)),i)=ones(length(find(rawdata(:,i)==rawalles(j))),1)*(j-1); + end + else + for j=1:noalle(i) + rawdata(find(rawdata(:,i)==rawalles(j)),i)=ones(length(find(rawdata(:,i)==rawalles(j))),1)*j; + end +end +end + + +counts=zeros(max(noalle),dime(2)-1,max(rawdata(:,dime(2)))); + +for i=1:dime(1) + for j=1:dime(2)-1 + if rawdata(i,j)>0 + counts(rawdata(i,j),j,rawdata(i,dime(2)))=... + counts(rawdata(i,j),j,rawdata(i,dime(2)))+1; + end + end +end + +prior=[]; +for i=1:dime(2)-1 + prior=[prior [SCALE*ones(noalle(i),1)/noalle(i);zeros(max(noalle)-noalle(i),1)]]; +end + +adjprior=prior; +for i=1:dime(2)-1 + adjprior(:,i)=adjprior(:,i)+[zeros(noalle(i),1);ones(max(noalle)-noalle(i),1)]; +end diff --git a/matlab/linkage/allfreqsnew2.m b/matlab/linkage/allfreqsnew2.m new file mode 100644 index 0000000..6746391 --- /dev/null +++ b/matlab/linkage/allfreqsnew2.m @@ -0,0 +1,89 @@ +function [counts,noalle_est,prior,adjprior,rawalleles] = allfreqsnew2(rawdata, noalle_est) +% Filename: allfreqsnew2.m +% [counts,noalle,prior,adjprior,rawalleles] = allfreqsnew(rawdata) +% +% Description: +% rawdata has n rows (2 x #individuals) and n(l) first +% colums are the loci. The last column is the subpopindex +% prior is a created matrix of positive Dirichlet hyperparameters +% missing data is filtered out +% !!!NEW!!!zeros are accepted as allele codes and any negative numbers as missing data. + +% Modified by: Jing Tang +SCALE = 1; +dime=size(rawdata); +noalle=zeros(dime(2)-1,1); +rawalleles=cell(1,dime(2)-1); +for i=1:dime(2)-1 + noalle(i)=length(unique(rawdata(:,i))); +end +for i=1:dime(2)-1 + if length(find(rawdata(:,i)<=0))>0 + noalle(i)=noalle(i)-1; + end +end + +% Fomulate the raw data such that the value i in a entry denotes the ith +% alleles. +for i=1:dime(2)-1 + [rawalles ix iy]=unique(rawdata(:,i)); %modified by Lu Cheng + % rawalles = [1:noalle(i)]'; + if rawalles(1)<=0 + rawalles(1)=-999; + end + rawalleles{i} = rawalles; %rawalleles!!! + if rawalles(1)<0 + for j=2:noalle(i)+1 + rawdata(logical(rawdata(:,i)==rawalles(j)),i)=ones(length(find(rawdata(:,i)==rawalles(j))),1)*(j-1); + end + else + %rawdata(:,i) = iy; + for j=1:noalle(i) + rawdata(logical(rawdata(:,i)==rawalles(j)),i)=ones(length(find(rawdata(:,i)==rawalles(j))),1)*j; + end + end +end + +% ALLOWED_MEMORY = 50; % in unit of megabyte. +% n1 = max(noalle_est); +% n2 = dime(2)-1; +% n3 = double(max(rawdata(:,dime(2)))); +% ncells = n1*n2*n3; +% memory_used = ncells/(1024*1024); % using uint8 format. +% if memory_used < ALLOWED_MEMORY +% counts=zeros(n1,n2,n3,'uint8'); +% else +% nbatches = ceil(memory_used/ALLOWED_MEMORY); +% n3_in_batch = ceil(n3/nbatches); +% counts = cell(nbatches,1); +% for i=1:nbatches-1 +% % counts = cat(3,counts,uint16(zeros(n1,n2,n3_in_batch))); +% counts{i} = zeros(n1,n2,n3_in_batch,'uint8'); +% end +% % counts = cat(3, counts, uint16(zeros(n1,n2,n3-n3_in_batch*(nbatches-1)))); +% counts{i} = zeros(n1,n2,n3-n3_in_batch*(nbatches-1),'uint8'); +% end + + + +counts = zeros(max(noalle_est),dime(2)-1,max(rawdata(:,dime(2))),'uint8'); +for i=1:dime(1) + for j=1:dime(2)-1 + if rawdata(i,j)>0 + counts(rawdata(i,j),j,rawdata(i,dime(2)))=... + counts(rawdata(i,j),j,rawdata(i,dime(2)))+1; + end + end +end + +maxnoalle = max(noalle_est); +% prior = []; +prior=zeros(maxnoalle,dime(2)-1); +for i=1:dime(2)-1 + prior(:,i) = [SCALE*ones(noalle_est(i),1)/noalle_est(i);zeros(maxnoalle-noalle_est(i),1)]; +end + +adjprior=prior; +for i=1:dime(2)-1 + adjprior(:,i)=adjprior(:,i)+[zeros(noalle_est(i),1);ones(maxnoalle-noalle_est(i),1)]; +end diff --git a/matlab/linkage/allfreqsnew3.m b/matlab/linkage/allfreqsnew3.m new file mode 100644 index 0000000..086b52a --- /dev/null +++ b/matlab/linkage/allfreqsnew3.m @@ -0,0 +1,63 @@ +function [counts,noalle,prior,adjprior,rawalleles,rawdata] = allfreqsnew(rawdata) +% Filename: allfreqsnew.m +% [counts,noalle,prior,adjprior,rawalleles] = allfreqsnew(rawdata) +% +% Description: +% rawdata has n rows (2 x #individuals) and n(l) first +% colums are the loci. The last column is the subpopindex +% prior is a created matrix of positive Dirichlet hyperparameters +% missing data is filtered out +% !!!NEW!!!zeros are accepted as allele codes and any negative numbers as missing data. + +% Modified by: Jing Tang + +SCALE = 1; +dime=size(rawdata); +noalle=zeros(dime(2)-1,1); +rawalleles=cell(1,dime(2)-1); +for i=1:dime(2)-1 + noalle(i)=length(unique(rawdata(:,i))); +end +for i=1:dime(2)-1 + if length(find(rawdata(:,i)<=0))>0 % filtering out the missing data + noalle(i)=noalle(i)-1; + end +end +for i=1:dime(2)-1 + rawalles=unique(rawdata(:,i)); + if rawalles(1)<=0 + rawalles(1)=-999; + end + rawalleles{i} = rawalles; % rawalleles!!! + if rawalles(1)<=0 + for j=2:noalle(i)+1 + rawdata(find(rawdata(:,i)==rawalles(j)),i)=ones(length(find(rawdata(:,i)==rawalles(j))),1)*(j-1); + end + else + for j=1:noalle(i) + rawdata(find(rawdata(:,i)==rawalles(j)),i)=ones(length(find(rawdata(:,i)==rawalles(j))),1)*j; + end +end +end + + +counts=zeros(max(noalle),dime(2)-1,max(rawdata(:,dime(2)))); + +for i=1:dime(1) + for j=1:dime(2)-1 + if rawdata(i,j)>0 + counts(rawdata(i,j),j,rawdata(i,dime(2)))=... + counts(rawdata(i,j),j,rawdata(i,dime(2)))+1; + end + end +end + +prior=[]; +for i=1:dime(2)-1 + prior=[prior [SCALE*ones(noalle(i),1)/noalle(i);zeros(max(noalle)-noalle(i),1)]]; +end + +adjprior=prior; +for i=1:dime(2)-1 + adjprior(:,i)=adjprior(:,i)+[zeros(noalle(i),1);ones(max(noalle)-noalle(i),1)]; +end diff --git a/matlab/linkage/allfreqsnew4.m b/matlab/linkage/allfreqsnew4.m new file mode 100644 index 0000000..edf0109 --- /dev/null +++ b/matlab/linkage/allfreqsnew4.m @@ -0,0 +1,90 @@ +function [rawdata,noalle_est,prior,adjprior] = allfreqsnew4(rawdata, noalle_est) +% Filename: allfreqsnew4.m +% [counts,noalle,prior,adjprior,rawalleles] = allfreqsnew(rawdata) +% +% Description: +% rawdata has n rows (2 x #individuals) and n(l) first +% colums are the loci. The last column is the subpopindex +% prior is a created matrix of positive Dirichlet hyperparameters +% missing data is filtered out +% !!!NEW!!!zeros are accepted as allele codes and any negative numbers as missing data. + +% Modified by: Jing Tang +SCALE = 1; +dime=size(rawdata); +noalle=zeros(dime(2)-1,1); +rawalleles=cell(1,dime(2)-1); +for i=1:dime(2)-1 + noalle(i)=length(unique(rawdata(:,i))); +end +for i=1:dime(2)-1 + if length(find(rawdata(:,i)<=0))>0 + noalle(i)=noalle(i)-1; + end +end + +% Fomulate the raw data such that the value i in a entry denotes the ith +% alleles. +for i=1:dime(2)-1 + rawalles=unique(rawdata(:,i)); + % rawalles = [1:noalle(i)]'; + if rawalles(1)<=0 + rawalles(1)=-999; + end + rawalleles{i} = rawalles; %rawalleles!!! + if rawalles(1)<0 + for j=2:noalle(i)+1 + %rawdata(find(rawdata(:,i)==rawalles(j)),i)=ones(length(find(rawdata(:,i)==rawalles(j))),1)*(j-1); + rawdata(logical(rawdata(:,i)==rawalles(j)),i)=ones(length(find(rawdata(:,i)==rawalles(j))),1)*(j-1); + end + else + for j=1:noalle(i) + % rawdata(find(rawdata(:,i)==rawalles(j)),i)=ones(length(find(rawdata(:,i)==rawalles(j))),1)*j; + rawdata(logical(rawdata(:,i)==rawalles(j)),i)=ones(length(find(rawdata(:,i)==rawalles(j))),1)*j; + end + end +end + +% ALLOWED_MEMORY = 50; % in unit of megabyte. +% n1 = max(noalle_est); +% n2 = dime(2)-1; +% n3 = double(max(rawdata(:,dime(2)))); +% ncells = n1*n2*n3; +% memory_used = ncells/(1024*1024); % using uint8 format. +% if memory_used < ALLOWED_MEMORY +% counts=zeros(n1,n2,n3,'uint8'); +% else +% nbatches = ceil(memory_used/ALLOWED_MEMORY); +% n3_in_batch = ceil(n3/nbatches); +% counts = cell(nbatches,1); +% for i=1:nbatches-1 +% % counts = cat(3,counts,uint16(zeros(n1,n2,n3_in_batch))); +% counts{i} = zeros(n1,n2,n3_in_batch,'uint8'); +% end +% % counts = cat(3, counts, uint16(zeros(n1,n2,n3-n3_in_batch*(nbatches-1)))); +% counts{i} = zeros(n1,n2,n3-n3_in_batch*(nbatches-1),'uint8'); +% end + + + +% counts = zeros(max(noalle_est),dime(2)-1,max(rawdata(:,dime(2))),'uint8'); +% for i=1:dime(1) +% for j=1:dime(2)-1 +% if rawdata(i,j)>0 +% counts(rawdata(i,j),j,rawdata(i,dime(2)))=... +% counts(rawdata(i,j),j,rawdata(i,dime(2)))+1; +% end +% end +% end + +maxnoalle = max(noalle_est); +% prior = []; +prior=zeros(maxnoalle,dime(2)-1); +for i=1:dime(2)-1 + prior(:,i) = [SCALE*ones(noalle_est(i),1)/noalle_est(i);zeros(maxnoalle-noalle_est(i),1)]; +end + +adjprior=prior; +for i=1:dime(2)-1 + adjprior(:,i)=adjprior(:,i)+[zeros(noalle_est(i),1);ones(maxnoalle-noalle_est(i),1)]; +end diff --git a/matlab/linkage/askSeq.m b/matlab/linkage/askSeq.m new file mode 100644 index 0000000..7b29574 --- /dev/null +++ b/matlab/linkage/askSeq.m @@ -0,0 +1,26 @@ +function [isOK,returnvalue] = askSeq() + +items(1).name = 'Yes. Continue loading the sequence profile.'; +items(1).default = 1; +items(1).exclusive = [2]; +items(1).values = []; + +items(2).name = 'No. Stop loading data.'; +items(2).default = 0; +items(2).exclusive = [1]; +items(2).values = []; + +title = 'Load Sequence?'; +msg = sprintf(['The allelic profile has been loaded.\nWould you like to continue loading the corresponding gene sequence data?']); +out = CSEFlagDialog(items, title, msg); +if ~(isempty(out)), + if(out(1).answer==1) + returnvalue = 1; + elseif(out(2).answer==1) + returnvalue = 2; + end + isOK = 1; +else + isOK = 0; + returnvalue = 0; +end \ No newline at end of file diff --git a/matlab/linkage/choosebox.m b/matlab/linkage/choosebox.m new file mode 100644 index 0000000..151315a --- /dev/null +++ b/matlab/linkage/choosebox.m @@ -0,0 +1,353 @@ +function [selection,value] = choosebox(varargin) +%CHOOSEBOX Two-listed item selection dialog box. +% [SELECTION,OK] = CHOOSEBOX('ListString',S) creates a modal dialog box +% which allows you to select a string or multiple strings from a list. +% Single or multiple strings can be transferred from a base list to a +% selection list using an arrow-button. Single strings also can be +% transferred by double-clicking; single or multiple strings are +% transferred by pressing . +% SELECTION is a vector of indices of the selected strings (length 1 +% in the single selection mode). The indices will be in the order of +% selection from the base list. If a group of multiple strings is +% selected, its order inside the group will not change, but different +% groups are ordered by their selection. +% OK is 1 if you push the OK button, or 0 if you push the Cancel +% button or close the figure. In that case SELECTION will be [], +% regardless to the actual selection list. +% Important parameter is 'ChooseMode', see list below. +% +% Inputs are in parameter,value pairs: +% +% Parameter Description +% 'ChooseMode' string; can be 'pick' or 'copy'. +% When set to 'pick', transferred string items from the +% base list box are removed. When retransferred, they +% again are listed in their initial positions. +% When set to 'copy', transferred string items remain in +% the base list and can be transferred several times. +% default is 'pick'. +% 'ListString' cell array of strings for the base list box. +% 'SelectionMode' string; can be 'single' or 'multiple'; defaults to +% 'multiple'. +% 'ListSize' [width height] of listbox in pixels; defaults +% to [160 300]. +% 'InitialValue' vector of indices of which items of the list box +% are initially selected; defaults to none []. +% 'Name' String for the figure's title. Defaults to ''. +% 'PromptString' string matrix or cell array of strings which appears +% as text above the base list box. Defaults to {}. +% 'SelectString' string matrix or cell array of strings which appears +% as text above the selection list box. Defaults to {}. +% 'OKString' string for the OK button; defaults to 'OK'. +% 'CancelString' string for the Cancel button; defaults to 'Cancel'. +% 'uh' uicontrol button height, in pixels; default = 18. +% 'fus' frame/uicontrol spacing, in pixels; default = 8. +% 'ffs' frame/figure spacing, in pixels; default = 8. +% +% Example: +% d = dir; +% str = {d.name}; +% [s,v] = choosebox('Name','File deletion',... +% 'PromptString','Files remaining in this directory:',... +% 'SelectString','Files to delete:',... +% 'ListString',str) +% +% inspired by listdlg.m from Mathworks. +% +% programmed by Peter Wasmeier, Technical University of Munich +% p.wasmeier@bv.tum.de +% 11-12-03 + +% Original listdlg file by +% T. Krauss, 12/7/95, P.N. Secakusuma, 6/10/97 +% Copyright 1984-2002 The MathWorks, Inc. +% $Revision: 1.20 $ $Date: 2002/04/09 01:36:06 $ + +% Test: d = dir;[s,v] = choosebox('Name','File deletion','PromptString','Files remaining in this directory:','SelectString','Files to delete:','ListString',{d.name}); + + + +error(nargchk(1,inf,nargin)) + +arrow=[... + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 + 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 + 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 0 + 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 0 + 0 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 0 + 0 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 0 + 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 + 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 + 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 + 0 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 0 + 0 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 0 + 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 0 + 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 0 + 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 + 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]; +for i=1:3 + rarrow(:,:,i)=arrow; + larrow(:,:,i)=fliplr(arrow); +end +clear arrow; + +figname = ''; +smode = 2; % (multiple) +cmode = 1; % remove from left hand side +promptstring = {}; +selectstring={}; +liststring = []; +listsize = [160 300]; +initialvalue = []; +okstring = 'Ok'; +cancelstring = 'Cancel'; +fus = 8; +ffs = 8; +uh = 18; + +if mod(length(varargin),2) ~= 0 + % input args have not com in pairs, woe is me + error('Arguments to LISTDLG must come param/value in pairs.') +end +for i=1:2:length(varargin) + switch lower(varargin{i}) + case 'name' + figname = varargin{i+1}; + case 'promptstring' + promptstring = varargin{i+1}; + case 'selectstring' + selectstring = varargin{i+1}; + case 'selectionmode' + switch lower(varargin{i+1}) + case 'single' + smode = 1; + case 'multiple' + smode = 2; + end + case 'choosemode' + switch lower(varargin{i+1}) + case 'pick' + cmode = 1; + case 'copy' + cmode = 2; + end + case 'listsize' + listsize = varargin{i+1}; + case 'liststring' + liststring = varargin{i+1}; + case 'initialvalue' + initialvalue = varargin{i+1}; + case 'uh' + uh = varargin{i+1}; + case 'fus' + fus = varargin{i+1}; + case 'ffs' + ffs = varargin{i+1}; + case 'okstring' + okstring = varargin{i+1}; + case 'cancelstring' + cancelstring = varargin{i+1}; + otherwise + error(['Unknown parameter name passed to LISTDLG. Name was ' varargin{i}]) + end +end + +if isstr(promptstring) + promptstring = cellstr(promptstring); +end + +if isstr(selectstring) + selectstring = cellstr(selectstring); +end + +if isempty(initialvalue) + initialvalue = 1; +end + +if isempty(liststring) + error('ListString parameter is required.') +end + +ex = get(0,'defaultuicontrolfontsize')*3.0; % height extent per line of uicontrol text (approx) + % Default: 1.7 +fp = get(0,'defaultfigureposition'); +w = 4*fus +2*ffs+2*listsize(1)+50; +h = 2*ffs+7*fus+ex*length(promptstring)+listsize(2)+2*uh; +fp = [fp(1) fp(2)+fp(4)-h w h]; % keep upper left corner fixed + +fig_props = { ... + 'name' figname ... + 'resize' 'off' ... + 'numbertitle' 'off' ... + 'menubar' 'none' ... + 'windowstyle' 'modal' ... + 'visible' 'off' ... + 'createfcn' '' ... + 'position' fp ... + 'closerequestfcn' 'delete(gcbf)' ... + }; + +ad.fromstring=cellstr(liststring); +ad.tostring=''; +ad.pos_left=[1:size(ad.fromstring,2)]'; +ad.pos_right=[]; +ad.value=0; +ad.cmode=cmode; +setappdata(0,'ListDialogAppData',ad) + +fig = figure(fig_props{:}); + +uicontrol('style','frame',... + 'position',[1 1 fp([3 4])]) +uicontrol('style','frame',... + 'position',[ffs ffs 2*fus+listsize(1) 2*fus+uh]) +uicontrol('style','frame',... + 'position',[ffs+2*fus+50+listsize(1) ffs 2*fus+listsize(1) 2*fus+uh]) +uicontrol('style','frame',... + 'position',[ffs ffs+3*fus+uh 2*fus+listsize(1) ... + listsize(2)+3*fus+ex*length(promptstring)+(uh+fus)*(smode==2)]) +uicontrol('style','frame',... + 'position',[ffs+2*fus+50+listsize(1) ffs+3*fus+uh 2*fus+listsize(1) ... + listsize(2)+3*fus+ex*length(promptstring)+(uh+fus)*(smode==2)]) + +if length(promptstring)>0 + prompt_text = uicontrol('style','text','string',promptstring,... + 'horizontalalignment','left','units','pixels',... + 'position',[ffs+fus fp(4)-(ffs+fus+ex*length(promptstring)) ... + listsize(1) ex*length(promptstring)]); +end +if length(selectstring)>0 + select_text = uicontrol('style','text','string',selectstring,... + 'horizontalalignment','left','units','pixels',... + 'position',[ffs+3*fus+listsize(1)+50 fp(4)-(ffs+fus+ex*length(promptstring)) ... + listsize(1) ex*length(selectstring)]); +end + +btn_wid = listsize(1); + +leftbox = uicontrol('style','listbox',... + 'position',[ffs+fus ffs+uh+4*fus listsize(1) listsize(2)+25],... + 'string',ad.fromstring,... + 'backgroundcolor','w',... + 'max',2,... + 'tag','leftbox',... + 'value',initialvalue, ... + 'callback',{@doFromboxClick}); + +rightbox = uicontrol('style','listbox',... + 'position',[ffs+3*fus+listsize(1)+50 ffs+uh+4*fus listsize(1) listsize(2)+25],... + 'string',ad.tostring,... + 'backgroundcolor','w',... + 'max',2,... + 'tag','rightbox',... + 'value',[], ... + 'callback',{@doToboxClick}); + +ok_btn = uicontrol('style','pushbutton',... + 'string',okstring,... + 'position',[ffs+fus ffs+fus btn_wid uh],... + 'callback',{@doOK}); + +cancel_btn = uicontrol('style','pushbutton',... + 'string',cancelstring,... + 'position',[ffs+3*fus+btn_wid+50 ffs+fus btn_wid uh],... + 'callback',{@doCancel}); + +toright_btn = uicontrol('style','pushbutton',... + 'position',[ffs+2*fus+10+listsize(1) ffs+uh+4*fus+(smode==2)*(fus+uh)+listsize(2)/2-25 30 30],... + 'cdata',rarrow,... + 'callback',{@doRight}); + +toleft_btn = uicontrol('style','pushbutton',... + 'position',[ffs+2*fus+10+listsize(1) ffs+uh+4*fus+(smode==2)*(fus+uh)+listsize(2)/2+25 30 30],... + 'cdata',larrow,... + 'callback',{@doLeft}); + + +try + set(fig, 'visible','on'); + uiwait(fig); +catch + if ishandle(fig) + delete(fig) + end +end + +if isappdata(0,'ListDialogAppData') + ad = getappdata(0,'ListDialogAppData'); + selection = ad.pos_right; + value = ad.value; + rmappdata(0,'ListDialogAppData') +else + % figure was deleted + selection = []; + value = 0; +end + +function doOK(varargin) +ad=getappdata(0,'ListDialogAppData'); +ad.value = 1; +setappdata(0,'ListDialogAppData',ad) +delete(gcbf); + +function doCancel(varargin) +ad.value = 0; +ad.pos_right = []; +setappdata(0,'ListDialogAppData',ad) +delete(gcbf); + +function doFromboxClick(varargin) +% if this is a doubleclick, doOK +if strcmp(get(gcbf,'SelectionType'),'open') + doRight; +end + +function doToboxClick(varargin) +% if this is a doubleclick, doOK +if strcmp(get(gcbf,'SelectionType'),'open') + doLeft; +end + +function doRight(varargin) +ad=getappdata(0,'ListDialogAppData'); +leftbox=findobj('Tag','leftbox'); +rightbox=findobj('Tag','rightbox'); +selection=get(leftbox,'Value'); +ad.pos_right=[ad.pos_right;ad.pos_left(selection)]; +ad.tostring=[ad.tostring ad.fromstring(selection)]; +if ad.cmode==1 % remove selected items + ad.pos_left(selection)=[]; + ad.fromstring(selection)=[]; +end +setappdata(0,'ListDialogAppData',ad) +set(leftbox,'String',ad.fromstring,'Value',[]); +set(rightbox,'String',ad.tostring,'Value',[]); + +function doLeft(varargin) +ad=getappdata(0,'ListDialogAppData'); +leftbox=findobj('Tag','leftbox'); +rightbox=findobj('Tag','rightbox'); +selection=get(rightbox,'Value'); +if ad.cmode==1 % if selected items had been removed + % Sort in the items from right hand side again + for i=1:length(selection) + next_item=min(find(ad.pos_left>ad.pos_right(selection(i)))); + if isempty(next_item) % Inserting item is last one + ad.pos_left(end+1)=ad.pos_right(selection(i)); + ad.fromstring(end+1)=ad.tostring(selection(i)); + elseif next_item==ad.pos_left(1) % Inserting item is first one + ad.pos_left=[ad.pos_right(selection(i));ad.pos_left]; + ad.fromstring=[ad.tostring(selection(i)) ad.fromstring]; + else % Inserting item is anywhere in the middle + ad.pos_left=[ad.pos_left(1:next_item-1);ad.pos_right(selection(i));ad.pos_left(next_item:end)]; + ad.fromstring=[ad.fromstring(1:next_item-1) ad.tostring(selection(i)) ad.fromstring(next_item:end)]; + end + end +end +ad.pos_right(selection)=[]; +ad.tostring(selection)=[]; +setappdata(0,'ListDialogAppData',ad) +set(leftbox,'String',ad.fromstring,'Value',[]); +set(rightbox,'String',ad.tostring,'Value',[]); \ No newline at end of file diff --git a/matlab/linkage/dendrogram_alpha.m b/matlab/linkage/dendrogram_alpha.m new file mode 100644 index 0000000..94af715 --- /dev/null +++ b/matlab/linkage/dendrogram_alpha.m @@ -0,0 +1,379 @@ +function [T,v] = dendrogram_alpha(Z,varargin) +%ADDED FEATURE +true=logical(1); +false=logical(0); + +%DENDROGRAM Generate dendrogram plot. +% DENDROGRAM(Z) generates a dendrogram plot of the hierarchical +% binary cluster tree Z. Z is an (M-1)-by-3 matrix, generated +% by the LINKAGE function, where M is the number of objects in the +% original dataset. +% +% A dendrogram consists of many U-shaped lines connecting objects +% in a hierarchical tree. Except for the Ward linkage (see LINKAGE), +% the height of each U represents the distance between the two +% objects being connected. +% +% DENDROGRAM(Z,P) generates a dendrogram with only the top P nodes. +% By default, DENDROGRAM uses 30 as the value of P. When there are +% more than 30 initial nodes, a dendrogram may look crowded. To +% display every node, set P = 0. +% +% H = DENDROGRAM(...) returns a vector of line handles. +% +% [H,T] = DENDROGRAM(...) generates a dendrogram and returns T, a vector of +% size M that contains the leaf node number for each object in the original +% dataset. T is useful when P is less than the total number of objects, so +% some leaf nodes in the display correspond to multiple objects. For +% example, to find out which objects are contained in leaf node k of the +% dendrogram, use find(T==k). When there are fewer than P objects in the +% original data, all objects are displayed in the dendrogram. In this case, +% T is the identity map, i.e., T = (1:M)', where each node contains only itself. +% +% [H,T,PERM] = DENDROGRAM(...) generates a dendrogram and returns +% the permutation vector of the node labels of the leaves of the +% dendrogram. PERM is ordered from left to right on a horizontal dendrogram +% and bottom to top for a vertical dendrogram. +% +% H = DENDROGRAM(...,'COLORTHRESHOLD',T) assigns a unique color +% to each group of nodes within the dendrogram whose linkage is less than +% the scalar value T where T is in the range 0 < T < max(Z(:,3)). If T is +% less than or equal to zero or if T is greater than the maximum linkage then +% the dendrogram will be drawn using only one color. T can also be set to +% 'default' in which case the threshold will be set to 70% of the maximum +% linkage i.e. 0.7 * max(Z(:,3)). +% +% H = DENDROGRAM(...,'ORIENTATION',ORIENT) will orient the dendrogram +% within the figure window. Options are: +% +% 'top' --- top to bottom (default) +% 'bottom' --- bottom to top +% 'left' --- left to right +% 'right' --- right to left +% +% Example: +% +% rand('seed',12); +% X = rand(100,2); +% Y = pdist(X,'cityblock'); +% Z = linkage(Y,'average'); +% [H, T] = dendrogram(Z); +% +% See also LINKAGE, PDIST, CLUSTER, CLUSTERDATA, INCONSISTENT. + +% Copyright 1993-2002 The MathWorks, Inc. +% $Revision: 1.15 $ + + +m = size(Z,1)+1; +if nargin < 2 + p = 30; +end + +if nargin == 2 + p = varargin{1}; +end + +orientation = 'd'; +horz = false; +color = false; +threshold = 0.7 * max(Z(:,3)); + +if nargin > 2 + if isnumeric(varargin{1}) + p = varargin{1}; + offset = 1; + else + p = 30; + offset = 0; + end + + if rem(nargin - offset,2)== 0 + error('Incorrect number of arguments to DENDROGRAM.'); + end + okargs = strvcat('orientation','colorthreshold'); + for j=(1 + offset):2:nargin-2 + pname = varargin{j}; + pval = varargin{j+1}; + k = strmatch(lower(pname), okargs); + if isempty(k) + error(sprintf('Unknown parameter name: %s.',pname)); + elseif length(k)>1 + error(sprintf('Ambiguous parameter name: %s.',pname)); + else + switch(k) + case 1 % orientation + if ~isempty(pval) & ischar(pval) + orientation = lower(pval(1)); + end + if ~ismember(orientation,{'t','b','d','r','l'}) + orientation = 'd'; + warning('Unknown orientation specified, using ''top''.'); + end + if ismember(orientation,{'r','l'}) + horz = true; + end + case 2 % colorthreshold + color = true; + if ischar(pval) + if ~strmatch(lower(pval),'default') + warning('Unknown threshold specified, using default'); + end + end + if isnumeric(pval) + threshold = pval; + end + end + end + end +end +Z = transz(Z); % convert from m+k indexing to min(i,j) indexing. +T = (1:m)'; + +% if there are more than p node, dendrogram looks crowded, the following code +% will make the last p link nodes as the leaf node. +if (m > p) & (p ~= 0) + + Y = Z((m-p+1):end,:); + + R = Y(:,1:2); + R = unique(R(:)); + Rlp = R(R<=p); + Rgp = R(R>p); + W(Rlp) = Rlp; + W(Rgp) = setdiff(1:p, Rlp); + W = W'; + T(R) = W(R); + + % computer all the leaf that each node (in the last 30 row) has + for i = 1:p + c = R(i); + T = clusternum(Z,T,W(c),c,m-p+1,0); % assign to it's leaves. + end + + + Y(:,1) = W(Y(:,1)); + Y(:,2) = W(Y(:,2)); + Z = Y; + + m = p; % reset the number of node to be 30 (row number = 29). +end + +A = zeros(4,m-1); +B = A; +n = m; +X = 1:n; +Y = zeros(n,1); +r = Y; + +% arrange Z into W so that there will be no crossing in the dendrogram. +W = zeros(size(Z)); +W(1,:) = Z(1,:); + +nsw = zeros(n,1); rsw = nsw; +nsw(Z(1,1:2)) = 1; rsw(1) = 1; +k = 2; s = 2; + +while (k < n) + i = s; + while rsw(i) | ~any(nsw(Z(i,1:2))) + if rsw(i) & i == s, s = s+1; end + i = i+1; + end + + W(k,:) = Z(i,:); + nsw(Z(i,1:2)) = 1; + rsw(i) = 1; + if s == i, s = s+1; end + k = k+1; +end + +g = 1; +for k = 1:m-1 % initialize X + i = W(k,1); + if ~r(i), + X(i) = g; + g = g+1; + r(i) = 1; + end + i = W(k,2); + if ~r(i), + X(i) = g; + g = g+1; + r(i) = 1; + end +end +[u,v]=sort(X); +% v is the third output value (PERM) +label = num2str(v'); + +% set up the color + +numColors = 1;theGroups = 1; +groups = 0; +cmap = [0 0 1]; + +if color + groups = sum(Z(:,3)< threshold); + if groups > 1 & groups < (m-1) + theGroups = zeros(m-1,1); + numColors = 0; + for count = groups:-1:1 + if (theGroups(count) == 0) + P = zeros(m-1,1); + P(count) = 1; + P = colorcluster(Z,P,Z(count,1),count); + P = colorcluster(Z,P,Z(count,2),count); + numColors = numColors + 1; + theGroups(logical(P)) = numColors; + end + end + cmap = hsv(numColors); + cmap(end+1,:) = [0 0 0]; + else + groups = 1; + end + +end + +% ---------------------------------------------------------- +% if isempty(get(0,'CurrentFigure')) | ishold +% figure; +% set(gcf,'Position', [50, 50, 800, 500]); +% else +% newplot; +% end +% +% col = zeros(m-1,3); +% h = zeros(m-1,1); +% +% for n = 1:(m-1) +% i = Z(n,1); j = Z(n,2); w = Z(n,3); +% A(:,n) = [X(i) X(i) X(j) X(j)]'; +% B(:,n) = [Y(i) w w Y(j)]'; +% X(i) = (X(i)+X(j))/2; Y(i) = w; +% if n <= groups +% col(n,:) = cmap(theGroups(n),:); +% else +% col(n,:) = cmap(end,:); +% end +% end +% +% +% ymin = min(Z(:,3)); +% ymax = max(Z(:,3)); +% margin = (ymax - ymin) * 0.05; +% n = size(label,1); +% +% if(~horz) +% for count = 1:(m-1) +% h(count) = line(A(:,count),B(:,count),'color',col(count,:)); +% end +% lims = [0 m+1 max(0,ymin-margin) (ymax+margin)]; +% set(gca, 'Xlim', [.5 ,(n +.5)], 'XTick', 1:n, 'XTickLabel', label, ... +% 'Box', 'off'); +% mask = logical([0 0 1 1]); +% if strcmp(orientation,'b') +% set(gca,'XAxisLocation','top','Ydir','reverse'); +% end +% else +% for count = 1:(m-1) +% h(count) = line(B(:,count),A(:,count),'color',col(count,:)); +% end +% lims = [max(0,ymin-margin) (ymax+margin) 0 m+1 ]; +% set(gca, 'Ylim', [.5 ,(n +.5)], 'YTick', 1:n, 'YTickLabel', label, ... +% 'Box', 'off'); +% mask = logical([1 1 0 0]); +% if strcmp(orientation, 'l') +% set(gca,'YAxisLocation','right','Xdir','reverse'); +% end +% end +% +% if margin==0 +% if ymax~=0 +% lims(mask) = ymax * [0 1.25]; +% else +% lims(mask) = [0 1]; +% end +% end +% axis(lims); + +% ----------------------------------------------------------- +function T = clusternum(X, T, c, k, m, d) +% assign leaves under cluster c to c. + +d = d+1; +n = m; flag = 0; +while n > 1 + n = n-1; + if X(n,1) == k % node k is not a leave, it has subtrees + T = clusternum(X, T, c, k, n,d); % trace back left subtree + T = clusternum(X, T, c, X(n,2), n,d); + flag = 1; break; + end +end + +n = size(X,1); +if flag == 0 & d ~= 1 % row m is leaf node. + T(X(m,1)) = c; + T(X(m,2)) = c; +end +% --------------------------------------- +function T = colorcluster(X, T, k, m) +% find local clustering + +n = m; +while n > 1 + n = n-1; + if X(n,1) == k % node k is not a leave, it has subtrees + T = colorcluster(X, T, k, n); % trace back left subtree + T = colorcluster(X, T, X(n,2), n); + break; + end +end +T(m) = 1; +% --------------------------------------- +function Z = transz(Z) +%TRANSZ Translate output of LINKAGE into another format. +% This is a helper function used by DENDROGRAM and COPHENET. + +% In LINKAGE, when a new cluster is formed from cluster i & j, it is +% easier for the latter computation to name the newly formed cluster +% min(i,j). However, this definition makes it hard to understand +% the linkage information. We choose to give the newly formed +% cluster a cluster index M+k, where M is the number of original +% observation, and k means that this new cluster is the kth cluster +% to be formmed. This helper function converts the M+k indexing into +% min(i,j) indexing. + +m = size(Z,1)+1; + +for i = 1:(m-1) + if Z(i,1) > m + Z(i,1) = traceback(Z,Z(i,1)); + end + if Z(i,2) > m + Z(i,2) = traceback(Z,Z(i,2)); + end + if Z(i,1) > Z(i,2) + Z(i,1:2) = Z(i,[2 1]); + end +end + + +function a = traceback(Z,b) + +m = size(Z,1)+1; + +if Z(b-m,1) > m + a = traceback(Z,Z(b-m,1)); +else + a = Z(b-m,1); +end +if Z(b-m,2) > m + c = traceback(Z,Z(b-m,2)); +else + c = Z(b-m,2); +end + +a = min(a,c); diff --git a/matlab/linkage/encodealn.m b/matlab/linkage/encodealn.m new file mode 100644 index 0000000..6d4a1be --- /dev/null +++ b/matlab/linkage/encodealn.m @@ -0,0 +1,31 @@ +function [aln2] = encodealn(aln) +%ENCODEALN - Convert nucleotide in alignment to integer. +% +% Syntax: [aln2] = encodealn(aln) +% +% Inputs: +% aln - Alignment structure letter representation +% +% Outputs: +% aln2 - Alignment structure integer representation +% +% +% See also: CODONISESEQ, ENCODESEQ + +% Molecular Biology & Evolution Toolbox, (C) 2005 +% Author: James J. Cai +% Email: jamescai@hkusua.hku.hk +% Website: http://web.hku.hk/~jamescai/ +% Last revision: 5/28/2005 + +if ~(aln.seqtype) error('Do not know the type of sequence! ... BAPS'); end +aln2=aln; + +switch (aln2.seqtype) + case (1) + aln2.seq = i_encode_n(aln2.seq); + case (2) + aln2.seq = i_encode_n(aln2.seq); + case (3) + aln2.seq = i_encode_a(aln2.seq); +end \ No newline at end of file diff --git a/matlab/linkage/i_encode_n.m b/matlab/linkage/i_encode_n.m new file mode 100644 index 0000000..cd3e619 --- /dev/null +++ b/matlab/linkage/i_encode_n.m @@ -0,0 +1,61 @@ +function S=i_encode_n(Seq) +%I_ENCODE_N - Convert a nucleotide sequence from a letter to an integer representation +%Internal function encodes the nucleotide sequences by digits +% +% Syntax: S=i_encode_n(Seq) +% +% Inputs: +% Seq - Letter representation of sequence +% +% Outputs: +% S - Integer representation of sequence +% +% +% See also: I_ENCODE_A + +% Molecular Biology & Evolution Toolbox, (C) 2005 +% Author: James J. Cai +% Email: jamescai@hkusua.hku.hk +% Website: http://web.hku.hk/~jamescai/ +% Last revision: 5/28/2005 +% seqcode = 'ACGTDI?-' +method=1; +Seq=upper(Seq); % Read lower case letters +[NT,AA] = seqcode; % NT = 'ACGTDI?-' +switch (method) + case (1) + [n,m]=size(Seq); + S = ones(n,m).*8; + Seq(find(Seq=='U'))='T'; %replace U with T + for (k=1:8), + S(find(Seq==NT(k)))=k; + end + S(find(Seq==NT(7))) = -999; % missing data denoted as -999. + S(find(Seq==NT(8))) = -999; % gap + case (2) + [n,m]=size(Seq); + S = zeros(n,m); i = 1; j = 1; + + code = zeros(256,1); + for o = 1:256 + code(o) = nan; + end + + % NT = 'ACGT-'; + + for o = 1:5 + code(abs(NT(o))) = o; + end + + for i=1:n + for j=1:m + if Seq(i,j) == 'U' + S(i,j) = code(abs('T')); + else + S(i,j) = code(abs(Seq(i,j))); + end; + end; + end; +end + +% S=uint8(S); \ No newline at end of file diff --git a/matlab/linkage/linkageMix.m b/matlab/linkage/linkageMix.m new file mode 100644 index 0000000..f552d48 --- /dev/null +++ b/matlab/linkage/linkageMix.m @@ -0,0 +1,1277 @@ +function [logml, npops, partitionSummary] = linkageMix(c,npopsTable) +% Greedy search algorithm with unknown number of classes for linkage +% clustering. + +global POP_LOGML; global PARTITION; +global CQ_COUNTS; global SP_COUNTS; %These counts are for populations +global CQ_SUMCOUNTS; global SP_SUMCOUNTS; %not for individuals +global LOGDIFF; +clearGlobalVars; + +noalle = c.noalle; +adjprior = c.adjprior; %priorTerm = c.priorTerm; +rowsFromInd = c.rowsFromInd; +counts_cq = c.counts_cq; adjprior_cq = c.adjprior_cq; +counts_sp = c.counts_sp; adjprior_sp = c.adjprior_sp; + +if isfield(c,'dist') + dist = c.dist; Z = c.Z; +end + +clear c; + +ninds = size(counts_cq,3); + +if nargin < 2 + + npopstext = []; + ready = false; + teksti = 'Input upper bound to the number of populations (possibly multiple values): '; + while ready == false + + if ninds>20 + default = 20; + else + default = floor(ninds/2); + end + + npopstextExtra = inputdlg(teksti ,... + 'Input maximum number of populations',1,{num2str(default)}); + % ------------------------------------------------------------------- + drawnow + if isempty(npopstextExtra) % cancel has been pressed + dispCancel + return + end + npopstextExtra = npopstextExtra{1}; + if length(npopstextExtra)>=255 + npopstextExtra = npopstextExtra(1:255); + npopstext = [npopstext ' ' npopstextExtra]; + teksti = 'The input field length limit (255 characters) was reached. Input more values: '; + else + % ----------------------------------------------------- + % Set the limit of the input value. + % Modified by Jing Tang, 30.12.2005 + if str2num(npopstextExtra) > ninds + teksti = 'Values larger than the sample size are not accepted. Input new value: '; + else + npopstext = [npopstext ' ' npopstextExtra]; + ready = true; + end + end + end + clear ready; clear teksti; + if isempty(npopstext) || length(npopstext)==1 + return + else + npopsTable = str2num(npopstext); + % ykkoset = find(npopsTable==1); + npopsTable(logical(npopsTable==1)) = []; + if isempty(npopsTable) + logml = 1; partitionSummary=1; npops=1; + return + end + % clear ykkoset; + end +end + +nruns = length(npopsTable); + +logmlBest = -1e50; +partitionSummary = -1e50*ones(100,2); % 100 best partitions (npops and logml) +partitionSummary(:,1) = zeros(100,1); +worstLogml = -1e50; worstIndex = 1; + +for run = 1:nruns + npops = npopsTable(run); + dispLine; + disp(['Run ' num2str(run) '/' num2str(nruns) ... + ', maximum number of populations ' num2str(npops) '.']); + + initialPartition = admixture_initialization(npops, Z); + PARTITION = initialPartition; + [cq_counts, cq_sumcounts] = initialCounts(counts_cq); + % clear counts_cq; + CQ_COUNTS = cq_counts; clear cq_counts; + CQ_SUMCOUNTS = cq_sumcounts; clear cq_sumcounts; + [sp_counts, sp_sumcounts] = initialCounts(counts_sp); + % clear counts_sp; + SP_COUNTS = sp_counts; clear sp_counts; + SP_SUMCOUNTS = sp_sumcounts; clear sp_sumcounts; + + logml = computeLogml(adjprior_cq, adjprior_sp); + POP_LOGML = computePopulationLogml(1:npops,adjprior_cq, adjprior_sp); + + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) + [worstLogml, worstIndex] = min(partitionSummary(:,2)); + end + end + + %%%% + % Finding the best partition with the greedy search algorithm + %%%% + nRoundTypes = 7; + tested = zeros(nRoundTypes,1); + roundTypes = [1 1]; + ready = 0; phase = 1; + ninds = length(PARTITION); % number of individuals + LOGDIFF = repmat(-Inf,ninds,npops); + + disp(' '); + disp(['Mixture analysis started with initial ' num2str(npops) ' populations.']); + + while ready ~= 1 + changesMade = 0; + + disp(['Performing steps: ' num2str(roundTypes)]); + + for n = 1:length(roundTypes) + round = roundTypes(n); + % pack; + if tested(round) == 1 + + elseif round==1 % Moving one individual to another population + inds = randperm(ninds); % random order + changesMadeNow = 0; + for ind = inds + i1 = PARTITION(ind); + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + changesInLogml = computeChanges(ind, adjprior_cq, ... + adjprior_sp, indCqCounts, indSpCounts); + + [maxChange, i2] = max(changesInLogml); + + + if (i1~=i2 && maxChange>1e-5) + % Individual is moved + changesMade = 1; + if changesMadeNow == 0 + disp('action 1'); + changesMadeNow = 1; + tested = zeros(nRoundTypes,1); + end + updateGlobalVariables(ind, i2, indCqCounts, ... + indSpCounts, adjprior_cq, adjprior_sp); + logml = logml+maxChange; + + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) + [worstLogml, worstIndex] = min(partitionSummary(:,2)); + end + end + end + end + + if changesMadeNow == 0 + tested(round) = 1; + end + + elseif round==2 % Combining two populationgs + maxChange = 0; + for pop = 1:npops + changesInLogml = computeChanges2(pop, adjprior_cq, adjprior_sp); + [biggest, index] = max(changesInLogml); + if biggest>maxChange + maxChange = biggest; + i1 = pop; + i2 = index; + end + end + + if maxChange>1e-5 + disp('action 2'); + changesMade = 1; + tested = zeros(nRoundTypes,1); + updateGlobalVariables2(i1, i2, adjprior_cq, adjprior_sp); + logml = logml + maxChange; + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) + [worstLogml, worstIndex] = min(partitionSummary(:,2)); + end + end + else + tested(round) = 1; + end + + + elseif round==3 || round==4 % Splitting population into smaller groups + maxChange = 0; + + for pop = 1:npops + inds2 = find(PARTITION==pop); + ninds2 = length(inds2); + if ninds2>5 + % Computing the distance between individuals inds2 + dist2 = laskeOsaDist(inds2, dist, ninds); + Z2 = computeLinkage(dist2'); + + + % Number of groups: + if round==3 + npops2 = min(20, floor(ninds2 / 5)); + elseif round==4 + npops2 = 2; + end + T2 = cluster_own(Z2, npops2); + + changesInLogml = computeChanges3(T2, inds2, pop, ... + counts_cq, counts_sp, adjprior_cq, adjprior_sp); + [biggest, index] = max(changesInLogml(1:end)); + if biggest > maxChange + maxChange = biggest; + movingGroup = rem(index,npops2); % The group, which is moved + if movingGroup==0, movingGroup = npops2; end + movingInds = inds2(logical(T2==movingGroup)); + i2 = ceil(index/npops2); % pop where movingGroup would be moved + end + end + end + if maxChange>1e-5 + changesMade = 1; + tested = zeros(nRoundTypes,1); + if round==3 + disp('action 3'); + else + disp('action 4'); + end + indCqCounts = uint16(sum(counts_cq(:,:,movingInds),3)); + indSpCounts = uint16(sum(counts_sp(:,:,movingInds),3)); + updateGlobalVariables3(movingInds, i2,indCqCounts, ... + indSpCounts, adjprior_cq, adjprior_sp); + logml = logml + maxChange; + + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) + [worstLogml, worstIndex] = min(partitionSummary(:,2)); + end + end + else + tested(round) = 1; + end + + elseif round == 5 || round == 6 + %Moving individuals out of population until positive change + %in logml has occured + pop=0; + changesMadeNow = 0; + %Saving old values + poplogml = POP_LOGML; + partition = PARTITION; + cq_counts = CQ_COUNTS; + sp_counts = SP_COUNTS; + cq_sumcounts = CQ_SUMCOUNTS; + sp_sumcounts = SP_SUMCOUNTS; + logdiff = LOGDIFF; + + while (pop < npops && changesMadeNow == 0) + pop = pop+1; + totalChangeInLogml = 0; + inds = find(PARTITION==pop); + if round == 5 + %Random order + aputaulu = [inds rand(length(inds),1)]; + aputaulu = sortrows(aputaulu,2); + inds = aputaulu(:,1)'; + elseif round == 6 + inds = returnInOrder(inds, pop, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp); + end + + i=0; + + while (length(inds)>0 && i 1e-5 + i=length(inds); + end + end + end + + if totalChangeInLogml>1e-5 + if round == 5 + disp('action 5'); + elseif round == 6 + disp('action 6'); + end + tested = zeros(nRoundTypes,1); + changesMadeNow=1; + changesMade = 1; + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) + [worstLogml, worstIndex] = min(partitionSummary(:,2)); + end + end + else + % No better partition was found, restoring the old + % values + PARTITION = partition; + POP_LOGML = poplogml; + CQ_COUNTS = cq_counts; + SP_COUNTS = sp_counts; + CQ_SUMCOUNTS = cq_sumcounts; + SP_SUMCOUNTS = sp_sumcounts; + LOGDIFF = logdiff; + logml = logml - totalChangeInLogml; + end + end + clear partition; clear poplogml; + if changesMadeNow == 0 + tested(round) = 1; + end + + elseif round == 7 + emptyPop = findEmptyPop(npops); + j = 0; + pops = randperm(npops); + % totalChangeInLogml = 0; + if emptyPop == -1 + j = npops; + end + changesMadeNow = 0; + while (j < npops) + j = j +1; + pop = pops(j); + inds2 = find(PARTITION == pop); + ninds2 = length(inds2); + if ninds2 > 5 + partition = PARTITION; + cq_sumcounts = CQ_SUMCOUNTS; + cq_counts = CQ_COUNTS; + sp_sumcounts = SP_SUMCOUNTS; + sp_counts = SP_COUNTS; + poplogml = POP_LOGML; + logdiff = LOGDIFF; + % pack; + + dist2 = laskeOsaDist(inds2, dist, ninds); + Z2 = computeLinkage(dist2'); + T2 = cluster_own(Z2, 2); + % movingInds = inds2(find(T2 == 1)); + movingInds = inds2(logical(T2 == 1)); + changesInLogml = computeChanges3(T2, inds2, pop, ... + counts_cq, counts_sp, adjprior_cq, adjprior_sp); + totalChangeInLogml = changesInLogml(1, emptyPop); + + indCqCounts = uint16(sum(counts_cq(:,:,movingInds),3)); + indSpCounts = uint16(sum(counts_sp(:,:,movingInds),3)); + updateGlobalVariables3(movingInds, emptyPop,indCqCounts, ... + indSpCounts, adjprior_cq, adjprior_sp); + + changed = 1; + + while (changed == 1) + changed = 0; + + changesInLogml = computeChanges5(inds2, pop, emptyPop, ... + counts_cq, counts_sp, adjprior_cq, adjprior_sp); + [maxChange, index] = max(changesInLogml); + moving = inds2(index); + if (PARTITION(moving) == pop) + i2 = emptyPop; + else + i2 = pop; + end + + if maxChange > 1e-5 + indCqCounts = uint16(counts_cq(:,:,moving)); + indSpCounts = uint16(counts_sp(:,:,moving)); + updateGlobalVariables3(moving, i2,indCqCounts, ... + indSpCounts, adjprior_cq, adjprior_sp); + changed = 1; + totalChangeInLogml = totalChangeInLogml + maxChange; + end + end + + if totalChangeInLogml > 1e-5 + changesMade = 1; + changesMadeNow = 1; + logml = logml + totalChangeInLogml; + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) + [worstLogml, worstIndex] = min(partitionSummary(:,2)); + end + end + disp('action 7'); + tested = zeros(nRoundTypes, 1); + j = npops; + else + % No better partition was found, restoring the old + % values + PARTITION = partition; + POP_LOGML = poplogml; + CQ_COUNTS = cq_counts; + SP_COUNTS = sp_counts; + CQ_SUMCOUNTS = cq_sumcounts; + SP_SUMCOUNTS = sp_sumcounts; + LOGDIFF = logdiff; + %logml = logml - totalChangeInLogml; + end + end + end + if changesMadeNow == 0 + tested(round) = 1; + end + end + + end + + + if changesMade == 0 + if phase==1 + phase = 2; + elseif phase==2 + phase = 3; + elseif phase==3 + phase = 4; + elseif phase==4; + phase = 5; + elseif phase==5 + ready = 1; + end + else + changesMade = 0; + end + + if ready==0 + if phase==1 + roundTypes=[1]; + elseif phase==2 + roundTypes=[2]; + elseif phase==3 + roundTypes=[5 5 7]; + elseif phase==4 + roundTypes=[4 3 1 1]; + elseif phase==5 + roundTypes=[6 2 7 3 4 1]; + end + end + + end + % Saving results + + npops = removeEmptyPops; + POP_LOGML = computePopulationLogml(1:npops, adjprior_cq, adjprior_sp); + + disp(['Found partition with ' num2str(npops) ' populations.']); + disp(['Log(ml) = ' num2str(logml)]); + disp(' '); + + if logml>logmlBest + % Updating the best found partition + logmlBest = logml; + npopsBest = npops; + partitionBest = PARTITION; + cq_countsBest = CQ_COUNTS; + sp_countsBest = SP_COUNTS; + cq_sumcountsBest = CQ_SUMCOUNTS; + sp_sumcountsBest = SP_SUMCOUNTS; + pop_logmlBest = POP_LOGML; + logdiffbest = LOGDIFF; + end +end + +logml = logmlBest; +npops = npopsBest; +PARTITION = partitionBest; +CQ_COUNTS = cq_countsBest; +SP_COUNTS = sp_countsBest; +CQ_SUMCOUNTS = cq_sumcountsBest; +SP_SUMCOUNTS = sp_sumcountsBest; +POP_LOGML = pop_logmlBest; +LOGDIFF = logdiffbest; + +%-------------------------------------------------------------------------- +% The next three functions are for computing the initial partition +% according to the distance between the individuals + +function initial_partition=admixture_initialization(nclusters,Z) +T=cluster_own(Z,nclusters); +initial_partition=T; + +%-------------------------------------------------------------------------- +function T = cluster_own(Z,nclust) +% true=logical(1); +% false=logical(0); + +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); +% maximum number of clusters based on inconsistency +if m <= maxclust + T = (1:m)'; +elseif maxclust==1 + T = ones(m,1); +else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end +end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + +%-------------------------------------------------------------------------- + +function dist2 = laskeOsaDist(inds2, dist, ninds) +% Muodostaa dist vektorista osavektorin, joka sisältää yksilöiden inds2 +% väliset etäisyydet. ninds=kaikkien yksilöiden lukumäärä. + +ninds2 = length(inds2); +apu = zeros(nchoosek(ninds2,2),2); +rivi = 1; +for i=1:ninds2-1 + for j=i+1:ninds2 + apu(rivi, 1) = inds2(i); + apu(rivi, 2) = inds2(j); + rivi = rivi+1; + end +end +apu = (apu(:,1)-1).*ninds - apu(:,1) ./ 2 .* (apu(:,1)-1) + (apu(:,2)-apu(:,1)); +dist2 = dist(apu); + +%-------------------------------------------------------------------------- + +function Z = computeLinkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 || m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +% monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end + +%-------------------------------------------------------------------------- + +function changes = computeChanges(ind, adjprior_cq, adjprior_sp, ... + indCqCounts, indSpCounts) +% Computes changes in log-marginal likelihood if individual ind is +% moved to another population +% +% Input: +% ind - the individual to be moved +% adjprior_cq & _sp - adjpriors for cliques and separators +% indCqCounts, indSpCounts - counts for individual ind +% +% Output: +% changes - table of size 1*npops. changes(i) = difference in logml if +% ind is move to population i. + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; +global LOGDIFF; + +npops = size(CQ_COUNTS,3); +changes = LOGDIFF(ind,:); + +i1 = PARTITION(ind); +i1_logml = POP_LOGML(i1); +changes(i1) = 0; + +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + +new_i1_logml = computePopulationLogml(i1, adjprior_cq, adjprior_sp); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)+indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)+sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)+indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)+sumSp; + +i2 = find(changes==-Inf); +i2 = setdiff(i2,i1); +i2_logml = POP_LOGML(i2); + +ni2 = length(i2); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+repmat(indCqCounts, [1 1 ni2]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+repmat(sumCq,[ni2 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+repmat(indSpCounts, [1 1 ni2]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:) + repmat(sumSp,[ni2 1]); + +new_i2_logml = computePopulationLogml(i2, adjprior_cq, adjprior_sp); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)-repmat(indCqCounts, [1 1 ni2]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)-repmat(sumCq,[ni2 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)-repmat(indSpCounts, [1 1 ni2]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:) - repmat(sumSp,[ni2 1]); +% a = repmat(sumSp,[npops-1 1]); + +changes(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; +LOGDIFF(ind,:) = changes; + +%------------------------------------------------------------------------------------ + +function changes = computeChanges2(i1, adjprior_cq, adjprior_sp) +% Computes changes in log marginal likelihood if population i1 is combined +% with another population +% +% Input: +% i1 - the population to be combined +% adjprior_cq & _sp - adjpriors for cliques and separators +% +% Output: +% changes - table of size 1*npops. changes(i) = difference in logml if +% i1 is combined with population i. + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global POP_LOGML; +npops = size(CQ_COUNTS,3); +changes = zeros(npops,1); + +i1_logml = POP_LOGML(i1); +indCqCounts = CQ_COUNTS(:,:,i1); +indSpCounts = SP_COUNTS(:,:,i1); +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +new_i1_logml = 0; + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = POP_LOGML(i2); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+repmat(indCqCounts, [1 1 npops-1]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+repmat(sumCq,[npops-1 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+repmat(indSpCounts, [1 1 npops-1]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+ repmat(sumSp,[npops-1 1]); +% a = repmat(sumSp,[npops-1 1]); +% if ~any(sumSp) +% a(:,[1:size(a,2)])=[]; +% end +% SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+ a ; + + +new_i2_logml = computePopulationLogml(i2, adjprior_cq, adjprior_sp); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)-repmat(indCqCounts, [1 1 npops-1]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)-repmat(sumCq,[npops-1 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)-repmat(indSpCounts, [1 1 npops-1]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)- repmat(sumSp,[npops-1 1]); + +changes(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + + + + +%------------------------------------------------------------------------------------ + + +function changes = computeChanges3(T2, inds2, i1, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp) +% Computes changes in log marginal likelihood if subpopulation of i2 is +% moved to another population +% +% Input: +% T2 - partition of inds2 to subpopulations +% inds2 - individuals in population i1 +% i2 +% counts_cq, counts_sp - counts for individuals +% +% Output: +% changes - table of size length(unique(T2))*npops. +% changes(i,j) = difference in logml if subpopulation inds2(find(T2==i)) of +% i2 is moved to population j + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global POP_LOGML; +npops = size(CQ_COUNTS,3); +npops2 = length(unique(T2)); +changes = zeros(npops2,npops); + +%cq_counts = CQ_COUNTS; +%sp_counts = SP_COUNTS; +%cq_sumcounts = CQ_SUMCOUNTS; +%sp_sumcounts = SP_SUMCOUNTS; + + +i1_logml = POP_LOGML(i1); + +for pop2 = 1:npops2 + % inds = inds2(find(T2==pop2)); + inds = inds2(logical(T2==pop2)); + ninds = length(inds); + if ninds>0 + indCqCounts = uint16(sum(counts_cq(:,:,inds),3)); + indSpCounts = uint16(sum(counts_sp(:,:,inds),3)); + sumCq = uint16(sum(indCqCounts,1)); + sumSp = uint16(sum(indSpCounts,1)); + + CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; + CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; + SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; + SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + + new_i1_logml = computePopulationLogml(i1, adjprior_cq, adjprior_sp); + + CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)+indCqCounts; + CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)+sumCq; + SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)+indSpCounts; + SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)+sumSp; + + i2 = [1:i1-1 , i1+1:npops]; + i2_logml = POP_LOGML(i2)'; + + CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+repmat(indCqCounts, [1 1 npops-1]); + CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+repmat(sumCq,[npops-1 1]); + SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+repmat(indSpCounts, [1 1 npops-1]); + SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+ repmat(sumSp,[npops-1 1]); + + new_i2_logml = computePopulationLogml(i2, adjprior_cq, adjprior_sp)'; + + CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)-repmat(indCqCounts, [1 1 npops-1]); + CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)-repmat(sumCq,[npops-1 1]); + SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)-repmat(indSpCounts, [1 1 npops-1]); + SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)- repmat(sumSp,[npops-1 1]); + + changes(pop2,i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + end +end + +%-------------------------------------------------------------------------- + +function changes = computeChanges5(inds, i1, i2, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp) +% Computes change in logml if individual of inds is moved between +% populations i1 and i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global POP_LOGML; global PARTITION; + +ninds = length(inds); +changes = zeros(ninds,1); + +i1_logml = POP_LOGML(i1); +i2_logml = POP_LOGML(i2); + +for i = 1:ninds + ind = inds(i); + if PARTITION(ind)==i1 + pop1 = i1; %from + pop2 = i2; %to + else + pop1 = i2; + pop2 = i1; + end + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + sumCq = uint16(sum(indCqCounts,1)); + sumSp = uint16(sum(indSpCounts,1)); + + CQ_COUNTS(:,:,pop1) = CQ_COUNTS(:,:,pop1)-indCqCounts; + CQ_SUMCOUNTS(pop1,:) = CQ_SUMCOUNTS(pop1,:)-sumCq; + SP_COUNTS(:,:,pop1) = SP_COUNTS(:,:,pop1)-indSpCounts; + SP_SUMCOUNTS(pop1,:) = SP_SUMCOUNTS(pop1,:) - sumSp; + + CQ_COUNTS(:,:,pop2) = CQ_COUNTS(:,:,pop2)+indCqCounts; + CQ_SUMCOUNTS(pop2,:) = CQ_SUMCOUNTS(pop2,:)+sumCq; + SP_COUNTS(:,:,pop2) = SP_COUNTS(:,:,pop2)+indSpCounts; + SP_SUMCOUNTS(pop2,:) = SP_SUMCOUNTS(pop2,:) + sumSp; + + new_logmls = computePopulationLogml([i1 i2], adjprior_cq, adjprior_sp); + changes(i) = sum(new_logmls); + + CQ_COUNTS(:,:,pop1) = CQ_COUNTS(:,:,pop1)+indCqCounts; + CQ_SUMCOUNTS(pop1,:) = CQ_SUMCOUNTS(pop1,:)+sumCq; + SP_COUNTS(:,:,pop1) = SP_COUNTS(:,:,pop1)+indSpCounts; + SP_SUMCOUNTS(pop1,:) = SP_SUMCOUNTS(pop1,:)+sumSp; + CQ_COUNTS(:,:,pop2) = CQ_COUNTS(:,:,pop2)-indCqCounts; + CQ_SUMCOUNTS(pop2,:) = CQ_SUMCOUNTS(pop2,:)-sumCq; + SP_COUNTS(:,:,pop2) = SP_COUNTS(:,:,pop2)-indSpCounts; + SP_SUMCOUNTS(pop2,:) = SP_SUMCOUNTS(pop2,:)-sumSp; +end + +changes = changes - i1_logml - i2_logml; + + +%------------------------------------------------------------------------------------- + + +function updateGlobalVariables(ind, i2, indCqCounts, indSpCounts, ... + adjprior_cq, adjprior_sp) +% Updates global variables when individual ind is moved to population i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; +global LOGDIFF; + +i1 = PARTITION(ind); +PARTITION(ind)=i2; + +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+indCqCounts; +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+sumCq; +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+indSpCounts; +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+sumSp; + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior_cq, adjprior_sp); + +LOGDIFF(:,[i1 i2]) = -Inf; +inx = [find(PARTITION==i1); find(PARTITION==i2)]; +LOGDIFF(inx,:) = -Inf; + + +%--------------------------------------------------------------------------------- + + +function updateGlobalVariables2(i1, i2, adjprior_cq, adjprior_sp) +% Updates global variables when all individuals from population i1 are moved +% to population i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; +global LOGDIFF; + +% inds = find(PARTITION==i1); +% PARTITION(inds) = i2; +PARTITION(logical(PARTITION==i1)) = i2; + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+CQ_COUNTS(:,:,i1); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+CQ_SUMCOUNTS(i1,:); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+SP_COUNTS(:,:,i1); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+SP_SUMCOUNTS(i1,:); + +CQ_COUNTS(:,:,i1) = 0; +CQ_SUMCOUNTS(i1,:) = 0; +SP_COUNTS(:,:,i1) = 0; +SP_SUMCOUNTS(i1,:) = 0; + +POP_LOGML(i1) = 0; +POP_LOGML(i2) = computePopulationLogml(i2, adjprior_cq, adjprior_sp); + +LOGDIFF(:,[i1 i2]) = -Inf; +inx = [find(PARTITION==i1); find(PARTITION==i2)]; +LOGDIFF(inx,:) = -Inf; + +%------------------------------------------------------------------------------------ + + +function updateGlobalVariables3(muuttuvat, i2, indCqCounts, indSpCounts, ... + adjprior_cq, adjprior_sp) +% Updates global variables when individuals muuttuvat are moved to +% population i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; +global LOGDIFF; + +i1 = PARTITION(muuttuvat(1)); +PARTITION(muuttuvat) = i2; + +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+indCqCounts; +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+sumCq; +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+indSpCounts; +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+sumSp; + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior_cq, adjprior_sp); + +LOGDIFF(:,[i1 i2]) = -Inf; +inx = [find(PARTITION==i1); find(PARTITION==i2)]; +LOGDIFF(inx,:) = -Inf; + +%---------------------------------------------------------------------- + + +function inds = returnInOrder(inds, pop, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp) +% Returns individuals inds in order according to the change in the logml if +% they are moved out of the population pop + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; + +ninds = length(inds); +apuTaulu = [inds, zeros(ninds,1)]; + +for i=1:ninds + ind = inds(i); + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + sumCq = uint16(sum(indCqCounts,1)); + sumSp = uint16(sum(indSpCounts,1)); + + CQ_COUNTS(:,:,pop) = CQ_COUNTS(:,:,pop)-indCqCounts; + CQ_SUMCOUNTS(pop,:) = CQ_SUMCOUNTS(pop,:)-sumCq; + SP_COUNTS(:,:,pop) = SP_COUNTS(:,:,pop)-indSpCounts; + SP_SUMCOUNTS(pop,:) = SP_SUMCOUNTS(pop,:)-sumSp; + + apuTaulu(i, 2) = computePopulationLogml(pop, adjprior_cq, adjprior_sp); + + CQ_COUNTS(:,:,pop) = CQ_COUNTS(:,:,pop)+indCqCounts; + CQ_SUMCOUNTS(pop,:) = CQ_SUMCOUNTS(pop,:)+sumCq; + SP_COUNTS(:,:,pop) = SP_COUNTS(:,:,pop)+indSpCounts; + SP_SUMCOUNTS(pop,:) = SP_SUMCOUNTS(pop,:)+sumSp; +end +apuTaulu = sortrows(apuTaulu,2); +inds = apuTaulu(ninds:-1:1,1); + + +%-------------------------------------------------------------------------- + +function clearGlobalVars + +global CQ_COUNTS; CQ_COUNTS = []; +global CQ_SUMCOUNTS; CQ_SUMCOUNTS = []; +global SP_COUNTS; SP_COUNTS = []; +global SP_SUMCOUNTS; SP_SUMCOUNTS = []; +global PARTITION; PARTITION = []; +global POP_LOGML; POP_LOGML = []; +global LOGDIFF; LOGDIFF = []; + +%-------------------------------------------------------------------------- + +function npops = removeEmptyPops +% Removes empty pops from all global COUNTS variables. +% Updates PARTITION and npops + +global CQ_COUNTS; +global CQ_SUMCOUNTS; +global SP_COUNTS; +global SP_SUMCOUNTS; +global PARTITION; +global LOGDIFF; + +notEmpty = find(any(CQ_SUMCOUNTS,2)); +CQ_COUNTS = CQ_COUNTS(:,:,notEmpty); +CQ_SUMCOUNTS = CQ_SUMCOUNTS(notEmpty,:); +SP_COUNTS = SP_COUNTS(:,:,notEmpty); +SP_SUMCOUNTS = SP_SUMCOUNTS(notEmpty,:); +LOGDIFF = LOGDIFF(:,notEmpty); + +for n=1:length(notEmpty) +% apu = find(PARTITION==notEmpty(n)); +% PARTITION(apu)=n; +PARTITION(logical(PARTITION==notEmpty(n))) = n; +end +npops = length(notEmpty); + +%-------------------------------------------------------------------------- + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex) +% Tiedetään, että annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ssä ei vielä ole +% annettua logml arvoa, niin lisätään worstIndex:in kohtaan uusi logml ja +% nykyistä partitiota vastaava nclusters:in arvo. Muutoin ei tehdä mitään. +global PARTITION; +apu = isempty(find(abs(partitionSummary(:,2)-logml)<1e-5,1)); +if apu + % Nyt löydetty partitio ei ole vielä kirjattuna summaryyn. + npops = length(unique(PARTITION)); + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end + +%-------------------------------------------------------------------------- + +function [counts, sumcounts] = initialCounts(ind_counts) + +global PARTITION; + +pops = unique(PARTITION); +npops = max(pops); + +counts = zeros(size(ind_counts,1), size(ind_counts,2), npops,'uint16'); +sumcounts = zeros(npops, size(ind_counts,2),'uint16'); + +for i = 1:npops + inds = find(PARTITION == i); + counts(:,:,i) = sum(ind_counts(:,:,inds), 3); + sumcounts(i,:) = sum(counts(:,:,i),1); +end + +%-------------------------------------------------------------------------- + +function logml = computeLogml(adjprior_cq, adjprior_sp) + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; + +cq_counts = double(CQ_COUNTS); +cq_sumcounts = double(CQ_SUMCOUNTS); +sp_counts = double(SP_COUNTS); +sp_sumcounts = double(SP_SUMCOUNTS); + +npops = size(CQ_COUNTS, 3); + +cq_logml = sum(sum(sum(gammaln(cq_counts+repmat(adjprior_cq,[1 1 npops]))))) ... + - npops*sum(sum(gammaln(adjprior_cq))) - ... + sum(sum(gammaln(1+cq_sumcounts))); + +sp_logml = sum(sum(sum(gammaln(sp_counts+repmat(adjprior_sp,[1 1 npops]))))) ... + - npops*sum(sum(gammaln(adjprior_sp))) - ... + sum(sum(gammaln(1+sp_sumcounts))); + +logml = cq_logml - sp_logml; +clear cq_counts cq_sumcounts sp_counts sp_sumcounts; + +%-------------------------------------------------------------------------- + +function popLogml = computePopulationLogml(pops, adjprior_cq, adjprior_sp) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; + +cq_counts = double(CQ_COUNTS); +cq_sumcounts = double(CQ_SUMCOUNTS); +sp_counts = double(SP_COUNTS); +sp_sumcounts = double(SP_SUMCOUNTS); + +nall_cq = size(CQ_COUNTS,1); +nall_sp = size(SP_COUNTS,1); +ncliq = size(CQ_COUNTS,2); +nsep = size(SP_COUNTS, 2); + +z = length(pops); + +popLogml_cq = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior_cq,[1 1 z]) + cq_counts(:,:,pops)) ... + ,[nall_cq ncliq z]),1),2)) - sum(gammaln(1+cq_sumcounts(pops,:)),2) - ... + sum(sum(gammaln(adjprior_cq))); + +popLogml_sp = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior_sp,[1 1 z]) + sp_counts(:,:,pops)) ... + ,[nall_sp nsep z]),1),2)) - sum(gammaln(1+sp_sumcounts(pops,:)),2) - ... + sum(sum(gammaln(adjprior_sp))); + +popLogml = popLogml_cq - popLogml_sp; +clear cq_counts cq_sumcounts sp_counts sp_sumcounts; + +%------------------------------------------------------------------- + + + + +%-------------------------------------------------------------- +function newline = takeLine(description,width) +%Returns one line from the description: line ends to the first +%space after width:th mark. +% newLine = description(1:width); +n = width+1; +while ~isspace(description(n)) && n= 1 + suurinYks = suurinYks+1; + end + if suurinYks<10 + mjono(7) = num2str(suurinYks); + mjono(6) = 'e'; + mjono(5) = palautaYks(abs(logml),suurinYks-1); + mjono(4) = '.'; + mjono(3) = palautaYks(abs(logml),suurinYks); + if logml<0 + mjono(2) = '-'; + end + elseif suurinYks>=10 + mjono(6:7) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(logml),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(logml),suurinYks); + if logml<0 + mjono(1) = '-'; + end + end +end + +function digit = palautaYks(num,yks) +% palauttaa luvun num 10^yks termin kertoimen +% string:inä +% yks täytyy olla kokonaisluku, joka on +% vähintään -1:n suuruinen. Pienemmillä +% luvuilla tapahtuu jokin pyöristysvirhe. + +if yks>=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); + + +%-------------------------------------------------------------------------- + +function [emptyPop, pops] = findEmptyPop(npops) +% Palauttaa ensimmäisen tyhjän populaation indeksin. Jos tyhjiä +% populaatioita ei ole, palauttaa -1:n. + +global PARTITION; +pops = unique(PARTITION)'; +if (length(pops) ==npops) + emptyPop = -1; +else + popDiff = diff([0 pops npops+1]); + emptyPop = min(find(popDiff > 1)); +end + +%-------------------------------------------------------------------------- + + diff --git a/matlab/linkage/linkageMix_fixK.m b/matlab/linkage/linkageMix_fixK.m new file mode 100644 index 0000000..15cff7c --- /dev/null +++ b/matlab/linkage/linkageMix_fixK.m @@ -0,0 +1,1359 @@ +function [logml, npops, partitionSummary] = linkageMix_fixK(c,npops,nruns) +% Greedy search algorithm with fixed number of classes for linkage +% clustering. + +global POP_LOGML; global PARTITION; +global CQ_COUNTS; global SP_COUNTS; %These counts are for populations +global CQ_SUMCOUNTS; global SP_SUMCOUNTS; %not for individuals +global LOGDIFF; +clearGlobalVars; + +noalle = c.noalle; +adjprior = c.adjprior; %priorTerm = c.priorTerm; +rowsFromInd = c.rowsFromInd; +counts_cq = c.counts_cq; adjprior_cq = c.adjprior_cq; +counts_sp = c.counts_sp; adjprior_sp = c.adjprior_sp; + +if isfield(c,'dist') + dist = c.dist; Z = c.Z; +end + +clear c; + +ninds = size(counts_cq,3); + +if nargin < 2 + npopstext = []; + + if ninds>20 + default = 20; + else + default = floor(ninds/2); + end + + teksti = {'Number of populations:', ... + 'Number of runs:'}; + def = {num2str(default), '1'}; + + npopstextExtra = inputdlg(teksti ,... + 'Input parameters for the computation algorithm',1,def); + + if isempty(npopstextExtra) % cancel has been pressed + dispCancel + logml = 1; partitionSummary=1; npops=1; + return + end + npopstext = npopstextExtra{1}; + nrunstext = npopstextExtra{2}; + + clear teksti npopstextExtra; + if isempty(npopstext) + return + else + npopsTable = str2num(npopstext); + npops = npopsTable(1); + + if npops==1 + logml = 1; partitionSummary=1; npops=1; + return + end + nrunsTable = str2num(nrunstext); + nruns = nrunsTable(1); + end +end + +logmlBest = -1e50; +partitionSummary = -1e50*ones(100,2); % 100 best partitions (npops and logml) +partitionSummary(:,1) = zeros(100,1); +worstLogml = -1e50; worstIndex = 1; + +for run = 1:nruns + dispLine; + disp(['Run ' num2str(run) '/' num2str(nruns) ... + ', maximum number of populations ' num2str(npops) '.']); + + initialPartition = admixture_initialization(npops, Z); + PARTITION = initialPartition; + [cq_counts, cq_sumcounts] = initialCounts(counts_cq); + % clear counts_cq; + CQ_COUNTS = cq_counts; clear cq_counts; + CQ_SUMCOUNTS = cq_sumcounts; clear cq_sumcounts; + [sp_counts, sp_sumcounts] = initialCounts(counts_sp); + % clear counts_sp; + SP_COUNTS = sp_counts; clear sp_counts; + SP_SUMCOUNTS = sp_sumcounts; clear sp_sumcounts; + + logml = computeLogml(adjprior_cq, adjprior_sp); + POP_LOGML = computePopulationLogml(1:npops,adjprior_cq, adjprior_sp); + + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) + [worstLogml, worstIndex] = min(partitionSummary(:,2)); + end + end + + %%%% + % Finding the best partition with the greedy search algorithm + %%%% + nRoundTypes = 7; + tested = zeros(nRoundTypes,1); + roundTypes = [1 1]; + ready = 0; phase = 1; + ninds = length(PARTITION); % number of individuals + LOGDIFF = repmat(-Inf,ninds,npops); + + disp(' '); + disp(['Mixture analysis started with initial ' num2str(npops) ' populations.']); + + while ready ~= 1 + changesMade = 0; + + disp(['Performing steps: ' num2str(roundTypes)]); + + for n = 1:length(roundTypes) + round = roundTypes(n); + % pack; + if tested(round) == 1 + + elseif round==1 % Moving one individual to another population + inds = randperm(ninds); % random order + changesMadeNow = 0; + for ind = inds + i1 = PARTITION(ind); + if length(find(PARTITION==i1))>1 + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + changesInLogml = computeChanges(ind, adjprior_cq, ... + adjprior_sp, indCqCounts, indSpCounts); + + [maxChange, i2] = max(changesInLogml); + + + if (i1~=i2 && maxChange>1e-5) + % Individual is moved + changesMade = 1; + if changesMadeNow == 0 + disp('Action 1'); + changesMadeNow = 1; + tested = zeros(nRoundTypes,1); + end + updateGlobalVariables(ind, i2, indCqCounts, ... + indSpCounts, adjprior_cq, adjprior_sp); + logml = logml+maxChange; + + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) + [worstLogml, worstIndex] = min(partitionSummary(:,2)); + end + end + end + end + end + + if changesMadeNow == 0 + tested(round) = 1; + end + + elseif round==2 % Merging two populations and splitting the result + maxChange = -1e50; + poplogml = POP_LOGML; + partition = PARTITION; + cq_counts = CQ_COUNTS; + sp_counts = SP_COUNTS; + cq_sumcounts = CQ_SUMCOUNTS; + sp_sumcounts = SP_SUMCOUNTS; + logdiff = LOGDIFF; + + % Two populations are merged first + + for pop = 1:npops + changesInLogml = computeChanges2(pop, adjprior_cq, adjprior_sp); + changesInLogml(pop)=-1e50; + [biggest, index] = max(changesInLogml); + if biggest>maxChange + maxChange = biggest; + i1 = pop; + i2 = index; + end + end + + totalChange = maxChange; + updateGlobalVariables2(i1, i2, adjprior_cq, adjprior_sp); + + + % A population is split in two + + emptyPop = i1; + maxChange = -1e50; + + for pop = 1:npops + inds2 = find(PARTITION==pop); + ninds2 = length(inds2); + if ninds2>1 + % Computing the distance between individuals inds2 + dist2 = laskeOsaDist(inds2, dist, ninds); + Z2 = computeLinkage(dist2'); + npops2 = 2; + T2 = cluster_own(Z2, npops2); + + changesInLogml = computeChanges3(T2, inds2, pop, ... + counts_cq, counts_sp, adjprior_cq, adjprior_sp); + biggest = changesInLogml(1,emptyPop); + if biggest > maxChange + maxChange = biggest; + movingInds = inds2(logical(T2==1)); + end + end + end + + indCqCounts = uint16(sum(counts_cq(:,:,movingInds),3)); + indSpCounts = uint16(sum(counts_sp(:,:,movingInds),3)); + updateGlobalVariables3(movingInds, emptyPop,indCqCounts, ... + indSpCounts, adjprior_cq, adjprior_sp); + totalChange = totalChange + maxChange; + + % Individuals are moved between populations + inds = randperm(ninds); % random order + for ind = inds + i1 = PARTITION(ind); + if length(find(PARTITION==i1))>1 + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + changesInLogml = computeChanges(ind, adjprior_cq, ... + adjprior_sp, indCqCounts, indSpCounts); + + [maxChange, i2] = max(changesInLogml); + if (i1~=i2 && maxChange>1e-5) + updateGlobalVariables(ind, i2, indCqCounts, ... + indSpCounts, adjprior_cq, adjprior_sp); + totalChange = totalChange + maxChange; + end + end + end + + if totalChange > 1e-5 + disp('Action 2'); + logml = logml + totalChange; + tested = zeros(nRoundTypes,1); + + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) + [worstLogml, worstIndex] = min(partitionSummary(:,2)); + end + end + + else + PARTITION = partition; + POP_LOGML = poplogml; + CQ_COUNTS = cq_counts; + SP_COUNTS = sp_counts; + CQ_SUMCOUNTS = cq_sumcounts; + SP_SUMCOUNTS = sp_sumcounts; + LOGDIFF = logdiff; + tested(round) = 1; + end + + + elseif round==3 || round==4 % Splitting population into smaller groups + maxChange = 0; + + for pop = 1:npops + inds2 = find(PARTITION==pop); + ninds2 = length(inds2); + if ninds2>5 + % Computing the distance between individuals inds2 + dist2 = laskeOsaDist(inds2, dist, ninds); + Z2 = computeLinkage(dist2'); + + + % Number of groups: + if round==3 + npops2 = max(min(20, floor(ninds2/2)),2); + elseif round==4 + npops2 = 2; + end + T2 = cluster_own(Z2, npops2); + + changesInLogml = computeChanges3(T2, inds2, pop, ... + counts_cq, counts_sp, adjprior_cq, adjprior_sp); + [biggest, index] = max(changesInLogml(1:end)); + if biggest > maxChange + maxChange = biggest; + movingGroup = rem(index,npops2); % The group, which is moved + if movingGroup==0, movingGroup = npops2; end + movingInds = inds2(logical(T2==movingGroup)); + i2 = ceil(index/npops2); % pop where movingGroup would be moved + end + end + end + if maxChange>1e-5 + changesMade = 1; + tested = zeros(nRoundTypes,1); + if round==3 + disp('Action 3'); + else + disp('Action 4'); + end + indCqCounts = uint16(sum(counts_cq(:,:,movingInds),3)); + indSpCounts = uint16(sum(counts_sp(:,:,movingInds),3)); + updateGlobalVariables3(movingInds, i2,indCqCounts, ... + indSpCounts, adjprior_cq, adjprior_sp); + logml = logml + maxChange; + + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) + [worstLogml, worstIndex] = min(partitionSummary(:,2)); + end + end + else + tested(round) = 1; + end + + elseif round == 5 || round == 6 + %Moving individuals out of population until positive change + %in logml has occured + pop=0; + changesMadeNow = 0; + %Saving old values + poplogml = POP_LOGML; + partition = PARTITION; + cq_counts = CQ_COUNTS; + sp_counts = SP_COUNTS; + cq_sumcounts = CQ_SUMCOUNTS; + sp_sumcounts = SP_SUMCOUNTS; + logdiff = LOGDIFF; + + while (pop < npops && changesMadeNow == 0) + pop = pop+1; + totalChangeInLogml = 0; + inds = find(PARTITION==pop); + if round == 5 + %Random order + aputaulu = [inds rand(length(inds),1)]; + aputaulu = sortrows(aputaulu,2); + inds = aputaulu(:,1)'; + elseif round == 6 + inds = returnInOrder(inds, pop, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp); + end + + i=0; + + while (length(inds)>0 && i 1e-5 + i=length(inds); + end + end + end + + if totalChangeInLogml>1e-5 + if round == 5 + disp('Action 5'); + elseif round == 6 + disp('Action 6'); + end + tested = zeros(nRoundTypes,1); + changesMadeNow=1; + changesMade = 1; + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) + [worstLogml, worstIndex] = min(partitionSummary(:,2)); + end + end + else + % No better partition was found, restoring the old + % values + PARTITION = partition; + POP_LOGML = poplogml; + CQ_COUNTS = cq_counts; + SP_COUNTS = sp_counts; + CQ_SUMCOUNTS = cq_sumcounts; + SP_SUMCOUNTS = sp_sumcounts; + LOGDIFF = logdiff; + logml = logml - totalChangeInLogml; + end + end + clear partition; clear poplogml; + if changesMadeNow == 0 + tested(round) = 1; + end + + elseif round == 7 + emptyPop = npops + 1; + j = 0; + pops = randperm(npops); + changesMadeNow = 0; + while (j < npops) + j = j +1; + pop = pops(j); + inds2 = find(PARTITION == pop); + ninds2 = length(inds2); + if ninds2 > 5 + partition = PARTITION; + cq_sumcounts = CQ_SUMCOUNTS; + cq_counts = CQ_COUNTS; + sp_sumcounts = SP_SUMCOUNTS; + sp_counts = SP_COUNTS; + poplogml = POP_LOGML; + logdiff = LOGDIFF; + + % A new population is created temporarily + npops = npops + 1; + POP_LOGML(npops) = 0; + CQ_COUNTS(:,:,npops) = zeros(size(CQ_COUNTS(:,:,1))); + CQ_SUMCOUNTS(npops,:) = zeros(size(CQ_SUMCOUNTS(1,:))); + SP_COUNTS(:,:,npops) = zeros(size(SP_COUNTS(:,:,1))); + SP_SUMCOUNTS(npops,:) = zeros(size(SP_SUMCOUNTS(1,:))); + + dist2 = laskeOsaDist(inds2, dist, ninds); + Z2 = computeLinkage(dist2'); + T2 = cluster_own(Z2, 2); + % movingInds = inds2(find(T2 == 1)); + movingInds = inds2(logical(T2 == 1)); + changesInLogml = computeChanges3(T2, inds2, pop, ... + counts_cq, counts_sp, adjprior_cq, adjprior_sp); + totalChangeInLogml = changesInLogml(1, emptyPop); + + indCqCounts = uint16(sum(counts_cq(:,:,movingInds),3)); + indSpCounts = uint16(sum(counts_sp(:,:,movingInds),3)); + updateGlobalVariables3(movingInds, emptyPop,indCqCounts, ... + indSpCounts, adjprior_cq, adjprior_sp); + + % Individuals are moved between populations + inds = randperm(ninds); % random order + for ind = inds + i1 = PARTITION(ind); + if length(find(PARTITION==i1))>1 + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + changesInLogml = computeChanges(ind, adjprior_cq, ... + adjprior_sp, indCqCounts, indSpCounts); + + [maxChange, i2] = max(changesInLogml); + if (i1~=i2 && maxChange>1e-5) + updateGlobalVariables(ind, i2, indCqCounts, ... + indSpCounts, adjprior_cq, adjprior_sp); + totalChangeInLogml = totalChangeInLogml + maxChange; + end + end + end + + % Two populations are merged + if length(find(any(CQ_SUMCOUNTS,2))) == npops + maxChange = -1e50; + for pop = 1:npops + changesInLogml = computeChanges2(pop, adjprior_cq, adjprior_sp); + changesInLogml(pop)=-1e50; + [biggest, index] = max(changesInLogml); + if biggest>maxChange + maxChange = biggest; + i1 = pop; + i2 = index; + end + end + + totalChangeInLogml = totalChangeInLogml + maxChange; + updateGlobalVariables2(i1, i2, adjprior_cq, adjprior_sp); + + end + + if totalChangeInLogml > 1e-5 + changesMade = 1; + + logml = logml + totalChangeInLogml; + + npops = removeEmptyPops; % The temporary population is removed + POP_LOGML = computePopulationLogml(1:npops, adjprior_cq, adjprior_sp); + + if logml>worstLogml + [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex); + if (added==1) + [worstLogml, worstIndex] = min(partitionSummary(:,2)); + end + end + if changesMadeNow == 0 + disp('Action 7'); + changesMadeNow = 1; + end + changesMadeNow = 1; + tested = zeros(nRoundTypes, 1); + j = npops; + else + % No better partition was found, restoring the old + % values + PARTITION = partition; + POP_LOGML = poplogml; + CQ_COUNTS = cq_counts; + SP_COUNTS = sp_counts; + CQ_SUMCOUNTS = cq_sumcounts; + SP_SUMCOUNTS = sp_sumcounts; + LOGDIFF = logdiff; + npops = npops-1; + end + end + end + if changesMadeNow == 0 + tested(round) = 1; + end + end + + end + + + if changesMade == 0 + if phase==1 + phase = 2; + elseif phase==2 + phase = 3; + elseif phase==3 + phase = 4; + elseif phase==4; + phase = 5; + elseif phase==5 + ready = 1; + end + else + changesMade = 0; + end + + if ready==0 + if phase==1 + roundTypes=[1]; + elseif phase==2 + roundTypes=[2]; + elseif phase==3 + roundTypes=[5 5 7]; + elseif phase==4 + roundTypes=[4 3 1 1]; + elseif phase==5 + roundTypes=[6 2 7 3 4 1]; + end + end + + end + % Saving results + + npops = removeEmptyPops; + POP_LOGML = computePopulationLogml(1:npops, adjprior_cq, adjprior_sp); + + disp(['Found partition with ' num2str(npops) ' populations.']); + disp(['Log(ml) = ' num2str(logml)]); + disp(' '); + + if logml>logmlBest + % Updating the best found partition + logmlBest = logml; + npopsBest = npops; + partitionBest = PARTITION; + cq_countsBest = CQ_COUNTS; + sp_countsBest = SP_COUNTS; + cq_sumcountsBest = CQ_SUMCOUNTS; + sp_sumcountsBest = SP_SUMCOUNTS; + pop_logmlBest = POP_LOGML; + logdiffbest = LOGDIFF; + end +end + +logml = logmlBest; +npops = npopsBest; +PARTITION = partitionBest; +CQ_COUNTS = cq_countsBest; +SP_COUNTS = sp_countsBest; +CQ_SUMCOUNTS = cq_sumcountsBest; +SP_SUMCOUNTS = sp_sumcountsBest; +POP_LOGML = pop_logmlBest; +LOGDIFF = logdiffbest; + +%-------------------------------------------------------------------------- +% The next three functions are for computing the initial partition +% according to the distance between the individuals + +function initial_partition=admixture_initialization(nclusters,Z) +T=cluster_own(Z,nclusters); +initial_partition=T; + +%-------------------------------------------------------------------------- +function T = cluster_own(Z,nclust) +% true=logical(1); +% false=logical(0); + +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); +% maximum number of clusters based on inconsistency +if m <= maxclust + T = (1:m)'; +elseif maxclust==1 + T = ones(m,1); +else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end +end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + +%-------------------------------------------------------------------------- + +function dist2 = laskeOsaDist(inds2, dist, ninds) +% Muodostaa dist vektorista osavektorin, joka sisältää yksilöiden inds2 +% väliset etäisyydet. ninds=kaikkien yksilöiden lukumäärä. + +ninds2 = length(inds2); +apu = zeros(nchoosek(ninds2,2),2); +rivi = 1; +for i=1:ninds2-1 + for j=i+1:ninds2 + apu(rivi, 1) = inds2(i); + apu(rivi, 2) = inds2(j); + rivi = rivi+1; + end +end +apu = (apu(:,1)-1).*ninds - apu(:,1) ./ 2 .* (apu(:,1)-1) + (apu(:,2)-apu(:,1)); +dist2 = dist(apu); + +%-------------------------------------------------------------------------- + +function Z = computeLinkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 || m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +% monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end + +%-------------------------------------------------------------------------- + +function changes = computeChanges(ind, adjprior_cq, adjprior_sp, ... + indCqCounts, indSpCounts) +% Computes changes in log-marginal likelihood if individual ind is +% moved to another population +% +% Input: +% ind - the individual to be moved +% adjprior_cq & _sp - adjpriors for cliques and separators +% indCqCounts, indSpCounts - counts for individual ind +% +% Output: +% changes - table of size 1*npops. changes(i) = difference in logml if +% ind is move to population i. + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; +global LOGDIFF; + +npops = size(CQ_COUNTS,3); +changes = LOGDIFF(ind,:); + +i1 = PARTITION(ind); +i1_logml = POP_LOGML(i1); +changes(i1) = 0; + +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + +new_i1_logml = computePopulationLogml(i1, adjprior_cq, adjprior_sp); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)+indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)+sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)+indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)+sumSp; + +i2 = find(changes==-Inf); +i2 = setdiff(i2,i1); +i2_logml = POP_LOGML(i2); + +ni2 = length(i2); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+repmat(indCqCounts, [1 1 ni2]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+repmat(sumCq,[ni2 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+repmat(indSpCounts, [1 1 ni2]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:) + repmat(sumSp,[ni2 1]); + +new_i2_logml = computePopulationLogml(i2, adjprior_cq, adjprior_sp); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)-repmat(indCqCounts, [1 1 ni2]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)-repmat(sumCq,[ni2 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)-repmat(indSpCounts, [1 1 ni2]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:) - repmat(sumSp,[ni2 1]); +% a = repmat(sumSp,[npops-1 1]); + +changes(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; +LOGDIFF(ind,:) = changes; + +%------------------------------------------------------------------------------------ + +function changes = computeChanges2(i1, adjprior_cq, adjprior_sp) +% Computes changes in log marginal likelihood if population i1 is combined +% with another population +% +% Input: +% i1 - the population to be combined +% adjprior_cq & _sp - adjpriors for cliques and separators +% +% Output: +% changes - table of size 1*npops. changes(i) = difference in logml if +% i1 is combined with population i. + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global POP_LOGML; +npops = size(CQ_COUNTS,3); +changes = zeros(npops,1); + +i1_logml = POP_LOGML(i1); +indCqCounts = CQ_COUNTS(:,:,i1); +indSpCounts = SP_COUNTS(:,:,i1); +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +new_i1_logml = 0; + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = POP_LOGML(i2); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+repmat(indCqCounts, [1 1 npops-1]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+repmat(sumCq,[npops-1 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+repmat(indSpCounts, [1 1 npops-1]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+ repmat(sumSp,[npops-1 1]); +% a = repmat(sumSp,[npops-1 1]); +% if ~any(sumSp) +% a(:,[1:size(a,2)])=[]; +% end +% SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+ a ; + + +new_i2_logml = computePopulationLogml(i2, adjprior_cq, adjprior_sp); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)-repmat(indCqCounts, [1 1 npops-1]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)-repmat(sumCq,[npops-1 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)-repmat(indSpCounts, [1 1 npops-1]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)- repmat(sumSp,[npops-1 1]); + +changes(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + + + + +%------------------------------------------------------------------------------------ + + +function changes = computeChanges3(T2, inds2, i1, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp) +% Computes changes in log marginal likelihood if subpopulation of i2 is +% moved to another population +% +% Input: +% T2 - partition of inds2 to subpopulations +% inds2 - individuals in population i1 +% i2 +% counts_cq, counts_sp - counts for individuals +% +% Output: +% changes - table of size length(unique(T2))*npops. +% changes(i,j) = difference in logml if subpopulation inds2(find(T2==i)) of +% i2 is moved to population j + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global POP_LOGML; +npops = size(CQ_COUNTS,3); +npops2 = length(unique(T2)); +changes = zeros(npops2,npops); + +%cq_counts = CQ_COUNTS; +%sp_counts = SP_COUNTS; +%cq_sumcounts = CQ_SUMCOUNTS; +%sp_sumcounts = SP_SUMCOUNTS; + + +i1_logml = POP_LOGML(i1); + +for pop2 = 1:npops2 + % inds = inds2(find(T2==pop2)); + inds = inds2(logical(T2==pop2)); + ninds = length(inds); + if ninds>0 + indCqCounts = uint16(sum(counts_cq(:,:,inds),3)); + indSpCounts = uint16(sum(counts_sp(:,:,inds),3)); + sumCq = uint16(sum(indCqCounts,1)); + sumSp = uint16(sum(indSpCounts,1)); + + CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; + CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; + SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; + SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + + new_i1_logml = computePopulationLogml(i1, adjprior_cq, adjprior_sp); + + CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)+indCqCounts; + CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)+sumCq; + SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)+indSpCounts; + SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)+sumSp; + + i2 = [1:i1-1 , i1+1:npops]; + i2_logml = POP_LOGML(i2)'; + + CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+repmat(indCqCounts, [1 1 npops-1]); + CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+repmat(sumCq,[npops-1 1]); + SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+repmat(indSpCounts, [1 1 npops-1]); + SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+ repmat(sumSp,[npops-1 1]); + + new_i2_logml = computePopulationLogml(i2, adjprior_cq, adjprior_sp)'; + + CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)-repmat(indCqCounts, [1 1 npops-1]); + CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)-repmat(sumCq,[npops-1 1]); + SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)-repmat(indSpCounts, [1 1 npops-1]); + SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)- repmat(sumSp,[npops-1 1]); + + changes(pop2,i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + end +end + +%-------------------------------------------------------------------------- + +function changes = computeChanges5(inds, i1, i2, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp) +% Computes change in logml if individual of inds is moved between +% populations i1 and i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global POP_LOGML; global PARTITION; + +ninds = length(inds); +changes = zeros(ninds,1); + +i1_logml = POP_LOGML(i1); +i2_logml = POP_LOGML(i2); + +for i = 1:ninds + ind = inds(i); + if PARTITION(ind)==i1 + pop1 = i1; %from + pop2 = i2; %to + else + pop1 = i2; + pop2 = i1; + end + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + sumCq = uint16(sum(indCqCounts,1)); + sumSp = uint16(sum(indSpCounts,1)); + + CQ_COUNTS(:,:,pop1) = CQ_COUNTS(:,:,pop1)-indCqCounts; + CQ_SUMCOUNTS(pop1,:) = CQ_SUMCOUNTS(pop1,:)-sumCq; + SP_COUNTS(:,:,pop1) = SP_COUNTS(:,:,pop1)-indSpCounts; + SP_SUMCOUNTS(pop1,:) = SP_SUMCOUNTS(pop1,:) - sumSp; + + CQ_COUNTS(:,:,pop2) = CQ_COUNTS(:,:,pop2)+indCqCounts; + CQ_SUMCOUNTS(pop2,:) = CQ_SUMCOUNTS(pop2,:)+sumCq; + SP_COUNTS(:,:,pop2) = SP_COUNTS(:,:,pop2)+indSpCounts; + SP_SUMCOUNTS(pop2,:) = SP_SUMCOUNTS(pop2,:) + sumSp; + + new_logmls = computePopulationLogml([i1 i2], adjprior_cq, adjprior_sp); + changes(i) = sum(new_logmls); + + CQ_COUNTS(:,:,pop1) = CQ_COUNTS(:,:,pop1)+indCqCounts; + CQ_SUMCOUNTS(pop1,:) = CQ_SUMCOUNTS(pop1,:)+sumCq; + SP_COUNTS(:,:,pop1) = SP_COUNTS(:,:,pop1)+indSpCounts; + SP_SUMCOUNTS(pop1,:) = SP_SUMCOUNTS(pop1,:)+sumSp; + CQ_COUNTS(:,:,pop2) = CQ_COUNTS(:,:,pop2)-indCqCounts; + CQ_SUMCOUNTS(pop2,:) = CQ_SUMCOUNTS(pop2,:)-sumCq; + SP_COUNTS(:,:,pop2) = SP_COUNTS(:,:,pop2)-indSpCounts; + SP_SUMCOUNTS(pop2,:) = SP_SUMCOUNTS(pop2,:)-sumSp; +end + +changes = changes - i1_logml - i2_logml; + + +%------------------------------------------------------------------------------------- + + +function updateGlobalVariables(ind, i2, indCqCounts, indSpCounts, ... + adjprior_cq, adjprior_sp) +% Updates global variables when individual ind is moved to population i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; +global LOGDIFF; + +i1 = PARTITION(ind); +PARTITION(ind)=i2; + +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+indCqCounts; +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+sumCq; +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+indSpCounts; +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+sumSp; + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior_cq, adjprior_sp); + +LOGDIFF(:,[i1 i2]) = -Inf; +inx = [find(PARTITION==i1); find(PARTITION==i2)]; +LOGDIFF(inx,:) = -Inf; + + +%--------------------------------------------------------------------------------- + + +function updateGlobalVariables2(i1, i2, adjprior_cq, adjprior_sp) +% Updates global variables when all individuals from population i1 are moved +% to population i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; +global LOGDIFF; + +% inds = find(PARTITION==i1); +% PARTITION(inds) = i2; +PARTITION(logical(PARTITION==i1)) = i2; + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+CQ_COUNTS(:,:,i1); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+CQ_SUMCOUNTS(i1,:); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+SP_COUNTS(:,:,i1); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+SP_SUMCOUNTS(i1,:); + +CQ_COUNTS(:,:,i1) = 0; +CQ_SUMCOUNTS(i1,:) = 0; +SP_COUNTS(:,:,i1) = 0; +SP_SUMCOUNTS(i1,:) = 0; + +POP_LOGML(i1) = 0; +POP_LOGML(i2) = computePopulationLogml(i2, adjprior_cq, adjprior_sp); + +LOGDIFF(:,[i1 i2]) = -Inf; +inx = [find(PARTITION==i1); find(PARTITION==i2)]; +LOGDIFF(inx,:) = -Inf; + +%------------------------------------------------------------------------------------ + + +function updateGlobalVariables3(muuttuvat, i2, indCqCounts, indSpCounts, ... + adjprior_cq, adjprior_sp) +% Updates global variables when individuals muuttuvat are moved to +% population i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; +global LOGDIFF; + +i1 = PARTITION(muuttuvat(1)); +PARTITION(muuttuvat) = i2; + +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+indCqCounts; +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+sumCq; +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+indSpCounts; +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+sumSp; + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior_cq, adjprior_sp); + +LOGDIFF(:,[i1 i2]) = -Inf; +inx = [find(PARTITION==i1); find(PARTITION==i2)]; +LOGDIFF(inx,:) = -Inf; + +%---------------------------------------------------------------------- + + +function inds = returnInOrder(inds, pop, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp) +% Returns individuals inds in order according to the change in the logml if +% they are moved out of the population pop + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; + +ninds = length(inds); +apuTaulu = [inds, zeros(ninds,1)]; + +for i=1:ninds + ind = inds(i); + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + sumCq = uint16(sum(indCqCounts,1)); + sumSp = uint16(sum(indSpCounts,1)); + + CQ_COUNTS(:,:,pop) = CQ_COUNTS(:,:,pop)-indCqCounts; + CQ_SUMCOUNTS(pop,:) = CQ_SUMCOUNTS(pop,:)-sumCq; + SP_COUNTS(:,:,pop) = SP_COUNTS(:,:,pop)-indSpCounts; + SP_SUMCOUNTS(pop,:) = SP_SUMCOUNTS(pop,:)-sumSp; + + apuTaulu(i, 2) = computePopulationLogml(pop, adjprior_cq, adjprior_sp); + + CQ_COUNTS(:,:,pop) = CQ_COUNTS(:,:,pop)+indCqCounts; + CQ_SUMCOUNTS(pop,:) = CQ_SUMCOUNTS(pop,:)+sumCq; + SP_COUNTS(:,:,pop) = SP_COUNTS(:,:,pop)+indSpCounts; + SP_SUMCOUNTS(pop,:) = SP_SUMCOUNTS(pop,:)+sumSp; +end +apuTaulu = sortrows(apuTaulu,2); +inds = apuTaulu(ninds:-1:1,1); + + +%-------------------------------------------------------------------------- + +function clearGlobalVars + +global CQ_COUNTS; CQ_COUNTS = []; +global CQ_SUMCOUNTS; CQ_SUMCOUNTS = []; +global SP_COUNTS; SP_COUNTS = []; +global SP_SUMCOUNTS; SP_SUMCOUNTS = []; +global PARTITION; PARTITION = []; +global POP_LOGML; POP_LOGML = []; +global LOGDIFF; LOGDIFF = []; + +%-------------------------------------------------------------------------- + +function npops = removeEmptyPops +% Removes empty pops from all global COUNTS variables. +% Updates PARTITION and npops + +global CQ_COUNTS; +global CQ_SUMCOUNTS; +global SP_COUNTS; +global SP_SUMCOUNTS; +global PARTITION; +global LOGDIFF; + +notEmpty = find(any(CQ_SUMCOUNTS,2)); +CQ_COUNTS = CQ_COUNTS(:,:,notEmpty); +CQ_SUMCOUNTS = CQ_SUMCOUNTS(notEmpty,:); +SP_COUNTS = SP_COUNTS(:,:,notEmpty); +SP_SUMCOUNTS = SP_SUMCOUNTS(notEmpty,:); +LOGDIFF = LOGDIFF(:,notEmpty); + +for n=1:length(notEmpty) +% apu = find(PARTITION==notEmpty(n)); +% PARTITION(apu)=n; +PARTITION(logical(PARTITION==notEmpty(n))) = n; +end +npops = length(notEmpty); + +%-------------------------------------------------------------------------- + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex) +% Tiedetään, että annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ssä ei vielä ole +% annettua logml arvoa, niin lisätään worstIndex:in kohtaan uusi logml ja +% nykyistä partitiota vastaava nclusters:in arvo. Muutoin ei tehdä mitään. +global PARTITION; +apu = isempty(find(abs(partitionSummary(:,2)-logml)<1e-5,1)); +if apu + % Nyt löydetty partitio ei ole vielä kirjattuna summaryyn. + npops = length(unique(PARTITION)); + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end + +%-------------------------------------------------------------------------- + +function [counts, sumcounts] = initialCounts(ind_counts) + +global PARTITION; + +pops = unique(PARTITION); +npops = max(pops); + +counts = zeros(size(ind_counts,1), size(ind_counts,2), npops,'uint16'); +sumcounts = zeros(npops, size(ind_counts,2),'uint16'); + +for i = 1:npops + inds = find(PARTITION == i); + counts(:,:,i) = sum(ind_counts(:,:,inds), 3); + sumcounts(i,:) = sum(counts(:,:,i),1); +end + +%-------------------------------------------------------------------------- + +function logml = computeLogml(adjprior_cq, adjprior_sp) + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; + +cq_counts = double(CQ_COUNTS); +cq_sumcounts = double(CQ_SUMCOUNTS); +sp_counts = double(SP_COUNTS); +sp_sumcounts = double(SP_SUMCOUNTS); + +npops = size(CQ_COUNTS, 3); + +cq_logml = sum(sum(sum(gammaln(cq_counts+repmat(adjprior_cq,[1 1 npops]))))) ... + - npops*sum(sum(gammaln(adjprior_cq))) - ... + sum(sum(gammaln(1+cq_sumcounts))); + +sp_logml = sum(sum(sum(gammaln(sp_counts+repmat(adjprior_sp,[1 1 npops]))))) ... + - npops*sum(sum(gammaln(adjprior_sp))) - ... + sum(sum(gammaln(1+sp_sumcounts))); + +logml = cq_logml - sp_logml; +clear cq_counts cq_sumcounts sp_counts sp_sumcounts; + +%-------------------------------------------------------------------------- + +function popLogml = computePopulationLogml(pops, adjprior_cq, adjprior_sp) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; + +cq_counts = double(CQ_COUNTS); +cq_sumcounts = double(CQ_SUMCOUNTS); +sp_counts = double(SP_COUNTS); +sp_sumcounts = double(SP_SUMCOUNTS); + +nall_cq = size(CQ_COUNTS,1); +nall_sp = size(SP_COUNTS, 1); +ncliq = size(CQ_COUNTS,2); +nsep = size(SP_COUNTS, 2); + +z = length(pops); + +popLogml_cq = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior_cq,[1 1 z]) + cq_counts(:,:,pops)) ... + ,[nall_cq ncliq z]),1),2)) - sum(gammaln(1+cq_sumcounts(pops,:)),2) - ... + sum(sum(gammaln(adjprior_cq))); + +popLogml_sp = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior_sp,[1 1 z]) + sp_counts(:,:,pops)) ... + ,[nall_sp nsep z]),1),2)) - sum(gammaln(1+sp_sumcounts(pops,:)),2) - ... + sum(sum(gammaln(adjprior_sp))); + +popLogml = popLogml_cq - popLogml_sp; +clear cq_counts cq_sumcounts sp_counts sp_sumcounts; + +%------------------------------------------------------------------- + + + + +%-------------------------------------------------------------- +function newline = takeLine(description,width) +%Returns one line from the description: line ends to the first +%space after width:th mark. +% newLine = description(1:width); +n = width+1; +while ~isspace(description(n)) && n= 1 + suurinYks = suurinYks+1; + end + if suurinYks<10 + mjono(7) = num2str(suurinYks); + mjono(6) = 'e'; + mjono(5) = palautaYks(abs(logml),suurinYks-1); + mjono(4) = '.'; + mjono(3) = palautaYks(abs(logml),suurinYks); + if logml<0 + mjono(2) = '-'; + end + elseif suurinYks>=10 + mjono(6:7) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(logml),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(logml),suurinYks); + if logml<0 + mjono(1) = '-'; + end + end +end + +function digit = palautaYks(num,yks) +% palauttaa luvun num 10^yks termin kertoimen +% string:inä +% yks täytyy olla kokonaisluku, joka on +% vähintään -1:n suuruinen. Pienemmillä +% luvuilla tapahtuu jokin pyöristysvirhe. + +if yks>=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); + + +%-------------------------------------------------------------------------- + +function [emptyPop, pops] = findEmptyPop(npops) +% Palauttaa ensimmäisen tyhjän populaation indeksin. Jos tyhjiä +% populaatioita ei ole, palauttaa -1:n. + +global PARTITION; +pops = unique(PARTITION)'; +if (length(pops) ==npops) + emptyPop = -1; +else + popDiff = diff([0 pops npops+1]); + emptyPop = min(find(popDiff > 1)); +end + +%-------------------------------------------------------------------------- + + diff --git a/matlab/linkage/linkageMixture_speed.m b/matlab/linkage/linkageMixture_speed.m new file mode 100644 index 0000000..e393726 --- /dev/null +++ b/matlab/linkage/linkageMixture_speed.m @@ -0,0 +1,2130 @@ +function linkageMixture_speed +base = findobj('Tag','base_figure'); + + +% check whether fixed k mode is selected +h0 = findobj('Tag','fixk_menu'); +fixedK = get(h0, 'userdata'); + +if fixedK + if ~(fixKWarning == 1) % call function fixKWarning + return + end +end + +% check whether partition compare mode is selected +h1 = findobj('Tag','partitioncompare_menu'); +partitionCompare = get(h1, 'userdata'); + +% Data handling +input_type = questdlg('Specify the format of your data: ',... + 'Specify Data Format', ... + 'MLST-format', 'BAPS-format','Pre-processed data', 'MLST-format'); + +switch input_type + + case 'MLST-format' + + %waitALittle; + mlst_type = questdlg('Choose data type: ',... + 'Specify MLST format', ... + 'Separate allelic profiles(TXT)', ... + 'Concatenate allelic sequences(EXCEL)','Separate allelic profiles(TXT)'); + switch mlst_type + case 'Concatenate allelic sequences(EXCEL)' + %waitALittle + setWindowOnTop(base,'false') + [filename,pathname] = uigetfile('*.xls','Load new concatenate sequence profile(*.xls)'); + if (sum(filename)==0) || (sum(pathname)==0) + return; + end + display('---------------------------------------------------'); + display(['Reading sequence profile from: ',[pathname filename],'...']); + + h0 = findobj('Tag','filename1_text'); + set(h0,'String',filename); + [data, component_mat, popnames] = processxls([pathname filename]); + + if isempty(data) + display('*** ERROR: Failed in loading the data'); + return; + end + + case 'Separate allelic profiles(TXT)' + % Ask the allelic profile + setWindowOnTop(base,'false') + [filename,pathname] = uigetfile('*.pl;*.txt','Load new allelic profile(*.pl, *.txt)'); + if (sum(filename)==0) || (sum(pathname)==0) + return; + end + display('---------------------------------------------------'); + display(['Reading allelic profile from: ',[pathname filename],'...']); + + h0 = findobj('Tag','filename1_text'); + set(h0,'String',filename); + + % Preprocess the profile + output = processprofile([pathname filename]); + headercount = size(output,2); + flag = zeros(1,headercount); + for i = 1:headercount + if strcmpi('ST',output{1,i}) + flag(i) = 1; + else + if strcmpi('Isolate', output{1,i}) || strcmpi('Strain',output{1,i}) + flag(i) = 2; + else + if strcmpi('Species', output{1,i}) + flag(i) = 3; + end + end + end + end + + if ~any(flag) + h = errordlg(['Loading of the specified file was unsuccessful. ' ... + 'Please see the tutorial to find out the correct file ' ... + 'format.'] ,'Error','modal'); + handle = [h,base]; + setWindowOnTop(handle,{'true','true'}) + + uiwait(h); + fprintf(1,'\n*** ERROR: Failed in loading the allelic profile.\n'); + return; + end + + index = (1:size(output,1)-1)'; + + + % Species selection + species_loc = find(flag==3); + if ~isempty(species_loc) + species_col = output((2:end), species_loc); + [species_str, m, n] = unique(species_col); + + [s1,v1] = listdlg('PromptString','Select species:',... + 'SelectionMode','multiple',... + 'Name','Select Species',... + 'ListString',species_str'); + removed = setdiff(n,s1); + + if ~v1 || isempty(s1) + dispCancel;return + end + else + n = index; + removed = 0; + end + + % Isolate/Strain selection + isolate_loc = find(flag==2); + if ~isempty(isolate_loc) + isolate_col = output((2:end), isolate_loc); + isolate_str = isolate_col(logical(~ismember(n,removed))); + index(ismember(n,removed)) = []; + + [s2,v2] = choosebox('Name','Select Isolates','PromptString',... + 'Isolates in the sample(Ctrl+A to select all):','SelectString', ... + 'Isolates you have selected','ListString', isolate_str', ... + 'InitialValue',1); + if isempty(s2) || ~v2 + dispCancel; return + end + else + % ST selection + isolate_loc = find(flag==1); + isolate_col = output((2:end), isolate_loc); + % isolate_str = isolate_col(find(~ismember(n,removed))); + isolate_str = isolate_col(logical(~ismember(n,removed))); + index(ismember(n,removed)) = []; + + [s2,v2] = choosebox('Name','Select STs','PromptString',... + 'STs in the sample (Ctrl+A to select all):','SelectString', ... + 'STs you have selected','ListString', isolate_str'); + if isempty(s2) || ~v2 + dispCancel; return + end + end + + % Read the data + isolate_index = index(s2); + if ~isempty(isolate_loc) + popnames(:,1) = isolate_str(s2); + else + popnames(:,1) = num2cell(index(s2)); + end + popnames(:,2) = num2cell([1:length(isolate_index)]'); + + % remove empty elements + strmat = output([isolate_index+1]',find(flag==0)); + [i,j] = find(cellfun('isempty',strmat)); + ij = [i j]; + for k=1:length(i) + strmat(ij(k,1),ij(k,2))={''}; + end + [i,j] = find(strcmp(strmat,'')); + ij = [i j]; + for k=1:length(i) + strmat(ij(k,1),ij(k,2))={'0'}; + end + + data_allele = zeros(size(strmat)); + for i = 1:size(strmat,1) + for j = 1:size(strmat,2) + data_allele(i,j) = str2num(char(strmat(i,j))); + end + end + + % remove columns containing empty values + realgene = find(all(data_allele)); + data_allele = data_allele(:,realgene); + partition_index = 1:size(strmat,1); + data_allele = [data_allele partition_index']; + genename = output(1,find(flag==0)); + genename = genename(realgene); + + + if isempty(genename) || isempty(data_allele) + msgbox(['Loading of the specified file was unsuccessful. ' ... + 'Please see the tutorial to find out the correct file ' ... + 'format.'] ,'Error', ... + 'error'); + fprintf(1,'\n*** ERROR: Failed in loading the allelic profile.\n'); + return; + else + display('---------------------------------------------------'); + display(['# of strains: ', num2str(size(data_allele,1))]); + display(['# of genes: ', num2str(size(data_allele,2)-1)]); + end + + %waitALittle; + % Ask the individual gene sequence + % [isOK,returnvalue] = askSeq; + % if (isOK & returnvalue~= 1) % allelic profile loaded only + % data = data_allele; + % component_mat = [1:size(data,2)-1]'; % assume independence + % elseif + % isOK & returnvalue == 1, + [s3,v3] = listdlg('PromptString','Select genes:',... + 'SelectionMode','multiple',... + 'Name','Select Genes',... + 'ListString',genename); + + if isempty(s3) || ~v3 + dispCancel; + return + else + m = size(s3,2); % number of genes + % data_seq = cell(m,1); + data = []; + genesize = zeros(1,m); + for i=1:m + %waitALittle; + data_gene = readfasta(genename{s3(i)}); % read fasta + if (isempty(data_gene)) + display('*** ERROR: Failed in loading the sequence data'); + return; + else + data_gene(:,find(sum(data_gene)==0)) = []; % NB! remove all the gaps + % data_seq{i} = data_gene; + selected_data = data_gene(data_allele(:,s3(i))',:); % Store only those selected strains + + emptyloci = find(all(selected_data(:,[1:end])<0)); + if ~isempty(emptyloci) + disp('Removing empty loci...'); + end + selected_data(:,emptyloci) = []; % remove empty loci + data = [data selected_data]; + end + genesize(i) = size(selected_data,2); % NB! could be different than the original gene length + end + data = [data data_allele(:,end)]; % add the index + % determine the component matrix + component_mat = zeros(m,max(genesize)); + cum = cumsum(genesize); + component_mat(1,1:genesize(1)) = 1:cum(1); + for i=2:m + component_mat(i,1:genesize(i)) = (cum(i-1)+1):cum(i); + end + end + % elseif isOK ==0 + % return + % end + otherwise + return + end + + %waitALittle; + display('---------------------------------------------------'); + fprintf(1,'Preprocessing the data ...'); + + % Make the missing data complete + % missing values are denoted as -999 + data = uint16(data); + % data = uint8(data); + data = makecomplete(data); + if isempty(data) + display('*** ERROR: Failed in completing the missing data'); + return; + end + + isRational = isTheLoadedNewDataRational(data); + if isRational == 0 + return; + end + + [data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = ... + handleData(data); + + + % Distance between individuals is computed as if the loci are + % independent. + [Z,dist] = newGetDistances(data,rowsFromInd); + fprintf(1,'Finished.\n'); + ninds = max(data(:,end)); + popnames = fixPopnames(popnames, ninds); + + c.data = uint16(data); c.rowsFromInd = rowsFromInd; c.alleleCodes = alleleCodes; + c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; + c.popnames = popnames; c.component_mat = component_mat; + c.dist = dist; c.Z = Z; + + %waitALittle; + save_preproc = questdlg('Do you wish to save the pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + %waitALittle; + [filename, pathname] = uiputfile('*.mat','Save pre-processed data as'); + if (sum(filename)==0) || (sum(pathname)==0) + return; + else + kokonimi = [pathname filename]; +% save(kokonimi,'c'); + save(kokonimi,'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + end + end; + + %waitALittle; + linkage_model = questdlg('Specify the linkage model',... + 'Specify the linkage model?',... + 'Linear','Codon', 'Independent', 'Linear'); + if isequal(linkage_model,'Linear') + linkage_model = 'linear'; + display('Linear model was selected.'); + elseif isequal(linkage_model,'Codon') + linkage_model = 'codon'; + display('Codon model was selected.'); + elseif isequal(linkage_model,'Independent') + display('Independent model was selected.'); + c.data = double(c.data); + greedyMix(c); + return; + else + dispCancel; + return; + end; + + % Data transformation + fprintf(1,'Transforming the data ...'); + index = data(:,end); + [data_clique, data_separator, noalle_clique, noalle_separator] = ... + transform4(data, component_mat, linkage_model); + data_clique = [data_clique index]; + data_separator = [data_separator index]; + + % Count the data + [counts_cq, nalleles_cq, prior_cq, adjprior_cq, genotypes_cq]... + = allfreqsnew2(data_clique, double(noalle_clique)); + [counts_sp, nalleles_sp, prior_sp, adjprior_sp, genotypes_sp]... + = allfreqsnew2(data_separator, double(noalle_separator)); + + counts_cq = uint8(counts_cq); + counts_sp = uint8(counts_sp); + fprintf(1,'Finished.\n'); + + clear data_clique data_separator + + save_preproc = questdlg('Do you wish to save the fully pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + %waitALittle; + [filename, pathname] = uiputfile('*.mat','Save fully pre-processed data as'); + if (sum(filename)==0) || (sum(pathname)==0) + return; + else + kokonimi = [pathname filename]; + c.counts_cq = counts_cq; c.adjprior_cq = adjprior_cq; + c.counts_sp = counts_sp; c.adjprior_sp = adjprior_sp; + c.linkage_model = linkage_model; +% save(kokonimi,'c'); + save(kokonimi,'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + end + end; + clear c; + + case 'BAPS-format' + + input_type = questdlg('Specify the format of your data: ',... + 'Specify BAPS Format', ... + 'BAPS sequence data', 'BAPS numeric data', 'BAPS sequence data'); + switch input_type + case 'BAPS numeric data' + %waitALittle; + setWindowOnTop(base,'false') + [filename,pathname] = uigetfile('*.txt', 'Load BAPS numeric data'); + if (sum(filename)==0) || (sum(pathname)==0) + %cancel was pressed; do nothing. + return; + end; + + display('---------------------------------------------------'); + display(['Reading BAPS numeric data from: ',[pathname filename],'...']); + + try + data = load([pathname filename]); + catch + disp('*** ERROR: Incorrect BAPS numerical data.'); + return + end + case 'BAPS sequence data' + waitALittle;%waitALittle; + [data, filename] = readbaps; + if isempty(data) + return + end + otherwise + return; + end + + h0 = findobj('Tag','filename1_text'); + set(h0,'String',filename); clear h0; + waitALittle;%waitALittle; + input_pops = questdlg(['When using data which are in BAPS-format, '... + 'you can specify the sampling populations of the individuals by '... + 'giving two additional files: one containing the names of the '... + 'populations, the other containing the indices of the first '... + 'individuals of the populations. Do you wish to specify the '... + 'sampling populations?'], ... + 'Specify sampling populations?',... + 'Yes', 'No', 'No'); + if isequal(input_pops,'Yes') + %waitALittle; + display('Reading name and index files...'); + setWindowOnTop(base,'false') + [namefile, namepath] = uigetfile('*.txt', 'Load population names'); + if namefile==0 + kysyToinen = 0; + else + kysyToinen = 1; + end + if kysyToinen==1 + %waitALittle; + setWindowOnTop(base,'false') + [indicesfile, indicespath] = uigetfile('*.txt', 'Load population indices'); + if indicesfile==0 + % popnames = []; + dispCancel; + return + else + popnames = initPopNames([namepath namefile],[indicespath indicesfile]); + end + else + % popnames = []; + dispCancel; + return + end + else + % popnames = []; + popnames(:,1) = num2cell(unique(data(:,end))); + popnames(:,2) = popnames(:,1); + ninds = max(data(:,end)); + popnames = fixPopnames(popnames, ninds); + end + + % check that popnames is correct + if isempty(popnames) + display('*** ERROR: error in reading popnames.') + return + end + + data = uint16(data); + % Check that the data is rational: + isRational = isTheLoadedNewDataRational(data); + if isRational == 0 + msgbox(['Loaded file contained incorrect data. The last column of the ' ... + 'data file must contain sampling unit identifiers. Identifier specifies the ' ... + 'unit from which the genetic data on that particular row was collected. ' ... + 'Identifiers must be positive integers. If the biggest unit identifier is i.e. 27, there must ' ... + 'be at least one row for each unit 1-27'] ,'Error', ... + 'error'); + disp('*** ERROR: Failed in loading the BAPS data.'); + return; + else + display(['# of haplotypes: ', num2str(size(data,1))]); + display(['# of loci: ', num2str(size(data,2)-1)]); + % display('Finished.'); + end; + + % Check if the data is discrete or continuous + if any(any(fix(data)~=data)) + disp('Found decimal numbers. Continuous model will be used.'); +% input_type = questdlg('Choose the method for the continuous data: ',... +% 'Specify the method', ... +% 'BEC', 'Gibbs sampling with WINBUGS','BEC'); +% switch input_type +% case 'BEC' +% disp('Using the BEC mixture model...'); +% becMixture(data, popnames); +% case 'Gibbs sampling with WINBUGS' +% disp('Preparing the WINBUGS code...'); +% isok = makeBUGS(data); +% if isok disp('Finished.'); +% +% end +% otherwise +% return +% end + disp('** CANCELLED: continuous model is under construction.'); + return; + end + + display('---------------------------------------------------'); + fprintf(1,'Preprocessing the data ...'); + % Make the missing data complete + data = makecomplete(data); + if isempty(data) + display('*** ERROR: Failed in completing the missing data'); + return; + end + + [data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = ... + handleData(data); + + % Distance between individuals is computed as if the loci are + % independent. + [Z,dist] = newGetDistances(data,rowsFromInd); + fprintf(1,'Finished.\n'); + + c.data = uint16(data); c.rowsFromInd = rowsFromInd; c.alleleCodes = alleleCodes; + c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; + c.dist = dist; c.popnames = popnames; + c.Z = Z; + + input_linkage = questdlg('Do you wish to load the linkage map?',... + 'Load Linkage Map',... + 'Yes','No','Yes'); + if isequal(input_linkage,'Yes'); + display('---------------------------------------------------'); + %%waitALittle; + setWindowOnTop(base,'false') + [linkage_filename, linkage_pathname] = uigetfile('*.txt', 'Load Linkage Map'); + + if isempty(linkage_filename) && isempty(linkage_pathname) + return; + else + display(['Reading linkage map from: ',[linkage_pathname linkage_filename],'...']); + end; + + try + component_mat = load([linkage_pathname linkage_filename]); + catch + disp('*** ERROR: Incorrect linkage map.'); + return; + end + + % Check if the linkage map matches the data + if (size(data,2)-1) ~= max(component_mat(:)) + msgbox(['Loading of the specified file was unsuccessful. ' ... + 'The linkage map dose not match with the data.'] ,'Error', ... + 'error'); + disp('*** ERROR: Failed in loading the linkage map.'); + return; + else + display(['# of linkage groups: ', num2str(size(component_mat,1))]); + end; + display('---------------------------------------------------'); + h0 = findobj('Tag','filename1_text'); + set(h0,'String',[filename '/' linkage_filename]); clear h0; + c.component_mat = component_mat; + else + display('Independent model was selected.'); + c.data = double(c.data); + greedyMix(c); + return; + end + + + save_preproc = questdlg('Do you wish to save the pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + %%waitALittle; + [filename, pathname] = uiputfile('*.mat','Save pre-processed data as'); + if (sum(filename)==0) || (sum(pathname)==0) + return; + end + kokonimi = [pathname filename]; +% save(kokonimi,'c'); + save(kokonimi,'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + end; + + linkage_model = questdlg('Specify the linkage model',... + 'Specify the linkage model?',... + 'Linear','Codon', 'Independent', 'Linear'); + if isequal(linkage_model,'Linear') + linkage_model = 'linear'; + display('Linear model was selected.'); + elseif isequal(linkage_model,'Codon') + linkage_model = 'codon'; + display('Codon model was selected.'); + elseif isequal(linkage_model,'Independent') + display('Independent model was selected.'); + c.data = double(c.data); + greedyMix(c); + return; + else + dispCancel; + return; + end; + + % Data transformation + % display('---------------------------------------------------'); + fprintf(1,'Transforming the data ...'); + index = data(:,end); + [data_clique, data_separator, noalle_clique, noalle_separator] = ... + transform4(data, component_mat, linkage_model); + data_clique = [data_clique index]; + data_separator = [data_separator index]; + + [counts_cq, nalleles_cq, prior_cq, adjprior_cq, genotypes_cq]... + = allfreqsnew2(data_clique, double(noalle_clique)); + clear data_clique; + [counts_sp, nalleles_sp, prior_sp, adjprior_sp, genotypes_sp]... + = allfreqsnew2(data_separator, double(noalle_separator)); + clear data_separator; + counts_cq = uint8(counts_cq); + counts_sp = uint8(counts_sp); + fprintf(1,'Finished.\n'); + + save_preproc = questdlg('Do you wish to save the fully pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + %waitALittle; + [filename, pathname] = uiputfile('*.mat','Save fully pre-processed data as'); + if (sum(filename)==0) || (sum(pathname)==0) + return; + end + kokonimi = [pathname filename]; + c.counts_cq = counts_cq; c.adjprior_cq = adjprior_cq; + c.counts_sp = counts_sp; c.adjprior_sp = adjprior_sp; + c.linkage_model = linkage_model; +% save(kokonimi,'c'); + save(kokonimi,'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + end; + clear c; + + case 'Pre-processed data' + % This is basically the same format as the "Pre-processed data" in + % the basic clustering. The only difference is that the file + % includes also the component_mat + % %waitALittle; + setWindowOnTop(base,'false') + [filename, pathname] = uigetfile('*.mat', 'Load pre-processed data'); + if filename==0 + return; + end + display('---------------------------------------------------'); + display(['Reading preprocessed data from: ',[pathname filename],'...']); + h0 = findobj('Tag','filename1_text'); + set(h0,'String',filename); clear h0; + + struct_array = load([pathname filename]); + if isfield(struct_array,'c') %Matlab versio + c = struct_array.c; + if ~isfield(c,'dist') + display('*** ERROR: Incorrect file format'); + return + end + clear struct_array; + elseif isfield(struct_array,'dist') %Mideva versio + c = struct_array; + clear struct_array; + else + display('*** ERROR: Incorrect file format'); + return; + end + + % The following are the same as in the basic clustering + data = c.data; popnames = c.popnames; Z = c.Z; + noalle = c.noalle; adjprior = c.adjprior; + rowsFromInd = c.rowsFromInd; alleleCodes = c.alleleCodes; + dist = c.dist; priorTerm = c.priorTerm; + + if ~isfield(c,'component_mat') + display('*** ERROR: Incorrect file format'); + return + end + + % This is new + component_mat = c.component_mat; + data = uint16(data); + + display(['# of haplotypes: ', num2str(size(data,1))]); + display(['# of loci: ', num2str(size(data,2)-1)]); + display(['# of linkage groups: ', num2str(size(component_mat,1))]); + + if ~isfield(c, 'linkage_model') + %%%waitALittle; + % Independent is not an option, since it can be computed with the + % basic clustering which is much faster + linkage_model = questdlg('Specify the linkage model',... + 'Specify the linkage model?',... + 'Linear','Codon','Linear'); + if isequal(linkage_model,'Linear') + linkage_model = 'linear'; + display('Linear model was selected.'); + elseif isequal(linkage_model,'Codon') + linkage_model = 'codon'; + display('Codon model was selected.'); + else + dispCancel; + return; + end; + + clear c; % save the memory usage + + % Data transformation + fprintf(1,'Transforming the data ...'); + index = data(:,end); +% [data_clique, data_separator, noalle_clique, noalle_separator] = ... +% transform2(data, component_mat, linkage_model); + [data_clique, data_separator, noalle_clique, noalle_separator] = ... + transform4(data, component_mat, linkage_model); + data_clique = [data_clique index]; + data_separator = [data_separator index]; + + [counts_cq, nalleles_cq, prior_cq, adjprior_cq, genotypes_cq]... + = allfreqsnew2(data_clique, double(noalle_clique)); + clear data_clique; + [counts_sp, nalleles_sp, prior_sp, adjprior_sp, genotypes_sp]... + = allfreqsnew2(data_separator, double(noalle_separator)); + clear data_separator; + counts_cq = uint8(counts_cq); + counts_sp = uint8(counts_sp); + fprintf(1,'Finished.\n'); + + save_preproc = questdlg('Do you wish to save the fully pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + %%%waitALittle; + [filename, pathname] = uiputfile('*.mat','Save fully pre-processed data as'); + if (sum(filename)==0) || (sum(pathname)==0) + return; + end + + kokonimi = [pathname filename]; c.data = data; + c.counts_cq = counts_cq; c.adjprior_cq = adjprior_cq; + c.counts_sp = counts_sp; c.adjprior_sp = adjprior_sp; + c.linkage_model = linkage_model; + c.rowsFromInd = rowsFromInd; + c.alleleCodes = alleleCodes; + c.noalle = noalle; + c.adjprior = adjprior; + c.priorTerm = priorTerm; + c.popnames = popnames; + c.component_mat = component_mat; + c.dist = dist; c.Z = Z; +% save(kokonimi,'c'); + save(kokonimi,'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + end; + clear c; + else + %Linkage model is specified in the preprocessed file. + counts_cq = c.counts_cq; adjprior_cq = c.adjprior_cq; + counts_sp = c.counts_sp; adjprior_sp = c.adjprior_sp; + linkage_model = c.linkage_model; + counts_cq = uint8(counts_cq); + counts_sp = uint8(counts_sp); + clear c; + display(['linkage model: ', linkage_model]); + end + + otherwise + return; +end + + +global POP_LOGML; global PARTITION; +global CQ_COUNTS; global SP_COUNTS; %These counts are for populations +global CQ_SUMCOUNTS; global SP_SUMCOUNTS; %not for individuals +clearGlobalVars; + +c.noalle = noalle; +c.adjprior = adjprior; %priorTerm = c.priorTerm; +c.rowsFromInd = rowsFromInd; +c.counts_cq = counts_cq; +c.adjprior_cq = adjprior_cq; +c.counts_sp = counts_sp; +c.adjprior_sp = adjprior_sp; +c.dist = dist; c.Z = Z; + +% partition compare mode +if ~isempty(partitionCompare) + partitions = partitionCompare.partitions; + npartitions = size(partitions,2); + partitionLogml = zeros(1,npartitions); + for i = 1:npartitions + % number of unique partition lables + try + [cq_counts, cq_sumcounts] = ... + initialCounts(counts_cq, partitions(:,i)); + [sp_counts, sp_sumcounts] = initialCounts(counts_sp, partitions(:,i)); + partitionLogml(i) = computeLogml(adjprior_cq, adjprior_sp, ... + cq_counts, cq_sumcounts, ... + sp_counts, sp_sumcounts); + catch + disp('*** ERROR: unmatched data.'); + return + end + end + % return the logml result + partitionCompare.logmls = partitionLogml; + set(h1, 'userdata', partitionCompare); + return +end + +if fixedK + [logml, npops, partitionSummary] = linkageMix_fixK(c); +else + [logml, npops, partitionSummary] = linkageMix(c); +end + +if logml==1 + return; +end + +h0 = findobj('Tag','filename1_text'); inp = get(h0,'String'); +h0 = findobj('Tag','filename2_text'); +outp = get(h0,'String'); + +clear c; % save the memory +%This is basically the same as in BAPS 3. +changesInLogml = writeMixtureInfo(logml, counts_cq, counts_sp, adjprior_cq, ... + adjprior_sp, outp, inp, partitionSummary, popnames, linkage_model, ... + fixedK); + +viewMixPartition(PARTITION, popnames); + +% --------------------------------------------------------------------- +% Save the result. +% Jing - 26.12.2005 +talle = questdlg(['Do you want to save the mixture populations ' ... + 'so that you can use them later in admixture analysis?'], ... + 'Save results?','Yes','No','Yes'); +if isequal(talle,'Yes') + %%%waitALittle; + [filename, pathname] = uiputfile('*.mat','Save results as'); + + if (sum(filename)==0) || (sum(pathname)==0) + % Cancel was pressed + return + else % copy 'baps4_output.baps' into the text file with the same name. + if exist('baps4_output.baps','file') + copyfile('baps4_output.baps',[pathname filename '.txt']) + delete('baps4_output.baps') + end + end + + [sumcounts, counts] = indLociCounts(PARTITION, data, npops, noalle); + % NB! Index column is removed in data matrix. + c.PARTITION = PARTITION; c.CQ_COUNTS = CQ_COUNTS; c.CQ_SUMCOUNTS = CQ_SUMCOUNTS; + c.SP_COUNTS = SP_COUNTS; c.SP_SUMCOUNTS = SP_SUMCOUNTS; + c.alleleCodes = alleleCodes; c.adjprior_cq = adjprior_cq; c.adjprior_sp = adjprior_sp; c.popnames = popnames; + c.rowsFromInd = rowsFromInd; c.data = uint16(data); c.npops = npops; + % c.nalleles_cq = nalleles_cq; c.nalleles_sp = nalleles_sp; + if strcmp(linkage_model,'linear') % Added on 03.11.06 + c.mixtureType = 'linear_mix'; + elseif strcmp(linkage_model,'codon') + c.mixtureType = 'codon_mix'; + end + c.changesInLogml = changesInLogml; % this variable stores the change of likelihoods. + % [ncluster ninds] + % -Added on 02.11.2006 + + % The next ones are for the admixture input + c.COUNTS = counts; c.SUMCOUNTS = sumcounts; + c.adjprior = adjprior; c.rowsFromInd = rowsFromInd; c.noalle = noalle; c.alleleCodes = alleleCodes; + + % The two variables below are for the new linkage admixture model + c.gene_lengths = calcGeneLengths(component_mat); + + % The logml is saved for parallel computing + c.logml = logml; + + fprintf(1,'Saving the result...') + try +% save([pathname filename], 'c'); + save([pathname filename], 'c', '-v7.3'); % added by Lu Cheng, 08.06.2012 + fprintf(1,'Finished.\n'); + catch + display('*** ERROR in saving the result.'); + end +else + if exist('baps4_output.baps','file') + delete('baps4_output.baps') + end +end +% ----------------------------------------------------------------------- + + +%-------------------------------------------------------------------------- +% The next three functions are for computing the initial partition +% according to the distance between the individuals + +function initial_partition=admixture_initialization(nclusters,Z) +T=cluster_own(Z,nclusters); +initial_partition=T; + +%-------------------------------------------------------------------------- +function T = cluster_own(Z,nclust) +% true=logical(1); +% false=logical(0); + +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); +% maximum number of clusters based on inconsistency +if m <= maxclust + T = (1:m)'; +elseif maxclust==1 + T = ones(m,1); +else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end +end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + +%-------------------------------------------------------------------------- + +function dist2 = laskeOsaDist(inds2, dist, ninds) +% Muodostaa dist vektorista osavektorin, joka sisältää yksilöiden inds2 +% väliset etäisyydet. ninds=kaikkien yksilöiden lukumäär? + +ninds2 = length(inds2); +apu = zeros(nchoosek(ninds2,2),2); +rivi = 1; +for i=1:ninds2-1 + for j=i+1:ninds2 + apu(rivi, 1) = inds2(i); + apu(rivi, 2) = inds2(j); + rivi = rivi+1; + end +end +apu = (apu(:,1)-1).*ninds - apu(:,1) ./ 2 .* (apu(:,1)-1) + (apu(:,2)-apu(:,1)); +dist2 = dist(apu); + +%-------------------------------------------------------------------------- + +function Z = computeLinkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 || m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +% monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end + +%-------------------------------------------------------------------------- + +function changes = computeChanges(ind, adjprior_cq, adjprior_sp, ... + indCqCounts, indSpCounts) +% Computes changes in log-marginal likelihood if individual ind is +% moved to another population +% +% Input: +% ind - the individual to be moved +% adjprior_cq & _sp - adjpriors for cliques and separators +% indCqCounts, indSpCounts - counts for individual ind +% +% Output: +% changes - table of size 1*npops. changes(i) = difference in logml if +% ind is move to population i. + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; +npops = size(CQ_COUNTS,3); +changes = zeros(npops,1); + +i1 = PARTITION(ind); +i1_logml = POP_LOGML(i1); +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + +new_i1_logml = computePopulationLogml(i1, adjprior_cq, adjprior_sp); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)+indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)+sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)+indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)+sumSp; + + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = POP_LOGML(i2); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+repmat(indCqCounts, [1 1 npops-1]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+repmat(sumCq,[npops-1 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+repmat(indSpCounts, [1 1 npops-1]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:) + repmat(sumSp,[npops-1 1]); + +new_i2_logml = computePopulationLogml(i2, adjprior_cq, adjprior_sp); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)-repmat(indCqCounts, [1 1 npops-1]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)-repmat(sumCq,[npops-1 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)-repmat(indSpCounts, [1 1 npops-1]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:) - repmat(sumSp,[npops-1 1]); +% a = repmat(sumSp,[npops-1 1]); + +changes(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + +%------------------------------------------------------------------------------------ + +function changes = computeChanges2(i1, adjprior_cq, adjprior_sp) +% Computes changes in log marginal likelihood if population i1 is combined +% with another population +% +% Input: +% i1 - the population to be combined +% adjprior_cq & _sp - adjpriors for cliques and separators +% +% Output: +% changes - table of size 1*npops. changes(i) = difference in logml if +% i1 is combined with population i. + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global POP_LOGML; +npops = size(CQ_COUNTS,3); +changes = zeros(npops,1); + +i1_logml = POP_LOGML(i1); +indCqCounts = CQ_COUNTS(:,:,i1); +indSpCounts = SP_COUNTS(:,:,i1); +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +new_i1_logml = 0; + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = POP_LOGML(i2); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+repmat(indCqCounts, [1 1 npops-1]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+repmat(sumCq,[npops-1 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+repmat(indSpCounts, [1 1 npops-1]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+ repmat(sumSp,[npops-1 1]); +% a = repmat(sumSp,[npops-1 1]); +% if ~any(sumSp) +% a(:,[1:size(a,2)])=[]; +% end +% SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+ a ; + + +new_i2_logml = computePopulationLogml(i2, adjprior_cq, adjprior_sp); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)-repmat(indCqCounts, [1 1 npops-1]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)-repmat(sumCq,[npops-1 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)-repmat(indSpCounts, [1 1 npops-1]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)- repmat(sumSp,[npops-1 1]); + +changes(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + + + + +%------------------------------------------------------------------------------------ + + +function changes = computeChanges3(T2, inds2, i1, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp) +% Computes changes in log marginal likelihood if subpopulation of i2 is +% moved to another population +% +% Input: +% T2 - partition of inds2 to subpopulations +% inds2 - individuals in population i1 +% i2 +% counts_cq, counts_sp - counts for individuals +% +% Output: +% changes - table of size length(unique(T2))*npops. +% changes(i,j) = difference in logml if subpopulation inds2(find(T2==i)) of +% i2 is moved to population j + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global POP_LOGML; +npops = size(CQ_COUNTS,3); +npops2 = length(unique(T2)); +changes = zeros(npops2,npops); + +%cq_counts = CQ_COUNTS; +%sp_counts = SP_COUNTS; +%cq_sumcounts = CQ_SUMCOUNTS; +%sp_sumcounts = SP_SUMCOUNTS; + + +i1_logml = POP_LOGML(i1); + +for pop2 = 1:npops2 + % inds = inds2(find(T2==pop2)); + inds = inds2(logical(T2==pop2)); + ninds = length(inds); + if ninds>0 + indCqCounts = uint16(sum(counts_cq(:,:,inds),3)); + indSpCounts = uint16(sum(counts_sp(:,:,inds),3)); + sumCq = uint16(sum(indCqCounts,1)); + sumSp = uint16(sum(indSpCounts,1)); + + CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; + CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; + SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; + SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + + new_i1_logml = computePopulationLogml(i1, adjprior_cq, adjprior_sp); + + CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)+indCqCounts; + CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)+sumCq; + SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)+indSpCounts; + SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)+sumSp; + + i2 = [1:i1-1 , i1+1:npops]; + i2_logml = POP_LOGML(i2)'; + + CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+repmat(indCqCounts, [1 1 npops-1]); + CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+repmat(sumCq,[npops-1 1]); + SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+repmat(indSpCounts, [1 1 npops-1]); + SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+ repmat(sumSp,[npops-1 1]); + + new_i2_logml = computePopulationLogml(i2, adjprior_cq, adjprior_sp)'; + + CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)-repmat(indCqCounts, [1 1 npops-1]); + CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)-repmat(sumCq,[npops-1 1]); + SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)-repmat(indSpCounts, [1 1 npops-1]); + SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)- repmat(sumSp,[npops-1 1]); + + changes(pop2,i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + end +end + +%-------------------------------------------------------------------------- + +function changes = computeChanges5(inds, i1, i2, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp) +% Computes change in logml if individual of inds is moved between +% populations i1 and i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global POP_LOGML; global PARTITION; + +ninds = length(inds); +changes = zeros(ninds,1); + +i1_logml = POP_LOGML(i1); +i2_logml = POP_LOGML(i2); + +for i = 1:ninds + ind = inds(i); + if PARTITION(ind)==i1 + pop1 = i1; %from + pop2 = i2; %to + else + pop1 = i2; + pop2 = i1; + end + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + sumCq = uint16(sum(indCqCounts,1)); + sumSp = uint16(sum(indSpCounts,1)); + + CQ_COUNTS(:,:,pop1) = CQ_COUNTS(:,:,pop1)-indCqCounts; + CQ_SUMCOUNTS(pop1,:) = CQ_SUMCOUNTS(pop1,:)-sumCq; + SP_COUNTS(:,:,pop1) = SP_COUNTS(:,:,pop1)-indSpCounts; + SP_SUMCOUNTS(pop1,:) = SP_SUMCOUNTS(pop1,:) - sumSp; + + CQ_COUNTS(:,:,pop2) = CQ_COUNTS(:,:,pop2)+indCqCounts; + CQ_SUMCOUNTS(pop2,:) = CQ_SUMCOUNTS(pop2,:)+sumCq; + SP_COUNTS(:,:,pop2) = SP_COUNTS(:,:,pop2)+indSpCounts; + SP_SUMCOUNTS(pop2,:) = SP_SUMCOUNTS(pop2,:) + sumSp; + + new_logmls = computePopulationLogml([i1 i2], adjprior_cq, adjprior_sp); + changes(i) = sum(new_logmls); + + CQ_COUNTS(:,:,pop1) = CQ_COUNTS(:,:,pop1)+indCqCounts; + CQ_SUMCOUNTS(pop1,:) = CQ_SUMCOUNTS(pop1,:)+sumCq; + SP_COUNTS(:,:,pop1) = SP_COUNTS(:,:,pop1)+indSpCounts; + SP_SUMCOUNTS(pop1,:) = SP_SUMCOUNTS(pop1,:)+sumSp; + CQ_COUNTS(:,:,pop2) = CQ_COUNTS(:,:,pop2)-indCqCounts; + CQ_SUMCOUNTS(pop2,:) = CQ_SUMCOUNTS(pop2,:)-sumCq; + SP_COUNTS(:,:,pop2) = SP_COUNTS(:,:,pop2)-indSpCounts; + SP_SUMCOUNTS(pop2,:) = SP_SUMCOUNTS(pop2,:)-sumSp; +end + +changes = changes - i1_logml - i2_logml; + + +%------------------------------------------------------------------------------------- + + +function updateGlobalVariables(ind, i2, indCqCounts, indSpCounts, ... + adjprior_cq, adjprior_sp) +% Updates global variables when individual ind is moved to population i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; + +i1 = PARTITION(ind); +PARTITION(ind)=i2; + +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+indCqCounts; +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+sumCq; +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+indSpCounts; +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+sumSp; + + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior_cq, adjprior_sp); + + +%--------------------------------------------------------------------------------- + + +function updateGlobalVariables2(i1, i2, adjprior_cq, adjprior_sp) +% Updates global variables when all individuals from population i1 are moved +% to population i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; + +% inds = find(PARTITION==i1); +% PARTITION(inds) = i2; +PARTITION(logical(PARTITION==i1)) = i2; + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+CQ_COUNTS(:,:,i1); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+CQ_SUMCOUNTS(i1,:); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+SP_COUNTS(:,:,i1); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+SP_SUMCOUNTS(i1,:); + +CQ_COUNTS(:,:,i1) = 0; +CQ_SUMCOUNTS(i1,:) = 0; +SP_COUNTS(:,:,i1) = 0; +SP_SUMCOUNTS(i1,:) = 0; + +POP_LOGML(i1) = 0; +POP_LOGML(i2) = computePopulationLogml(i2, adjprior_cq, adjprior_sp); + + +%------------------------------------------------------------------------------------ + + +function updateGlobalVariables3(muuttuvat, i2, indCqCounts, indSpCounts, ... + adjprior_cq, adjprior_sp) +% Updates global variables when individuals muuttuvat are moved to +% population i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; + +i1 = PARTITION(muuttuvat(1)); +PARTITION(muuttuvat) = i2; + +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+indCqCounts; +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+sumCq; +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+indSpCounts; +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+sumSp; + + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior_cq, adjprior_sp); + +%---------------------------------------------------------------------- + + +function inds = returnInOrder(inds, pop, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp) +% Returns individuals inds in order according to the change in the logml if +% they are moved out of the population pop + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; + +ninds = length(inds); +apuTaulu = [inds, zeros(ninds,1)]; + +for i=1:ninds + ind = inds(i); + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + sumCq = uint16(sum(indCqCounts,1)); + sumSp = uint16(sum(indSpCounts,1)); + + CQ_COUNTS(:,:,pop) = CQ_COUNTS(:,:,pop)-indCqCounts; + CQ_SUMCOUNTS(pop,:) = CQ_SUMCOUNTS(pop,:)-sumCq; + SP_COUNTS(:,:,pop) = SP_COUNTS(:,:,pop)-indSpCounts; + SP_SUMCOUNTS(pop,:) = SP_SUMCOUNTS(pop,:)-sumSp; + + apuTaulu(i, 2) = computePopulationLogml(pop, adjprior_cq, adjprior_sp); + + CQ_COUNTS(:,:,pop) = CQ_COUNTS(:,:,pop)+indCqCounts; + CQ_SUMCOUNTS(pop,:) = CQ_SUMCOUNTS(pop,:)+sumCq; + SP_COUNTS(:,:,pop) = SP_COUNTS(:,:,pop)+indSpCounts; + SP_SUMCOUNTS(pop,:) = SP_SUMCOUNTS(pop,:)+sumSp; +end +apuTaulu = sortrows(apuTaulu,2); +inds = apuTaulu(ninds:-1:1,1); + + +%------------------------------------------------------------------------- + +function [Z, dist] = newGetDistances(data, rowsFromInd) + +ninds = max(data(:,end)); +nloci = size(data,2)-1; +riviLkm = nchoosek(double(ninds),2); + +% empties = find(data<0); +% data(empties)=0; +data(logical(data<0)) = 0; +data = uint16(data); + +pariTaulu = zeros(riviLkm,2); +aPointer=1; + +for a=1:ninds-1 + pariTaulu(aPointer:aPointer+double(ninds-1-a),1) = ones(ninds-a,1,'uint16')*a; + pariTaulu(aPointer:aPointer+double(ninds-1-a),2) = uint16((a+1:ninds)'); + aPointer = aPointer+double(ninds-a); +end + +eka = pariTaulu(:,ones(1,rowsFromInd)); +eka = eka * rowsFromInd; +miinus = repmat(rowsFromInd-1 : -1 : 0, [riviLkm 1]); +eka = eka - miinus; + +toka = pariTaulu(:,ones(1,rowsFromInd)*2); +toka = toka * rowsFromInd; +toka = toka - miinus; + +eka = uint16(eka); +toka = uint16(toka); + +clear pariTaulu; clear miinus; + +summa = uint16(zeros(riviLkm,1)); +vertailuja = uint16(zeros(riviLkm,1)); + +x = zeros(size(eka)); x = uint16(x); +y = zeros(size(toka)); y = uint16(y); +% fprintf(1,'%%10'); +for j=1:nloci; + + for k=1:rowsFromInd + x(:,k) = data(eka(:,k),j); + y(:,k) = data(toka(:,k),j); + end + + for a=1:rowsFromInd + for b=1:rowsFromInd + vertailutNyt = uint16(x(:,a)>0 & y(:,b)>0); + vertailuja = vertailuja + vertailutNyt; + lisays = (x(:,a)~=y(:,b) & vertailutNyt); + summa = summa + uint16(lisays); + end + end + % fprintf(1,'\b\b'); + % fprintf(1,'%d',floor(10+80*j/nloci)); +end + +clear x; clear y; clear vertailutNyt; +clear eka; clear toka; clear data; clear lisays; +dist = zeros(length(vertailuja),1); +% nollat = find(vertailuja==0); +% dist(nollat) = 1; +dist(logical(vertailuja==0)) = 1; +muut = find(vertailuja>0); +dist(muut) = double(summa(muut))./double(vertailuja(muut)); +clear summa; clear vertailuja; clear muut; + +Z = computeLinkage(dist'); +% fprintf(1,'\b\b'); +% fprintf(1,'%d\n',100); +%-------------------------------------------------------------------------- + +function clearGlobalVars + +global CQ_COUNTS; CQ_COUNTS = []; +global CQ_SUMCOUNTS; CQ_SUMCOUNTS = []; +global SP_COUNTS; SP_COUNTS = []; +global SP_SUMCOUNTS; SP_SUMCOUNTS = []; +global PARTITION; PARTITION = []; +global POP_LOGML; POP_LOGML = []; + +%-------------------------------------------------------------------------- + +function npops = removeEmptyPops +% Removes empty pops from all global COUNTS variables. +% Updates PARTITION and npops + +global CQ_COUNTS; +global CQ_SUMCOUNTS; +global SP_COUNTS; +global SP_SUMCOUNTS; +global PARTITION; + +notEmpty = find(any(CQ_SUMCOUNTS,2)); +CQ_COUNTS = CQ_COUNTS(:,:,notEmpty); +CQ_SUMCOUNTS = CQ_SUMCOUNTS(notEmpty,:); +SP_COUNTS = SP_COUNTS(:,:,notEmpty); +SP_SUMCOUNTS = SP_SUMCOUNTS(notEmpty,:); + +for n=1:length(notEmpty) +% apu = find(PARTITION==notEmpty(n)); +% PARTITION(apu)=n; +PARTITION(logical(PARTITION==notEmpty(n))) = n; +end +npops = length(notEmpty); + +%-------------------------------------------------------------------------- + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex) +% Tiedetään, ett?annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ss?ei viel?ole +% annettua logml arvoa, niin lisätään worstIndex:in kohtaan uusi logml ja +% nykyist?partitiota vastaava nclusters:in arvo. Muutoin ei tehd?mitään. +global PARTITION; +apu = isempty(find(abs(partitionSummary(:,2)-logml)<1e-5,1)); +if apu + % Nyt löydetty partitio ei ole viel?kirjattuna summaryyn. + npops = length(unique(PARTITION)); + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end + +%-------------------------------------------------------------------------- +function [counts, sumcounts] = initialCounts(ind_counts, PARTITION) +% This version is for partition compare mode + +pops = unique(PARTITION); +npops = max(pops); + +counts = zeros(size(ind_counts,1), size(ind_counts,2), npops,'uint16'); +sumcounts = zeros(npops, size(ind_counts,2),'uint16'); + +for i = 1:npops + inds = find(PARTITION == i); + counts(:,:,i) = sum(ind_counts(:,:,inds), 3); + sumcounts(i,:) = sum(counts(:,:,i),1); +end +%-------------------------------------------------------------------------- + + +function logml = computeLogml(adjprior_cq, adjprior_sp, ... + CQ_COUNTS, CQ_SUMCOUNTS, ... + SP_COUNTS, SP_SUMCOUNTS) + + % for partition compare mode +cq_counts = double(CQ_COUNTS); +cq_sumcounts = double(CQ_SUMCOUNTS); +sp_counts = double(SP_COUNTS); +sp_sumcounts = double(SP_SUMCOUNTS); + +npops = size(CQ_COUNTS, 3); + +cq_logml = sum(sum(sum(gammaln(cq_counts+repmat(adjprior_cq,[1 1 npops]))))) ... + - npops*sum(sum(gammaln(adjprior_cq))) - ... + sum(sum(gammaln(1+cq_sumcounts))); + +sp_logml = sum(sum(sum(gammaln(sp_counts+repmat(adjprior_sp,[1 1 npops]))))) ... + - npops*sum(sum(gammaln(adjprior_sp))) - ... + sum(sum(gammaln(1+sp_sumcounts))); + +logml = cq_logml - sp_logml; +clear cq_counts cq_sumcounts sp_counts sp_sumcounts; + +%-------------------------------------------------------------------------- + +function popLogml = computePopulationLogml(pops, adjprior_cq, adjprior_sp) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; + +cq_counts = double(CQ_COUNTS); +cq_sumcounts = double(CQ_SUMCOUNTS); +sp_counts = double(SP_COUNTS); +sp_sumcounts = double(SP_SUMCOUNTS); + +nall_cq = size(CQ_COUNTS,1); +nall_sp = size(SP_COUNTS, 1); +ncliq = size(CQ_COUNTS,2); +nsep = size(SP_COUNTS, 2); + +z = length(pops); + +popLogml_cq = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior_cq,[1 1 z]) + cq_counts(:,:,pops)) ... + ,[nall_cq ncliq z]),1),2)) - sum(gammaln(1+cq_sumcounts(pops,:)),2) - ... + sum(sum(gammaln(adjprior_cq))); + +popLogml_sp = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior_sp,[1 1 z]) + sp_counts(:,:,pops)) ... + ,[nall_sp nsep z]),1),2)) - sum(gammaln(1+sp_sumcounts(pops,:)),2) - ... + sum(sum(gammaln(adjprior_sp))); + +popLogml = popLogml_cq - popLogml_sp; +clear cq_counts cq_sumcounts sp_counts sp_sumcounts; + +%------------------------------------------------------------------- + + +function changesInLogml = writeMixtureInfo(logml, counts_cq, counts_sp, adjprior_cq, ... + adjprior_sp, outPutFile, inputFile, partitionSummary, popnames, linkage_model,... + fixedK) + +global PARTITION; +global CQ_COUNTS; + +%global CQ_SUMCOUNTS; +%global SP_COUNTS; global SP_SUMCOUNTS; +ninds = length(PARTITION); +npops = size(CQ_COUNTS,3); +names = (size(popnames,1) == ninds); %Tarkistetaan ett?nimet viittaavat yksilöihin + +if length(outPutFile)>0 + fid = fopen(outPutFile,'a'); +else + fid = -1; + diary('baps4_output.baps'); % save in text anyway. +end + +dispLine; +disp('RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:'); +disp(['Data file/ Linkage map: ' inputFile]); +disp(['Model: Codon']); +disp(['Number of clustered individuals: ' ownNum2Str(ninds)]); +disp(['Number of groups in optimal partition: ' ownNum2Str(npops)]); +disp(['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); +disp(' '); +if (fid ~= -1) + fprintf(fid,'%s \n', ['RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:']); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Data file: ' inputFile]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of clustered individuals: ' ownNum2Str(ninds)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of groups in optimal partition: ' ownNum2Str(npops)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); fprintf(fid,'\n'); +end + +cluster_count = length(unique(PARTITION)); +disp('Best Partition: '); +if (fid ~= -1) + fprintf(fid,'%s \n','Best Partition: '); fprintf(fid,'\n'); +end +for m=1:cluster_count + indsInM = find(PARTITION==m); + length_of_beginning = 11 + floor(log10(m)); + cluster_size = length(indsInM); + + if names + text = ['Cluster ' num2str(m) ': {' char(popnames{indsInM(1)})]; + for k = 2:cluster_size + text = [text ', ' char(popnames{indsInM(k)})]; + end; + else + text = ['Cluster ' num2str(m) ': {' num2str(indsInM(1))]; + for k = 2:cluster_size + text = [text ', ' num2str(indsInM(k))]; + end; + end + text = [text '}']; + while length(text)>58 + %Take one line and display it. + new_line = takeLine(text,58); + text = text(length(new_line)+1:end); + disp(new_line); + if (fid ~= -1) + fprintf(fid,'%s \n',new_line); + fprintf(fid,'\n'); + end + if length(text)>0 + text = [blanks(length_of_beginning) text]; + else + text = []; + end; + end; + if ~isempty(text) + disp(text); + if (fid ~= -1) + fprintf(fid,'%s \n',text); + fprintf(fid,'\n'); + end + end; +end + +if npops == 1 + changesInLogml = []; +else + disp(' '); + disp(' '); + disp('Changes in log(marginal likelihood) if indvidual i is moved to group j:'); + if (fid ~= -1) + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', 'Changes in log(marginal likelihood) if indvidual i is moved to group j:'); fprintf(fid, '\n'); + end + + if names + nameSizes = zeros(ninds,1); + for i = 1:ninds + nimi = char(popnames{i}); + nameSizes(i) = length(nimi); + end + maxSize = max(nameSizes); + maxSize = max(maxSize, 5); + erotus = maxSize - 5; + alku = blanks(erotus); + ekarivi = [alku ' ind' blanks(6+erotus)]; + else + ekarivi = ' ind '; + end + + for i = 1:cluster_count + ekarivi = [ekarivi ownNum2Str(i) blanks(8-floor(log10(i)))]; + end + disp(ekarivi); + if (fid ~= -1) + fprintf(fid, '%s \n', ekarivi); fprintf(fid, '\n'); + end + + %ninds = size(data,1)/rowsFromInd; + changesInLogml = zeros(npops,ninds); + for ind = 1:ninds + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + changesInLogml(:,ind) = computeChanges(ind, adjprior_cq, ... + adjprior_sp, indCqCounts, indSpCounts); + + % transform the logml change to conditional posterior probabilities + % Added by Lu Cheng, 28.03.2010 + tmp_changelogml = exp(changesInLogml(:,ind)); + changesInLogml(:,ind) = tmp_changelogml ./ sum(tmp_changelogml); + clear tmp_changelogml; + %---------------------- + + if names + nimi = char(popnames{ind}); + rivi = [blanks(maxSize - length(nimi)) nimi ':']; + else + rivi = [blanks(4-floor(log10(ind))) ownNum2Str(ind) ':']; + end + for j = 1:npops + rivi = [rivi ' ' logml2String(omaRound(changesInLogml(j,ind)))]; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', rivi); fprintf(fid, '\n'); + end + end + + + % % KL-divergence has to be calculated otherwise... + % % { + % disp(' '); disp(' '); + % disp('KL-divergence matrix:'); + % + % if (fid ~= -1) + % fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + % fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + % fprintf(fid, '%s \n', ['KL-divergence matrix:']); fprintf(fid, '\n'); + % end + % + % maxnoalle = size(COUNTS,1); + % nloci = size(COUNTS,2); + % d = zeros(maxnoalle, nloci, npops); + % prior = adjprior; + % prior(find(prior==1))=0; + % nollia = find(all(prior==0)); %Lokukset, joissa oli havaittu vain yht?alleelia. + % prior(1,nollia)=1; + % for pop1 = 1:npops + % d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))+prior) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))+prior),maxnoalle,1); + % dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1); + % end + % ekarivi = blanks(7); + % for pop = 1:npops + % ekarivi = [ekarivi num2str(pop) blanks(7-floor(log10(pop)))]; + % end + % disp(ekarivi); + % if (fid ~= -1) + % fprintf(fid, '%s \n', [ekarivi]); fprintf(fid, '\n'); + % end + % + % for pop1 = 1:npops + % rivi = [blanks(2-floor(log10(pop1))) num2str(pop1) ' ']; + % for pop2 = 1:pop1-1 + % dist1 = d(:,:,pop1); dist2 = d(:,:,pop2); + % div12 = sum(sum(dist1.*log2((dist1+10^-10) ./ (dist2+10^-10))))/nloci; + % div21 = sum(sum(dist2.*log2((dist2+10^-10) ./ (dist1+10^-10))))/nloci; + % div = (div12+div21)/2; + % rivi = [rivi kldiv2str(div) ' ']; + % end + % disp(rivi); + % if (fid ~= -1) + % fprintf(fid, '%s \n', [rivi]); fprintf(fid, '\n'); + % end + % end + % % } +end + +disp(' '); +disp(' '); +disp('List of sizes of 10 best visited partitions and corresponding log(ml) values'); + +if (fid ~= -1) + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', 'List of sizes of 10 best visited partitions and corresponding log(ml) values'); fprintf(fid, '\n'); +end + +partitionSummary = sortrows(partitionSummary,2); +partitionSummary = partitionSummary(size(partitionSummary,1):-1:1 , :); +% partitionSummary = partitionSummary(find(partitionSummary(:,2)>-1e49),:); +partitionSummary = partitionSummary(logical(partitionSummary(:,2)>-1e49),:); +if size(partitionSummary,1)>10 + vikaPartitio = 10; +else + vikaPartitio = size(partitionSummary,1); +end +for part = 1:vikaPartitio + line = [num2str(partitionSummary(part,1)) ' ' num2str(partitionSummary(part,2))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', line); fprintf(fid, '\n'); + end +end + +if ~fixedK + + disp(' '); + disp(' '); + disp('Probabilities for number of clusters'); + + if (fid ~= -1) + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', 'Probabilities for number of clusters'); fprintf(fid, '\n'); + end + + npopsTaulu = unique(partitionSummary(:,1)); + len = length(npopsTaulu); + probs = zeros(len,1); + partitionSummary(:,2) = partitionSummary(:,2)-max(partitionSummary(:,2)); + sumtn = sum(exp(partitionSummary(:,2))); + for i=1:len + % npopstn = sum(exp(partitionSummary(find(partitionSummary(:,1)==npopsTaulu(i)),2))); + npopstn = sum(exp(partitionSummary(logical(partitionSummary(:,1)==npopsTaulu(i)),2))); + probs(i) = npopstn / sumtn; + end + for i=1:len + if probs(i)>1e-5 + line = [num2str(npopsTaulu(i)) ' ' num2str(probs(i))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', line); fprintf(fid, '\n'); + end + end + end + +end + +if (fid ~= -1) + fclose(fid); +else + diary off +end + + +%-------------------------------------------------------------- +function newline = takeLine(description,width) +%Returns one line from the description: line ends to the first +%space after width:th mark. +% newLine = description(1:width); +n = width+1; +while ~isspace(description(n)) && n=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); + +%------------------------------------------------------------------------- + +function [newData, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = ... + handleData(raw_data) +% Alkuperäisen datan viimeinen sarake kertoo, milt?yksilölt? +% kyseinen rivi on peräisin. Funktio tutkii ensin, ett?montako +% rivi?maksimissaan on peräisin yhdelt?yksilölt? jolloin saadaan +% tietää onko kyseess?haploidi, diploidi jne... Tämän jälkeen funktio +% lisää tyhji?rivej?niille yksilöille, joilta on peräisin vähemmän +% rivej?kuin maksimimäär? +% Mikäli jonkin alleelin koodi on =0, funktio muuttaa tämän alleelin +% koodi pienimmäksi koodiksi, joka isompi kuin mikään käytöss?oleva koodi. +% Tämän jälkeen funktio muuttaa alleelikoodit siten, ett?yhden lokuksen j +% koodit saavat arvoja välill?1,...,noalle(j). + +data = raw_data; +nloci=size(raw_data,2)-1; + +dataApu = data(:,1:nloci); +nollat = find(dataApu==0); +if ~isempty(nollat) + isoinAlleeli = max(max(dataApu)); + dataApu(nollat) = isoinAlleeli+1; + data(:,1:nloci) = dataApu; +end +% dataApu = []; +% nollat = []; +% isoinAlleeli = []; + +noalle=zeros(1,nloci); +alleelitLokuksessa = cell(nloci,1); +for i=1:nloci + alleelitLokuksessaI = unique(data(:,i)); + %alleelitLokuksessa{i,1} = alleelitLokuksessaI(find(alleelitLokuksessaI>=0)); + alleelitLokuksessa{i,1} = alleelitLokuksessaI(logical(alleelitLokuksessaI>=0)); + noalle(i) = length(alleelitLokuksessa{i,1}); +end +alleleCodes = zeros(max(noalle),nloci); +for i=1:nloci + alleelitLokuksessaI = alleelitLokuksessa{i,1}; + puuttuvia = max(noalle)-length(alleelitLokuksessaI); + alleleCodes(:,i) = [alleelitLokuksessaI; zeros(puuttuvia,1)]; +end + +for loc = 1:nloci + for all = 1:noalle(loc) + % data(find(data(:,loc)==alleleCodes(all,loc)), loc)=all; + data(logical(data(:,loc)==alleleCodes(all,loc)), loc)=all; + end; +end; + +nind = max(data(:,end)); +nrows = size(data,1); +ncols = size(data,2); +rowsFromInd = zeros(nind,1); +for i=1:nind + rowsFromInd(i) = length(find(data(:,end)==i)); +end +maxRowsFromInd = max(rowsFromInd); +a = -999; +emptyRow = repmat(a, 1, ncols); +lessThanMax = find(rowsFromInd < maxRowsFromInd); +missingRows = maxRowsFromInd*nind - nrows; +data = [data; zeros(missingRows, ncols)]; +pointer = 1; +for ind=lessThanMax' %Käy läpi ne yksilöt, joilta puuttuu rivej? + miss = maxRowsFromInd-rowsFromInd(ind); % Tält?yksilölt?puuttuvien lkm. + for j=1:miss + rowToBeAdded = emptyRow; + rowToBeAdded(end) = ind; + data(nrows+pointer, :) = rowToBeAdded; + pointer = pointer+1; + end +end +data = sortrows(data, ncols); % Sorttaa yksilöiden mukaisesti +newData = data; +rowsFromInd = maxRowsFromInd; + +adjprior = zeros(max(noalle),nloci); +priorTerm = 0; +for j=1:nloci + adjprior(:,j) = [repmat(1/noalle(j), [noalle(j),1]) ; ones(max(noalle)-noalle(j),1)]; + priorTerm = priorTerm + noalle(j)*gammaln(1/noalle(j)); +end + +%-------------------------------------------------------------------------- + +function [emptyPop, pops] = findEmptyPop(npops) +% Palauttaa ensimmäisen tyhjän populaation indeksin. Jos tyhji? +% populaatioita ei ole, palauttaa -1:n. + +global PARTITION; +pops = unique(PARTITION)'; +if (length(pops) ==npops) + emptyPop = -1; +else + popDiff = diff([0 pops npops+1]); + emptyPop = min(find(popDiff > 1)); +end + +%-------------------------------------------------------------------------- + +function popnames = fixPopnames(popnames, ninds) + +if length(popnames) == ninds + for i=1:ninds + if isnumeric(popnames{i}) + popnames{i} = num2str(popnames{i}); + % popnames(i) = num2str(popnames{i}); + end + popnames{i} = cellstr(popnames{i}); + % popnames(i) = cellstr(popnames{i}); + end +end + +%-------------------------------------------------------------------------- +function isRational = isTheLoadedNewDataRational(data) +% The last column of the data must include numbers 1-npops +% If so, isRational = 1, otherwise isRational = 0. +% The row numbers must be larger than 1. +if size(data,1) == 1 + isRational = 0; + display('*** ERROR: Sample size must be larger than one'); + return; +end +last_column = data(:,end); +last_column = sort(last_column); +current = 1; +if last_column(1) ~= current + isRational = 0; + display('*** ERROR: Wrong Indexes in the data'); + return; +end; +lengthcol = length(last_column); +for n = 2:lengthcol + if ~(last_column(n) == current || last_column(n) == current + 1) + %Some population is missing from the last column + isRational = 0; + display('*** ERROR: Missing indexes in the data'); + return; + end; + current = last_column(n); +end; +isRational = 1; + + +% %------------------------------------------------------------------------- +% function isRational = isTheLoadedNewLinkageRational(linkage_data) +% % Each positive element must be unique. +% % If so, isRational = 1, otherwise isRational = 0; +% nonzero = find(linkage_data~=0); +% dif = diff(linkage_data(nonzero)); +% if ~all(dif) +% isRational = 0; return; +% end; +% isRational = 1; + +%-------------------------------------------------------------------------- + +function [sumcounts, counts] = ... + indLociCounts(partition, data, npops, noalle) + +nloci=size(data,2)-1; +% ninds = size(data,1); + +counts = zeros(max(noalle),nloci,npops); +sumcounts = zeros(npops,nloci); +for i=1:npops + for j=1:nloci + % havainnotLokuksessa = find(partition==i & data(:,j)>=0); + havainnotLokuksessa = find(ismember(data(:,end),find(partition==i))); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(havainnotLokuksessa,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + +%----------------------------------------------------------------------------------- + + +function popnames = initPopNames(nameFile, indexFile) +%Palauttaa tyhjän, mikäli nimitiedosto ja indeksitiedosto +% eivät olleet yht?pitki? + +popnames = []; +try +indices = load(indexFile); +catch + msgbox('Loading of the index file was unsuccessful', ... + 'Error', 'error'); + return +end +fid = fopen(nameFile); +if fid == -1 + % File does not exist + msgbox('Loading of the name file was unsuccessful', ... + 'Error', 'error'); + return; +end + +line = fgetl(fid); +counter = 1; + +while sum(line~=-1) && ~isempty(line) + names{counter} = line; + line = fgetl(fid); + counter = counter + 1; +end; +fclose(fid); + +if length(names) ~= length(indices) + disp('The number of population names must be equal to the number of '); + disp('entries in the file specifying indices of the first individuals of '); + disp('each population.'); + return; +end + +popnames = cell(length(names), 2); +for i = 1:length(names) + popnames{i,1} = names(i); + popnames{i,2} = indices(i); +end diff --git a/matlab/linkage/makecomplete.m b/matlab/linkage/makecomplete.m new file mode 100644 index 0000000..a5c28b6 --- /dev/null +++ b/matlab/linkage/makecomplete.m @@ -0,0 +1,70 @@ +function [data, totalmissing] = makecomplete(data) +%MAKECOMPLETE ESTIMATING missing alleles in a loci. +% Input: +% data: a baps data format, e.g. last column is index. Missing values +% are denoted as any non-positive integers, commonly as 0 or -999. + + +[nrows, ncols] = size(data); +id = all(data>0); +if all(id) % no missing values + return +else + fprintf(1,'Estimating the missing values...'); + + % remove the totally missing loci + missingloci = find(id==0); + totalmissing = find(all(data(:,missingloci)<=0)); + if ~isempty(totalmissing) + disp('Totally missing loci were found.'); + data(:,missingloci(totalmissing)) = ones(nrows,length(totalmissing)); + end +% goodloci = setdiff([1:ncols],missingloci(totalmissing)); +% data = data(:,goodloci); +% missingloci = find(all(data>0)==0); + + data_in = data(:,[missingloci end]); + [counts,noalle,prior,adjprior,rawalleles,data_in] = allfreqsnew3(data_in); + data(:,missingloci) = data_in(:,[1:end-1]); + + s_counts = sum(counts,3); +% m_counts = size(counts,3)*ones(1, size(counts,2)) - sum(s_counts); % counts of missing values + m_counts = nrows*ones(1, size(counts,2)) - sum(s_counts); % counts of missing values + m = repmat(m_counts,size(s_counts,1),1); + a = s_counts + prior; + a = a./repmat(sum(a),size(a,1),1); + c = round(m.*a); + + % Be sure that the sum matches + reassign = find(sum(c)~=m_counts); + for i = reassign + dif = sum(c(:,i)) - m_counts(:,i); + if dif > 0 + remove = find(c(:,i)==max(c(:,i))); + c(remove(1),i)=c(remove(1),i) - dif; % decrease by the diff + else + add = find(c(:,i)==min(c(:,i))); + c(add(1),i)=c(add(1),i) - dif; % increase by the diff + end + end + try + % Fill in the missing values + for i = 1:length(missingloci) + missingrows = find(data(:,missingloci(i))<=0); + p = randperm(m_counts(i)); + cum = [0 cumsum(c(:,i))']; + for j = 1:size(c,1) + data(missingrows(p([cum(j)+1:cum(j+1)])),missingloci(i)) = j; + end + end + catch + % disp('*** ERROR: in completing the missing data.'); + data = []; + totalmissing = []; + return + end + fprintf(1,'Finished.\n'); +end + + + diff --git a/matlab/linkage/myCell2mat.m b/matlab/linkage/myCell2mat.m new file mode 100644 index 0000000..a6f8baa --- /dev/null +++ b/matlab/linkage/myCell2mat.m @@ -0,0 +1,120 @@ +function m = myCell2mat(c) +%CELL2MAT Convert the contents of a cell array into a single matrix. +% M = CELL2MAT(C) converts a multidimensional cell array with contents of +% the same data type into a single matrix. The contents of C must be able +% to concatenate into a hyperrectangle. Moreover, for each pair of +% neighboring cells, the dimensions of the cell's contents must match, +% excluding the dimension in which the cells are neighbors. This constraint +% must hold true for neighboring cells along all of the cell array's +% dimensions. +% +% The dimensionality of M, i.e. the number of dimensions of M, will match +% the highest dimensionality contained in the cell array. +% +% CELL2MAT is not supported for cell arrays containing cell arrays or +% objects. +% +% Example: +% C = {[1] [2 3 4]; [5; 9] [6 7 8; 10 11 12]}; +% M = cell2mat(C) +% +% See also MAT2CELL, NUM2CELL + +% Copyright 1984-2006 The MathWorks, Inc. +% $Revision: 1.10.4.6 $ $Date: 2006/06/20 20:09:38 $ + +% Error out if there is no input argument +if nargin==0 + error('MATLAB:cell2mat:NoInputs',['No input argument specified. ' ... + 'There should be exactly one input argument.']) +end +% short circuit for simplest case +elements = numel(c); +if elements == 0 + m = []; + return +end +if elements == 1 + if isnumeric(c{1}) || ischar(c{1}) || islogical(c{1}) || isstruct(c{1}) + m = c{1}; + return + end +end +% Error out if cell array contains mixed data types +cellclass = class(c{1}); +ciscellclass = cellfun('isclass',c,cellclass); +if ~all(ciscellclass(:)) + error('MATLAB:cell2mat:MixedDataTypes', ... + 'All contents of the input cell array must be of the same data type.'); +end + +% Error out if cell array contains any cell arrays or objects +ciscell = iscell(c{1}); +cisobj = isobject(c{1}); +if cisobj || ciscell + error('MATLAB:cell2mat:UnsupportedCellContent',['Cannot support cell ' ... + 'arrays containing cell arrays or objects.']); +end + +% If cell array of structures, make sure the field names are all the same +if isstruct(c{1}) + cfields = cell(elements,1); + for n=1:elements + cfields{n} = fieldnames(c{n}); + end + % Perform the actual field name equality test + if ~isequal(cfields{:}) + error('MATLAB:cell2mat:InconsistentFieldNames', ... + ['The field names of each cell array element must be consistent ' ... + 'and in consistent order.']) + end +end + +% If cell array is 2-D, execute 2-D code for speed efficiency +if ndims(c) == 2 + rows = size(c,1); + m = cell(rows,1); + % Concatenate each row first + for n=1:rows + m{n} = cat(3,c{n,:}); + end + % Now concatenate the single column of cells into a matrix + m = cat(3,m{:}); + return +end + +csize = size(c); +% Treat 3+ dimension arrays + +% Construct the matrix by concatenating each dimension of the cell array into +% a temporary cell array, CT +% The exterior loop iterates one time less than the number of dimensions, +% and the final dimension (dimension 1) concatenation occurs after the loops + +% Loop through the cell array dimensions in reverse order to perform the +% sequential concatenations +for cdim=(length(csize)-1):-1:1 + % Pre-calculated outside the next loop for efficiency + ct = cell([csize(1:cdim) 1]); + cts = size(ct); + ctsl = length(cts); + mref = {}; + + % Concatenate the dimension, (CDIM+1), at each element in the temporary cell + % array, CT + for mind=1:prod(cts) + [mref{1:ctsl}] = ind2sub(cts,mind); + % Treat a size [N 1] array as size [N], since this is how the indices + % are found to calculate CT + if ctsl==2 && cts(2)==1 + mref = {mref{1}}; + end + % Perform the concatenation along the (CDIM+1) dimension + ct{mref{:}} = cat(cdim+1,c{mref{:},:}); + end + % Replace M with the new temporarily concatenated cell array, CT + c = ct; +end + +% Finally, concatenate the final rows of cells into a matrix +m = cat(1,c{:}); diff --git a/matlab/linkage/processprofile.m b/matlab/linkage/processprofile.m new file mode 100644 index 0000000..fc34f15 --- /dev/null +++ b/matlab/linkage/processprofile.m @@ -0,0 +1,35 @@ +function Output = processprofile(root) + +%Open File and count the number of rows in the file +fid=fopen(root); +nRows=0; +while 1 + iString=fgetl(fid); + if ~ischar(iString) + break + end + nRows=nRows+1; +end + +%Return to beginning of file +fseek(fid,0,'bof'); + +%For each row, assign each space delimitted object to a cell in the "Output" matrix +for iRow=1:nRows + iCol=1; + %Temporary storage of the first object + % Note: the space delimitter used here can be replaced by any delimitter + [TempOutput,Rem]=strtok(fgetl(fid),sprintf('\t')); + %If there is now data on this row, then assign the first object to be an underscore + if (length(TempOutput) == 0) + TempOutput='_'; + end + %Build the "Output" matrix this will be the first column of the iRow-th row + Output(iRow,iCol)=cellstr(TempOutput); + %Repeat this only using Rem as the total string and incrementing the iCol counter + while length(Rem) > 0 + iCol=iCol+1; + [TempOutput,Rem]=strtok(Rem,sprintf('\t')); + Output(iRow,iCol)=cellstr(TempOutput); + end +end \ No newline at end of file diff --git a/matlab/linkage/processxls.m b/matlab/linkage/processxls.m new file mode 100644 index 0000000..580fd4b --- /dev/null +++ b/matlab/linkage/processxls.m @@ -0,0 +1,77 @@ +function [data, component_mat, popnames] = processxls(filename) +% +% a bug in line 64-68 was fixed +data = []; +component_mat = []; +popnames = []; +try + [A,B] = xlsread(filename); +catch + display('*** ERROR: Wrong Excel format'); + return +end + +if size(A,2)~=1 % more than one columns containing numeric ST values + display('*** ERROR: multiple columns of numeric values'); + data = []; component_mat = []; popnames = []; + return +end + +if size(A,1)~=size(B,1)-1 + display('*** ERROR: Wrong format'); + data = []; component_mat = []; popnames = []; + return +end + +B = deblank(B); % remove any trailing blanks +nstrains = size(B,1)-1; +nheader = size(B,2); +for i = 1:nheader + if strcmpi('ST',B{1,i}) ix_ST = i; end + if strcmpi('Strain', B{1,i}) || strcmpi('Isolate',B{1,i}) + ix_Strain = i; + end +end +if ~exist('ix_ST') + display('*** ERROR: ST column needed'); + data = []; component_mat = []; popnames = []; + return +end + +if ~exist('ix_Strain') + ix_gene = setdiff([1:nheader],ix_ST); +else + ix_gene = setdiff([1:nheader],[ix_ST ix_Strain]); +end + +ngenes = length(ix_gene); + +C = cell(nstrains,ngenes); +if ~isempty(A) + for i=1:nstrains + B{i+1,ix_ST}=num2str(A(i)); + for j=1:ngenes + C{i,j}=uint16(i_encode_n(B{i+1,ix_gene(j)})); % save the memory. + end + end +end +genesize=cellfun('size',C(1,:),2); +data=cell2mat(C); +data=[data uint16([1:nstrains]')]; +component_mat = zeros(ngenes,max(genesize)); +cum = cumsum(genesize); +component_mat(1,[1:genesize(1)]) = [1:cum(1)]; +for i=2:ngenes + component_mat(i,[1:genesize(i)]) = [(cum(i-1)+1):cum(i)]; +end + +if ~exist('ix_Strain') + popnames = num2cell(B([2:end],ix_ST)); +else % store the strain names only + popnames = num2cell(B([2:end],ix_Strain)); +end +popnames(:,2)=num2cell([1:nstrains]'); + +display('---------------------------------------------------'); +display(['# of strains: ', num2str(nstrains)]); +display(['# of genes: ', num2str(ngenes)]); \ No newline at end of file diff --git a/matlab/linkage/readbaps.m b/matlab/linkage/readbaps.m new file mode 100644 index 0000000..cd54f20 --- /dev/null +++ b/matlab/linkage/readbaps.m @@ -0,0 +1,188 @@ +function [data, filename] = readbaps(varargin) +MAXNAME = 200; + +if nargin == 0 + [filename, pathname] = uigetfile( ... + {'*.txt', 'BAPS Sequence Files (*.txt)'; + '*.*', 'All Files (*.*)'}, ... + 'Load BAPS sequence data'); + if ~(filename), data=[]; return; end + filename=[pathname,filename]; +end + +if nargin == 1 + [filename,pathname] = uigetfile( ... + {'*.txt', 'BAPS Sequence Files (*.txt)'; + '*.*', 'All Files (*.*)'}, ... + sprintf('Load the BAPS sequence data for gene %s',varargin{1}) ); + if ~(filename), data = []; + return; + end + filename=[pathname,filename]; +end + +if nargin == 2 + [filename,pathname] = uigetfile( ... + {'*.txt', 'BAPS Sequence Files (*.txt)'; + '*.*', 'All Files (*.*)'}, ... + sprintf('Load the BAPS sequence file for gene %s',varargin{1}) ); + if ~(filename),data = []; + return; + end + filename=[pathname,filename]; + chosen_index = varargin{2}; +end + +if nargin < 3 + % [seqtype, geneticcode]=selectSeqTypeAndGeneticCode; + seqtype = 2; + geneticcode = 1; + if (isempty(seqtype)|isempty(geneticcode)), data=[]; return; end +end + + +pause(0.0001); +if ~ischar(filename) + error('BAPS:InvalidInput','Input must be a character array') + data = []; + return; +end + +if ~(exist(filename,'file') | exist(fullfile(cd,filename),'file')), + % is a valid filename ? + error('BAPS:InvalidInput','Input must be a valid file') + data = []; + return; +end + +file = fopen(filename, 'r'); +display('---------------------------------------------------'); +display(['Reading BAPS sequence data from: ',filename,'...']); +display('---------------------------------------------------'); +% Now we are looking for the maximum length of the sequence +n=0; % the number of sequences +m=0; % the maximum length +cm = 0; % current sequence length + +while 1 + [x,nr] = fscanf(file,'%c',1); + if nr == 0 break; end; + if x==' ' % new sequence started + if cm ~=m & m >0 + % fprintf(['*** ERROR: Different sequence length found in allelic ','%d','.\n'],n+1); + disp('***ERROR: Incorrect BAPS sequence data.'); + data = []; + fclose(file); + return; + end + if cm > m m=cm; end; + cm = 0; + fgets(file); + n=n+1; + else + if isletter(x) | x=='-' | x == '?' + cm=cm+1; + end; + end; +end + +if cm > m m=cm; end; + +% go throught the file +if (m==0 | n==0) + % display(['*** ERROR: Unmatched data for gene ' varargin{1}]); + disp('***ERROR: Incorrect BAPS sequence data.'); + data = []; + fclose(file); + return; +end + +Ss = char(m); S = []; +str = zeros(1,MAXNAME); +sizes = zeros(1,n); +frewind(file); +% names=[]; +names={}; +i=1;j=1; +id = 0; +while 1 + [x,nr] = fscanf(file,'%c',1); + if nr == 0 + break; + end; + if x==' ' % new sequence started + if i~= 0 % save the sequence + % str=x; + [x, sizes(i)]=size(Ss); + S=strvcat(S,Ss); + Ss = []; Ss = char(m); + end; + str=fgetl(file); % read the name, we remove the '>' symbol + % names=strvcat(names,str); + % pos=find(str==' '); + % if ~(isempty(pos)) + % str=str(1:pos(1,1)); + % end + names{i}=str2num(str); + i=i+1; + + if nargin == 1 + if isempty(findstr(str, varargin{1})) + display(['*** ERROR: Unmatched data for gene ' varargin{1}]); + data = []; + fclose(file); + return + end + end + % disp(['Processing in: ' str]); + id = id + 1; + j=1; + else + if isletter(x) | x== '-' | x=='?' + % processing the sequence symbol + Ss(j) = upper(x); + j=j+1; + end; + end; +end +% S=strvcat(S,Ss); +[x, sizes(i)]=size(Ss); +if ~isempty(find(S==' ')) + disp('***ERROR: unequal sequence length.'); + data = []; + fclose(file); + return +end + +if exist('chosen_index','var') + S = S(chosen_index,:); + names = names(chosen_index); +end +aln.seqtype = seqtype; +aln.geneticcode = geneticcode; +aln.seqnames = names; +aln.seq = S; +aln = encodealn(aln); +data = aln.seq; +% if nargin == 1 +% if isempty(findstr(names{1},varargin{1})) +% disp(['*** ERROR: The file does not contain the required gene ' varargin{1}]); +% data = []; +% return; +% end +% end +% display(['# of allelic types: ' num2str(size(aln.seq,1))]); +% display(['# of nucleotides: ' num2str(size(aln.seq,2))]); +% ---------------------- +% order_index = [1:size(aln.seq,1)]'; +% data = [aln.seq order_index]; % Append the index column in the end. +% ---------------------- +try, +data = [data cell2mat(names)']; +catch, + disp('*** ERROR: Failed in loading the BAPS data.'); + disp('*** ERROR: Inidividual indices are not numerically sequential.'); + data = []; +end +fclose(file); + diff --git a/matlab/linkage/readfasta.m b/matlab/linkage/readfasta.m new file mode 100644 index 0000000..5fadcc4 --- /dev/null +++ b/matlab/linkage/readfasta.m @@ -0,0 +1,171 @@ +function data = readfasta(varargin) +MAXNAME = 200; + +if nargin == 0 + [filename, pathname] = uigetfile( ... + {'*.fasta;*.fas;*.txt', 'FASTA Format Files (*.fasta, *.fas, *.txt)'; + '*.*', 'All Files (*.*)'}, ... + 'Pick a FASTA file'); + if ~(filename), aln=[]; return; end + filename=[pathname,filename]; +end + +if nargin == 1 + [filename,pathname] = uigetfile( ... + {'*.fasta;*.fas;*.txt', 'FASTA Format Files (*.fasta, *.fas, *.txt)'; + '*.*', 'All Files (*.*)'}, ... + sprintf('Pick the FASTA file for gene %s',varargin{1}) ); + if ~(filename), aln=[]; + data = []; + return; + end + filename=[pathname,filename]; +end + +if nargin == 2 + [filename,pathname] = uigetfile( ... + {'*.fasta;*.fas;*.txt', 'FASTA Format Files (*.fasta, *.fas, *.txt)'; + '*.*', 'All Files (*.*)'}, ... + sprintf('Pick the FASTA file for gene %s',varargin{1}) ); + if ~(filename), aln=[]; + data = []; + return; + end + filename=[pathname,filename]; + chosen_index = varargin{2}; +end + +if nargin < 3 + % [seqtype, geneticcode]=selectSeqTypeAndGeneticCode; + seqtype = 2; + geneticcode = 1; + if (isempty(seqtype)|isempty(geneticcode)), aln=[],data=[]; return; end +end + + +pause(0.0001); +if ~ischar(filename) + error('BAPS:InvalidInput','Input must be a character array') + data = []; + return; +end + +if ~(exist(filename,'file') | exist(fullfile(cd,filename),'file')), + % is a valid filename ? + error('BAPS:InvalidInput','Input must be a valid file') + data = []; + return; +end + +file = fopen(filename, 'r'); +display('---------------------------------------------------'); +display(['Reading fasta sequence from: ',filename,'...']); +display('---------------------------------------------------'); +% Now we are looking for the maximum length of the sequence +n=0; % the number of sequences +m=0; % the maximum length +cm = 0; % current sequence length + +while 1 + [x,nr] = fscanf(file,'%c',1); + if nr == 0 break; end; + if x =='>' % new sequence started + if cm ~=m & m >0 + fprintf(['*** ERROR: Different sequence length found in allelic ','%d','.\n'],n+1); + data = []; + return; + end + if cm > m m=cm; end; + cm = 0; + fgets(file); + n=n+1; + else + if isletter(x) | x=='-' | x == '?' + cm=cm+1; + end; + end; +end + +if cm > m m=cm; end; + +% go throught the file +if (m==0 | n==0) + display(['*** ERROR: Unmatched data for gene ' varargin{1}]); + data = []; + return; +end + +Ss = char(m); S = []; +str = zeros(1,MAXNAME); +sizes = zeros(1,n); +frewind(file); +% names=[]; +names={}; +i=0;j=1; +id = 0; +while 1 + [x,nr] = fscanf(file,'%c',1); + if nr == 0 + break; + end; + if x =='>' % new sequence started + if i~= 0 % save the sequence + [x, sizes(i)]=size(Ss); + S=strvcat(S,Ss); + Ss = []; Ss = char(m); + end; + str=fgetl(file); % read the name, we remove the '>' symbol + % names=strvcat(names,str); + % pos=find(str==' '); + % if ~(isempty(pos)) + % str=str(1:pos(1,1)); + % end + i=i+1; + names{i}=str; + + if nargin == 1 + if isempty(findstr(str, varargin{1})) + display(['*** ERROR: Unmatched data for gene ' varargin{1}]); + data = []; + return + end + end + % disp(['Processing in: ' str]); + id = id + 1; + j=1; + else + if isletter(x) | x== '-' | x=='?' + % processing the sequence symbol + Ss(j) = upper(x); + j=j+1; + end; + end; +end +S=strvcat(S,Ss); +[x, sizes(i)]=size(Ss); +if exist('chosen_index','var') + S = S(chosen_index,:); + names = names(chosen_index); +end +aln.seqtype = seqtype; +aln.geneticcode = geneticcode; +aln.seqnames = names; +aln.seq = S; +aln = encodealn(aln); +data = aln.seq; +% if nargin == 1 +% if isempty(findstr(names{1},varargin{1})) +% disp(['*** ERROR: The file does not contain the required gene ' varargin{1}]); +% data = []; +% return; +% end +% end +display(['# of allelic types: ' num2str(size(aln.seq,1))]); +display(['# of nucleotides: ' num2str(size(aln.seq,2))]); +% ---------------------- +% order_index = [1:size(aln.seq,1)]'; +% data = [aln.seq order_index]; % Append the index column in the end. +% ---------------------- + +fclose(file); + diff --git a/matlab/linkage/selectDataType.m b/matlab/linkage/selectDataType.m new file mode 100644 index 0000000..3fb4139 --- /dev/null +++ b/matlab/linkage/selectDataType.m @@ -0,0 +1,37 @@ +function [isOK, datatype] = selectDataType() + +items(1).name = 'MLST DATA'; +items(1).default = 1; +items(1).linked = [2 3]; +items(1).exclusive = [4 5 6]; +items(1).values = {1}; + +items(2).name = '1 = Allelic profile'; +items(2).default = 1; +items(2).exclusive = [3 4 5 6]; +items(2).indent = 1; + +items(3).name = '2 = FASTA-format'; +items(3).default = 0; +items(3).exclusive = [2 4 5 6]; +items(3).indent = 1; + + +title = 'STEP 1'; +msg = sprintf(['Please specify the data format:']); +out = CSEFlagDialog(items, title, msg); +if ~(isempty(out)), + if(out(2).answer==1) + datatype = 1; + elseif(out(3).answer==1) + datatype = 2; + elseif(out(5).answer==1) + datatype = 3; + elseif(out(6).answer==1) + datatype = 4; + end + isOK = 1; +else + isOK = 0; + datatype = 0; +end \ No newline at end of file diff --git a/matlab/linkage/selectGene.m b/matlab/linkage/selectGene.m new file mode 100644 index 0000000..245527d --- /dev/null +++ b/matlab/linkage/selectGene.m @@ -0,0 +1,25 @@ +function [isOK, genename] = selectGene(genename) +ngenes = size(genename,1); +for i=1:ngenes + items(i).name = genename{i}; + items(i).default = 1; + items(i).values = []; +end + +title = 'STEP 3'; +nGenestr = sprintf('%d',ngenes); +msg = sprintf(['The allelic profile contains ' nGenestr ' genes named below.\n'... + 'Select the individual sequence data that you want to load.\n'... + 'It is recommended that all the genes are selected.']); +out = CSEFlagDialog(items, title, msg); +if ~(isempty(out)), + for i=1:ngenes + if ~out(i).answer + genename{i}=[]; + end + end + isOK = 1; +else + isOK = 0; + genename = {[]}; +end \ No newline at end of file diff --git a/matlab/linkage/selectSeqType.m b/matlab/linkage/selectSeqType.m new file mode 100644 index 0000000..3589d99 --- /dev/null +++ b/matlab/linkage/selectSeqType.m @@ -0,0 +1,47 @@ +function nextstep = selectSeqType() + +items(1).name = 'SEQUENCE TYPE:'; +items(1).default = 1; +items(1).linked = [2 3 4]; +items(1).values = []; + +items(2).name = '1 = Non-coding nucleotide'; +items(2).default = 0; +items(2).exclusive = [3 4]; +items(2).indent = 1; + +items(3).name = '2 = Coding nucleotide'; +items(3).default = 1; +items(3).exclusive = [2 4]; +items(3).indent = 1; + +items(4).name = '3 = Protein'; +items(4).default = 0; +items(4).exclusive = [2 3]; +items(4).indent = 1; + +items(5).name = 'GENETIC CODE: '; +items(5).default = 1; +items(5).indent = 0; +items(5).values = {'1 - Standard';'2 - Vertebrate Mithchondrial'; ... + '3 - Yeast Mithchondrial';'4 - Mold Mithchondrial'; ... + '5 - Invertebrate Mithchondrial';'6 - Protozoan Mithchondrial'; ... + '7 - Coelenterate Mithchondrial';'8 - Mycoplasma'}; +items(5).help = 'Please select a genetic code'; + +title = 'Sequence type and genetic code'; +% msg = sprintf(['Please select sequence type and genetic code']); +out = CSEFlagDialog(items, title); +if ~(isempty(out)), + if(out(2).answer==1) + seqtype=1; + elseif(out(3).answer==1) + seqtype=2; + elseif(out(4).answer==1) + seqtype=3; + end + geneticcode=out(5).answer; +else + seqtype=[]; + geneticcode=[]; +end \ No newline at end of file diff --git a/matlab/linkage/seqcode.m b/matlab/linkage/seqcode.m new file mode 100644 index 0000000..62a5f92 --- /dev/null +++ b/matlab/linkage/seqcode.m @@ -0,0 +1,20 @@ +function [NT,AA] = seqcode() +%SEQCODE - Return vector for mapping sequence letters to integers +% +% Syntax: [NT,AA] = seqcode +% +% See also: + +% Molecular Biology & Evolution Toolbox, (C) 2005 +% Author: James J. Cai +% Email: jamescai@hkusua.hku.hk +% Website: http://web.hku.hk/~jamescai/ +% Last revision: 5/28/2005 + +NT = 'ACGTDI?-'; +if (nargout>1) +AA = 'ARNDCQEGHILKMFPSTWYV*-'; +end + +% AANames = {'ala' 'arg' 'asn' 'asp' 'cys' 'gln' 'glu' 'gly' 'his' 'ile' 'leu' 'lys' 'met' ... +% 'phe' 'pro' 'ser' 'thr' 'trp' 'tyr' 'val'}; \ No newline at end of file diff --git a/matlab/linkage/silentReadBaps.m b/matlab/linkage/silentReadBaps.m new file mode 100644 index 0000000..a045aae --- /dev/null +++ b/matlab/linkage/silentReadBaps.m @@ -0,0 +1,193 @@ +function [data, filename] = silentReadBaps(varargin) +% This function is modified from readbaps.m to avoid invoke a dialog in Linux +% The input must be a vaild BAPS sequence file +% modified by Lu Cheng, 29.06.2010 + +MAXNAME = 200; + +if nargin == 0 + [filename, pathname] = uigetfile( ... + {'*.txt', 'BAPS Sequence Files (*.txt)'; + '*.*', 'All Files (*.*)'}, ... + 'Load BAPS sequence data'); + if ~(filename), data=[]; return; end + filename=[pathname,filename]; +end + +if nargin == 1 +% [filename,pathname] = uigetfile( ... +% {'*.txt', 'BAPS Sequence Files (*.txt)'; +% '*.*', 'All Files (*.*)'}, ... +% sprintf('Load the BAPS sequence data for gene %s',varargin{1}) ); +% if ~(filename), data = []; +% return; +% end +% filename = [pathname,filename]; + filename=varargin{1}; % added by Lu Cheng, 29.06.2010 +end + +if nargin == 2 + [filename,pathname] = uigetfile( ... + {'*.txt', 'BAPS Sequence Files (*.txt)'; + '*.*', 'All Files (*.*)'}, ... + sprintf('Load the BAPS sequence file for gene %s',varargin{1}) ); + if ~(filename),data = []; + return; + end + filename=[pathname,filename]; + chosen_index = varargin{2}; +end + +if nargin < 3 + % [seqtype, geneticcode]=selectSeqTypeAndGeneticCode; + seqtype = 2; + geneticcode = 1; + if (isempty(seqtype)|isempty(geneticcode)), data=[]; return; end +end + + +pause(0.0001); +if ~ischar(filename) + error('BAPS:InvalidInput','Input must be a character array') + data = []; + return; +end + +if ~(exist(filename,'file') | exist(fullfile(cd,filename),'file')), + % is a valid filename ? + error('BAPS:InvalidInput','Input must be a valid file') + data = []; + return; +end + +file = fopen(filename, 'r'); +display('---------------------------------------------------'); +display(['Reading BAPS sequence data from: ',filename,'...']); +display('---------------------------------------------------'); +% Now we are looking for the maximum length of the sequence +n=0; % the number of sequences +m=0; % the maximum length +cm = 0; % current sequence length + +while 1 + [x,nr] = fscanf(file,'%c',1); + if nr == 0 break; end; + if x==' ' % new sequence started + if cm ~=m & m >0 + % fprintf(['*** ERROR: Different sequence length found in allelic ','%d','.\n'],n+1); + disp('***ERROR: Incorrect BAPS sequence data.'); + data = []; + fclose(file); + return; + end + if cm > m m=cm; end; + cm = 0; + fgets(file); + n=n+1; + else + if isletter(x) | x=='-' | x == '?' + cm=cm+1; + end; + end; +end + +if cm > m m=cm; end; + +% go throught the file +if (m==0 | n==0) + % display(['*** ERROR: Unmatched data for gene ' varargin{1}]); + disp('***ERROR: Incorrect BAPS sequence data.'); + data = []; + fclose(file); + return; +end + +Ss = char(m); S = []; +str = zeros(1,MAXNAME); +sizes = zeros(1,n); +frewind(file); +% names=[]; +names={}; +i=1;j=1; +id = 0; +while 1 + [x,nr] = fscanf(file,'%c',1); + if nr == 0 + break; + end; + if x==' ' % new sequence started + if i~= 0 % save the sequence + % str=x; + [x, sizes(i)]=size(Ss); + S=strvcat(S,Ss); + Ss = []; Ss = char(m); + end; + str=fgetl(file); % read the name, we remove the '>' symbol + % names=strvcat(names,str); + % pos=find(str==' '); + % if ~(isempty(pos)) + % str=str(1:pos(1,1)); + % end + names{i}=str2num(str); + i=i+1; + %if nargin == 1 + if nargin == 1000 % modified by Lu Cheng, 29.06.2010 + if isempty(findstr(str, varargin{1})) + display(['*** ERROR: Unmatched data for gene ' varargin{1}]); + data = []; + fclose(file); + return + end + end + % disp(['Processing in: ' str]); + id = id + 1; + j=1; + else + if isletter(x) | x== '-' | x=='?' + % processing the sequence symbol + Ss(j) = upper(x); + j=j+1; + end; + end; +end +% S=strvcat(S,Ss); +[x, sizes(i)]=size(Ss); +if ~isempty(find(S==' ')) + disp('***ERROR: unequal sequence length.'); + data = []; + fclose(file); + return +end + +if exist('chosen_index','var') + S = S(chosen_index,:); + names = names(chosen_index); +end +aln.seqtype = seqtype; +aln.geneticcode = geneticcode; +aln.seqnames = names; +aln.seq = S; +aln = encodealn(aln); +data = aln.seq; +% if nargin == 1 +% if isempty(findstr(names{1},varargin{1})) +% disp(['*** ERROR: The file does not contain the required gene ' varargin{1}]); +% data = []; +% return; +% end +% end +% display(['# of allelic types: ' num2str(size(aln.seq,1))]); +% display(['# of nucleotides: ' num2str(size(aln.seq,2))]); +% ---------------------- +% order_index = [1:size(aln.seq,1)]'; +% data = [aln.seq order_index]; % Append the index column in the end. +% ---------------------- +try, +data = [data cell2mat(names)']; +catch, + disp('*** ERROR: Failed in loading the BAPS data.'); + disp('*** ERROR: Inidividual indices are not numerically sequential.'); + data = []; +end +fclose(file); + diff --git a/matlab/linkage/sumCell.m b/matlab/linkage/sumCell.m new file mode 100644 index 0000000..f3abd6f --- /dev/null +++ b/matlab/linkage/sumCell.m @@ -0,0 +1,10 @@ +function summat = sumCell(counts) +nbatches = size(counts,2); +sumcell = cell(1,nbatches); +[nalleles, nloci, ninds] = size(counts{1}); +summat = zeros(nalleles, nloci,'uint16'); +for i = 1:nbatches + sumcell{i} = uint16(sum(counts{i},3)); % sum as double format. + summat = summat + sumcell{i}; +end +clear counts; diff --git a/matlab/linkage/transform2.m b/matlab/linkage/transform2.m new file mode 100644 index 0000000..b7e334d --- /dev/null +++ b/matlab/linkage/transform2.m @@ -0,0 +1,240 @@ +function [data_clique, data_separator, noalle_clique, noalle_separator] = transform2(data, component_mat, linkage_model) +% Filename: transform.m +% [data_clique,data_clique,noalle_clique, noalle_separator] = tranform2(data, component_mat, linkage_model) +% +% Description: +% Function for tranforming the profile data into clique data and separator data. +% For the linear graphical models, in each component, ncliques + nseparators = 2*nnodes - 3 +% holds for each block(component in the graph). + +% Author: Jing Tang +% Modified date: 18/10/2005 +% The function for treating missing data is added. + +% Input: +% data: the allelic profile data. Last col is the index vector. +% component_mat: the component matrix. +% isdependent: +% islinear: + +% Output: +% data_clique: [nindividuals x (ncliques+nsingletons)] +% data_separator: [nindividual x nseparators] + + +if strcmp(linkage_model,'linear') + nindividuals = size(data,1); + nloci = size(data,2)-1; + % if ~all(diag(adj_mat)), adj_mat = adj_mat + diag(ones(nloci,1));, end; + % [p,p,r]=dmperm(adj_mat); + + % UNSOLVED --- Need to find a better algorithm to count the numbers of + % separators ,cliques and singletons. + ncomponents = size(component_mat,1); + cardinality = sum(component_mat>0,2)'; + ncliques = sum(cardinality-1); + nsingletons = sum(cardinality==1); + nseparators = sum(cardinality(find(cardinality>1))-2); + + data_clique = zeros(nindividuals, ncliques+nsingletons); + noalle_clique = zeros(ncliques+nsingletons,1); + data_separator = zeros(nindividuals, nseparators); + noalle_separator = zeros(nseparators,1); + % data_separator = []; % used for vectorization + k = 1; + for i = 1:ncomponents + if (cardinality(i)==1) % singleton + data_clique(:,k) = data(:,component_mat(i,1)); + temp = unique(data(:,component_mat(i,1))); + if temp(1) == 0 % missing data found + noalle_clique(k) = size(temp,1) -1; + else + noalle_clique(k) = size(temp,1); + end + k = k+1; + else + for j = 1:cardinality(i)-1 + + % dealing with missing data + % nonmissing = find(all(data(:,[component_mat(i,j) component_mat(i,j+1)]),2)); + % [b,m,n] = unique(data(nonmissing,[component_mat(i,j) component_mat(i,j+1)]),'rows'); + % data_clique(nonmissing,k) = n; + % noalle_clique(k) = size(unique(data(nonmissing,component_mat(i,j))),1)*... + % size(unique(data(nonmissing,component_mat(i,j+1))),1); + + [b,m,n] = unique(data(:,[component_mat(i,j) component_mat(i,j+1)]),'rows'); + data_clique(:,k) = n; + + noalle_clique(k) = size(unique(data(:,component_mat(i,j))),1)*... + size(unique(data(:,component_mat(i,j+1))),1); + k = k+1; + end + + end + end + + k = 1; + for i = 1:ncomponents + if (cardinality(i)>2) + for j = 2:cardinality(i)-1 + data_separator(:,k) = data(:,component_mat(i,j)); + k = k+1; + end + end + end + if (k-1)~=nseparators + error('ERROR in transform.m'); + return + end + for i = 1:nseparators + % nonmissing = find(all(data_separator(:,i),2)); + % noalle_separator(i) = length(unique(data_separator(nonmissing,i))); + noalle_separator(i) = length(unique(data_separator(:,i))); + end + +elseif strcmp(linkage_model,'codon') + nindividuals = size(data,1); + nloci = size(data,2)-1; + ncomponents = size(component_mat,1); + cardinality = sum(component_mat>0,2)'; + + ncliques = sum(cardinality(find(cardinality>2))-2); + nsingletons = sum(cardinality==1 | cardinality ==2); + nseparators = sum(cardinality(find(cardinality>2))-3); + + data_clique = zeros(nindividuals,ncliques+nsingletons); + noalle_clique = zeros(ncliques+nsingletons,1); + data_seperator = zeros(nindividuals,nseparators); + noalle_separator = zeros(nseparators,1); + k = 1; + for i = 1:ncomponents + if (cardinality(i)==1) % singleton + data_clique(:,k) = data(:,component_mat(i,1)); + temp = unique(data(:,component_mat(i,1))); + if temp(1) == 0 % missing data found + noalle_clique(k) = size(temp,1) -1; + else + noalle_clique(k) = size(temp,1); + end + k = k+1; + elseif (cardinality(i)==2) % transform to linear linkage + % dealing with missing data + % nonmissing = find(all(data(:,[component_mat(i,1) component_mat(i,2)]),2)); + % [b,m,n] = unique(data(nonmissing,[component_mat(i,1) component_mat(i,2)]),'rows'); + + [b,m,n] = unique(data(:,[component_mat(i,1) component_mat(i,2)]),'rows'); + data_clique(:,k) = n; + % data_clique(nonmissing,k) = n; + + % noalle_clique(k) = size(unique(data(nonmissing,component_mat(i,1))),1)*... + % size(unique(data(nonmissing,component_mat(i,2))),1); + noalle_clique(k) = size(unique(data(:,component_mat(i,1))),1)*... + size(unique(data(:,component_mat(i,2))),1); + k = k+1; + else + for j = 1:cardinality(i)-2 + % dealing with missing data + % nonmissing = find(all(data(:,[component_mat(i,j) component_mat(i,j+1) component_mat(i,j+2)]),2)); + % [b,m,n] = unique(data(nonmissing,[component_mat(i,j) component_mat(i,j+1) component_mat(i,j+2)]),'rows'); + % data_clique(nonmissing,k) = n; + + [b,m,n] = unique(data(:,[component_mat(i,j) component_mat(i,j+1) component_mat(i,j+2)]),'rows'); + data_clique(:,k) = n; + + % noalle_clique(k) = size(unique(data(nonmissing,component_mat(i,j))),1)*... + % size(unique(data(nonmissing,component_mat(i,j+1))),1)*size(unique(data(nonmissing,component_mat(i,j+2))),1); + noalle_clique(k) = size(unique(data(:,component_mat(i,j))),1)*... + size(unique(data(:,component_mat(i,j+1))),1)*size(unique(data(:,component_mat(i,j+2))),1); + k = k+1; + end + end + end + + k = 1; + for i = 1:ncomponents + if (cardinality(i)>2) + for j = 2:cardinality(i)-2 + % dealing with missing data + % nonmissing = find(all(data(:,[component_mat(i,j) component_mat(i,j+1)]),2)); + % [b,m,n] = unique(data(nonmissing,[component_mat(i,j) component_mat(i,j+1)]),'rows'); + [b,m,n] = unique(data(:,[component_mat(i,j) component_mat(i,j+1)]),'rows'); + % data_separator(nonmissing,k) = n; + data_separator(:,k) = n; + % noalle_separator(k) = size(unique(data(nonmissing,component_mat(i,j))),1)*... + % size(unique(data(nonmissing,component_mat(i,j+1))),1); + noalle_separator(k) = size(unique(data(:,component_mat(i,j))),1)*... + size(unique(data(:,component_mat(i,j+1))),1); + k = k+1; + end + end + end + + +elseif strcmp(linkage_model,'independent') + data_clique = data(:,[1:end-1]); + data_separator = []; + noalle_clique = []; + noalle_separator = []; + +elseif strcmp(linkage_model,'random') + adj_mat = mk_adjmat(component_mat); + nloci = size(adj_mat,1); + ns = 4*ones(1,nloci); + porder = []; + stages = {[1:nloci]}; + clusters = {}; + + [jtree,root2,cliques,B,w]=graph_to_jtree(adj_mat,ns,porder,stages,clusters); + C = length(cliques); + [is,js] = find(jtree > 0); + separator = cell(C,C); + for k=1:length(is) + i = is(k); j = js(k); + separator{i,j} = find(B(i,:) & B(j,:)); % intersect(cliques{i}, cliques{j}); + end + + + nindividuals = size(data,1); + if nloci ~= size(data,2)-1 error('Error in transform3.m'); + return + end + ncliques = length(cliques); + [i,j] = find(~cellfun('isempty',triu(separator))); + ij = [i j]; + nseparators = length(i); + + data_clique = zeros(nindividuals, ncliques); + noalle_clique = zeros(ncliques,1); + data_separator = zeros(nindividuals, nseparators); + noalle_separator = zeros(nseparators,1); + + k = 1; + for i = 1:ncliques + [b,m,n] = unique(data(:,cliques{i}),'rows'); + data_clique(:,k) = n; + for j = 1:length(cliques{i}) + sz(j) = size(unique(data(:,cliques{i}(j))),1); + end + noalle_clique(k) = prod(sz); + k = k+1; + clear sz; + end + k = 1; + for i = 1:nseparators + [b,m,n] = unique(data(:,separator{ij(i,1),ij(i,2)}),'rows'); + data_separator(:,k) = n; + + for j = 1:length(separator{ij(i,1),ij(i,2)}) + sz(j) = size(unique(data(:,separator{ij(i,1),ij(i,2)}(j))),1); + end + noalle_separator(k) = prod(sz); + k = k+1; + clear sz; + end +end +clear data; + + + + + diff --git a/matlab/linkage/transform4.m b/matlab/linkage/transform4.m new file mode 100644 index 0000000..e43ca70 --- /dev/null +++ b/matlab/linkage/transform4.m @@ -0,0 +1,235 @@ +function [data_clique, data_separator, noalle_clique, noalle_separator] = transform4(data, component_mat, linkage_model) +% Filename: transform4.m +% [data_clique,data_clique,noalle_clique, noalle_separator] = tranform4(data, component_mat, linkage_model) + +% DATA SHRINKAGE STEP IN THE END TO SAVE MEMORY USE +% Further shrink the data using uint8 format. +% Make sure that allele numbers smaller than 256. + + +if strcmp(linkage_model,'linear') + nindividuals = size(data,1); + nloci = size(data,2)-1; + % if ~all(diag(adj_mat)), adj_mat = adj_mat + diag(ones(nloci,1));, end; + % [p,p,r]=dmperm(adj_mat); + + % UNSOLVED --- Need to find a better algorithm to count the numbers of + % separators ,cliques and singletons. + ncomponents = size(component_mat,1); + cardinality = sum(component_mat>0,2)'; + ncliques = sum(cardinality-1); + nsingletons = sum(cardinality==1); + nseparators = sum(cardinality(find(cardinality>1))-2); + + data_clique = zeros(nindividuals, ncliques+nsingletons,'uint16'); + noalle_clique = zeros(ncliques+nsingletons,1,'uint16'); + data_separator = zeros(nindividuals, nseparators,'uint16'); + noalle_separator = zeros(nseparators,1,'uint16'); + % data_separator = []; % used for vectorization + k = 1; + for i = 1:ncomponents + if (cardinality(i)==1) % singleton + data_clique(:,k) = data(:,component_mat(i,1)); + temp = unique(data(:,component_mat(i,1))); + if temp(1) == 0 % missing data found + noalle_clique(k) = size(temp,1) -1; + else + noalle_clique(k) = size(temp,1); + end + k = k+1; + else + for j = 1:cardinality(i)-1 + + % dealing with missing data + % nonmissing = find(all(data(:,[component_mat(i,j) component_mat(i,j+1)]),2)); + % [b,m,n] = unique(data(nonmissing,[component_mat(i,j) component_mat(i,j+1)]),'rows'); + % data_clique(nonmissing,k) = n; + % noalle_clique(k) = size(unique(data(nonmissing,component_mat(i,j))),1)*... + % size(unique(data(nonmissing,component_mat(i,j+1))),1); + + [b,m,n] = unique(data(:,[component_mat(i,j) component_mat(i,j+1)]),'rows'); + data_clique(:,k) = n; + + noalle_clique(k) = size(unique(data(:,component_mat(i,j))),1)*... + size(unique(data(:,component_mat(i,j+1))),1); + k = k+1; + end + + end + end + + k = 1; + for i = 1:ncomponents + if (cardinality(i)>2) + for j = 2:cardinality(i)-1 + data_separator(:,k) = data(:,component_mat(i,j)); + k = k+1; + end + end + end + if (k-1)~=nseparators + error('ERROR in transform.m'); + return + end + for i = 1:nseparators + % nonmissing = find(all(data_separator(:,i),2)); + % noalle_separator(i) = length(unique(data_separator(nonmissing,i))); + noalle_separator(i) = length(unique(data_separator(:,i))); + end + +elseif strcmp(linkage_model,'codon') + nindividuals = size(data,1); + nloci = size(data,2)-1; + ncomponents = size(component_mat,1); + cardinality = sum(component_mat>0,2)'; + + ncliques = sum(cardinality(find(cardinality>2))-2); + nsingletons = sum(cardinality==1 | cardinality ==2); + nseparators = sum(cardinality(find(cardinality>2))-3); + + data_clique = zeros(nindividuals,ncliques+nsingletons,'uint16'); + noalle_clique = zeros(ncliques+nsingletons,1,'uint16'); + data_separator = zeros(nindividuals,nseparators,'uint16'); + noalle_separator = zeros(nseparators,1,'uint16'); + k = 1; + for i = 1:ncomponents + if (cardinality(i)==1) % singleton + data_clique(:,k) = data(:,component_mat(i,1)); + temp = unique(data(:,component_mat(i,1))); + if temp(1) == 0 % missing data found + noalle_clique(k) = size(temp,1) -1; + else + noalle_clique(k) = size(temp,1); + end + k = k+1; + elseif (cardinality(i)==2) % transform to linear linkage + % dealing with missing data + % nonmissing = find(all(data(:,[component_mat(i,1) component_mat(i,2)]),2)); + % [b,m,n] = unique(data(nonmissing,[component_mat(i,1) component_mat(i,2)]),'rows'); + + [b,m,n] = unique(data(:,[component_mat(i,1) component_mat(i,2)]),'rows'); + data_clique(:,k) = uint16(n); + % data_clique(nonmissing,k) = n; + + % noalle_clique(k) = size(unique(data(nonmissing,component_mat(i,1))),1)*... + % size(unique(data(nonmissing,component_mat(i,2))),1); + noalle_clique(k) = size(unique(data(:,component_mat(i,1))),1)*... + size(unique(data(:,component_mat(i,2))),1); + k = k+1; + else + for j = 1:cardinality(i)-2 + % dealing with missing data + % nonmissing = find(all(data(:,[component_mat(i,j) component_mat(i,j+1) component_mat(i,j+2)]),2)); + % [b,m,n] = unique(data(nonmissing,[component_mat(i,j) component_mat(i,j+1) component_mat(i,j+2)]),'rows'); + % data_clique(nonmissing,k) = n; + + [b,m,n] = unique(data(:,[component_mat(i,j) component_mat(i,j+1) component_mat(i,j+2)]),'rows'); + data_clique(:,k) = uint16(n); + + % noalle_clique(k) = size(unique(data(nonmissing,component_mat(i,j))),1)*... + % size(unique(data(nonmissing,component_mat(i,j+1))),1)*size(unique(data(nonmissing,component_mat(i,j+2))),1); + noalle_clique(k) = size(unique(data(:,component_mat(i,j))),1)*... + size(unique(data(:,component_mat(i,j+1))),1)*size(unique(data(:,component_mat(i,j+2))),1); + k = k+1; + end + end + end + + k = 1; + for i = 1:ncomponents + if (cardinality(i)>2) + for j = 2:cardinality(i)-2 + % dealing with missing data + % nonmissing = find(all(data(:,[component_mat(i,j) component_mat(i,j+1)]),2)); + % [b,m,n] = unique(data(nonmissing,[component_mat(i,j) component_mat(i,j+1)]),'rows'); + [b,m,n] = unique(data(:,[component_mat(i,j) component_mat(i,j+1)]),'rows'); + % data_separator(nonmissing,k) = n; + data_separator(:,k) = uint16(n); + % noalle_separator(k) = size(unique(data(nonmissing,component_mat(i,j))),1)*... + % size(unique(data(nonmissing,component_mat(i,j+1))),1); + noalle_separator(k) = size(unique(data(:,component_mat(i,j))),1)*... + size(unique(data(:,component_mat(i,j+1))),1); + k = k+1; + end + end + end + + +elseif strcmpi(linkage_model,'Independent') + data_clique = data(:,[1:end-1]); + data_separator = []; + noalle_clique = max(data_clique); % Be sure the data is normalized. + noalle_separator = []; + +elseif strcmp(linkage_model,'random') + adj_mat = mk_adjmat(component_mat); + nloci = size(adj_mat,1); + ns = 4*ones(1,nloci); + porder = []; + stages = {[1:nloci]}; + clusters = {}; + + [jtree,root2,cliques,B,w]=graph_to_jtree(adj_mat,ns,porder,stages,clusters); + C = length(cliques); + [is,js] = find(jtree > 0); + separator = cell(C,C); + for k=1:length(is) + i = is(k); j = js(k); + separator{i,j} = find(B(i,:) & B(j,:)); % intersect(cliques{i}, cliques{j}); + end + + + nindividuals = size(data,1); + if nloci ~= size(data,2)-1 error('*** ERROR: in transform3.m'); + return + end + ncliques = length(cliques); + [i,j] = find(~cellfun('isempty',triu(separator))); + ij = [i j]; + nseparators = length(i); + + data_clique = zeros(nindividuals, ncliques); + noalle_clique = zeros(ncliques,1); + data_separator = zeros(nindividuals, nseparators); + noalle_separator = zeros(nseparators,1); + + k = 1; + for i = 1:ncliques + [b,m,n] = unique(data(:,cliques{i}),'rows'); + data_clique(:,k) = n; +% for j = 1:length(cliques{i}) +% sz(j) = size(unique(data(:,cliques{i}(j))),1); +% end +% noalle_clique(k) = prod(sz); + noalle_clique(k) = 4^length(cliques{i}); + k = k+1; +% clear sz; + end + k = 1; + for i = 1:nseparators + [b,m,n] = unique(data(:,separator{ij(i,1),ij(i,2)}),'rows'); + data_separator(:,k) = n; + +% for j = 1:length(separator{ij(i,1),ij(i,2)}) +% sz(j) = size(unique(data(:,separator{ij(i,1),ij(i,2)}(j))),1); +% end +% noalle_separator(k) = prod(sz); + noalle_separator(k) = 4^length(separator{ij(i,1),ij(i,2)}); + k = k+1; +% clear sz; + end +end + +% Data shrinkage step +informative_clique = find(noalle_clique~=1); +informative_separator = find(noalle_separator~=1); +data_clique = data_clique(:,informative_clique); +data_separator = data_separator(:,informative_separator); +noalle_clique = noalle_clique(informative_clique); +noalle_separator = noalle_separator(informative_separator); +clear data; + + + + + diff --git a/matlab/linkage/transform5.m b/matlab/linkage/transform5.m new file mode 100644 index 0000000..cf2c119 --- /dev/null +++ b/matlab/linkage/transform5.m @@ -0,0 +1,265 @@ +function [data_clique, data_separator, noalle_clique, noalle_separator, alphabet_cq, alphabet_sp, informative_clique, informative_separator] = ... + transform5(data, component_mat, varargin) +% Filename: transform4.m (modified, Lu Cheng, 17.02.2010) +% [data_clique,data_clique,noalle_clique, noalle_separator] = tranform4(data, component_mat, linkage_model) + +% DATA SHRINKAGE STEP IN THE END TO SAVE MEMORY USE +% Further shrink the data using uint8 format. +% Make sure that allele numbers smaller than 256. + +% Modified by Lu Cheng from transform4.m, 17.02.2010 +% New: The alphabet at each locus is stored in alphabet_cq and alphabet_sp +% And here we also assume there is no missing data + +linkage_model = 'codon'; + +if strcmp(linkage_model,'linear') + nindividuals = size(data,1); + nloci = size(data,2)-1; + % if ~all(diag(adj_mat)), adj_mat = adj_mat + diag(ones(nloci,1));, end; + % [p,p,r]=dmperm(adj_mat); + + % UNSOLVED --- Need to find a better algorithm to count the numbers of + % separators ,cliques and singletons. + ncomponents = size(component_mat,1); + cardinality = sum(component_mat>0,2)'; + ncliques = sum(cardinality-1); + nsingletons = sum(cardinality==1); + nseparators = sum(cardinality(find(cardinality>1))-2); + + data_clique = zeros(nindividuals, ncliques+nsingletons,'uint16'); + noalle_clique = zeros(ncliques+nsingletons,1,'uint16'); + data_separator = zeros(nindividuals, nseparators,'uint16'); + noalle_separator = zeros(nseparators,1,'uint16'); + % data_separator = []; % used for vectorization + k = 1; + for i = 1:ncomponents + if (cardinality(i)==1) % singleton + data_clique(:,k) = data(:,component_mat(i,1)); + temp = unique(data(:,component_mat(i,1))); + if temp(1) == 0 % missing data found + noalle_clique(k) = size(temp,1) -1; + else + noalle_clique(k) = size(temp,1); + end + k = k+1; + else + for j = 1:cardinality(i)-1 + + % dealing with missing data + % nonmissing = find(all(data(:,[component_mat(i,j) component_mat(i,j+1)]),2)); + % [b,m,n] = unique(data(nonmissing,[component_mat(i,j) component_mat(i,j+1)]),'rows'); + % data_clique(nonmissing,k) = n; + % noalle_clique(k) = size(unique(data(nonmissing,component_mat(i,j))),1)*... + % size(unique(data(nonmissing,component_mat(i,j+1))),1); + + [b,m,n] = unique(data(:,[component_mat(i,j) component_mat(i,j+1)]),'rows'); + data_clique(:,k) = n; + + noalle_clique(k) = size(unique(data(:,component_mat(i,j))),1)*... + size(unique(data(:,component_mat(i,j+1))),1); + k = k+1; + end + + end + end + + k = 1; + for i = 1:ncomponents + if (cardinality(i)>2) + for j = 2:cardinality(i)-1 + data_separator(:,k) = data(:,component_mat(i,j)); + k = k+1; + end + end + end + if (k-1)~=nseparators + error('ERROR in transform.m'); + return + end + for i = 1:nseparators + % nonmissing = find(all(data_separator(:,i),2)); + % noalle_separator(i) = length(unique(data_separator(nonmissing,i))); + noalle_separator(i) = length(unique(data_separator(:,i))); + end + +elseif strcmp(linkage_model,'codon') + nindividuals = size(data,1); + nloci = size(data,2)-1; + ncomponents = size(component_mat,1); + cardinality = sum(component_mat>0,2)'; % length of each gene + + ncliques = sum(cardinality(cardinality>2)-2); + nsingletons = sum(cardinality==1 | cardinality ==2); + nseparators = sum(cardinality(cardinality>2)-3); + + data_clique = zeros(nindividuals,ncliques+nsingletons,'uint16'); + noalle_clique = zeros(ncliques+nsingletons,1,'uint16'); + data_separator = zeros(nindividuals,nseparators,'uint16'); + noalle_separator = zeros(nseparators,1,'uint16'); + + alphabet_cq = cell(1,ncliques+nsingletons); + alphabet_sp = cell(1,nseparators); + + k = 1; + for i = 1:ncomponents + if (cardinality(i)==1) % singleton + [b, m, n] = unique(data(:,component_mat(i,1))); + + alphabet_cq{k} = b; + data_clique(:,k) = unit16(n); + +% temp = unique(data(:,component_mat(i,1))); +% if temp(1) == 0 % missing data found +% noalle_clique(k) = size(temp,1) -1; +% else +% noalle_clique(k) = size(temp,1); +% end + k = k+1; + elseif (cardinality(i)==2) % transform to linear linkage + % dealing with missing data + % nonmissing = find(all(data(:,[component_mat(i,1) component_mat(i,2)]),2)); + % [b,m,n] = unique(data(nonmissing,[component_mat(i,1) component_mat(i,2)]),'rows'); + + [b,m,n] = unique(data(:,[component_mat(i,1) component_mat(i,2)]),'rows'); + data_clique(:,k) = uint16(n); + alphabet_cq{k} = b; + + % data_clique(nonmissing,k) = n; + + % noalle_clique(k) = size(unique(data(nonmissing,component_mat(i,1))),1)*... + % size(unique(data(nonmissing,component_mat(i,2))),1); + noalle_clique(k) = size(unique(data(:,component_mat(i,1))),1)*... + size(unique(data(:,component_mat(i,2))),1); + k = k+1; + else + for j = 1:cardinality(i)-2 + % dealing with missing data + % nonmissing = find(all(data(:,[component_mat(i,j) component_mat(i,j+1) component_mat(i,j+2)]),2)); + % [b,m,n] = unique(data(nonmissing,[component_mat(i,j) component_mat(i,j+1) component_mat(i,j+2)]),'rows'); + % data_clique(nonmissing,k) = n; + + [b,m,n] = unique(data(:,component_mat(i,j:j+2)),'rows'); + data_clique(:,k) = uint16(n); + alphabet_cq{k} = b; + + % noalle_clique(k) = size(unique(data(nonmissing,component_mat(i,j))),1)*... + % size(unique(data(nonmissing,component_mat(i,j+1))),1)*size(unique(data(nonmissing,component_mat(i,j+2))),1); + noalle_clique(k) = size(unique(data(:,component_mat(i,j))),1)*... + size(unique(data(:,component_mat(i,j+1))),1)*size(unique(data(:,component_mat(i,j+2))),1); + k = k+1; + end + end + end + + k = 1; + for i = 1:ncomponents + if (cardinality(i)>2) + for j = 2:cardinality(i)-2 + % dealing with missing data + % nonmissing = find(all(data(:,[component_mat(i,j) component_mat(i,j+1)]),2)); + % [b,m,n] = unique(data(nonmissing,[component_mat(i,j) component_mat(i,j+1)]),'rows'); + [b,m,n] = unique(data(:,[component_mat(i,j) component_mat(i,j+1)]),'rows'); + % data_separator(nonmissing,k) = n; + data_separator(:,k) = uint16(n); + alphabet_sp{k} = b; + + % noalle_separator(k) = size(unique(data(nonmissing,component_mat(i,j))),1)*... + % size(unique(data(nonmissing,component_mat(i,j+1))),1); + noalle_separator(k) = size(unique(data(:,component_mat(i,j))),1)*... + size(unique(data(:,component_mat(i,j+1))),1); + k = k+1; + end + end + end + + +elseif strcmpi(linkage_model,'Independent') + data_clique = data(:,[1:end-1]); + data_separator = []; + noalle_clique = max(data_clique); % Be sure the data is normalized. + noalle_separator = []; + +elseif strcmp(linkage_model,'random') + adj_mat = mk_adjmat(component_mat); + nloci = size(adj_mat,1); + ns = 4*ones(1,nloci); + porder = []; + stages = {[1:nloci]}; + clusters = {}; + + [jtree,root2,cliques,B,w]=graph_to_jtree(adj_mat,ns,porder,stages,clusters); + C = length(cliques); + [is,js] = find(jtree > 0); + separator = cell(C,C); + for k=1:length(is) + i = is(k); j = js(k); + separator{i,j} = find(B(i,:) & B(j,:)); % intersect(cliques{i}, cliques{j}); + end + + + nindividuals = size(data,1); + if nloci ~= size(data,2)-1 error('*** ERROR: in transform3.m'); + return + end + ncliques = length(cliques); + [i,j] = find(~cellfun('isempty',triu(separator))); + ij = [i j]; + nseparators = length(i); + + data_clique = zeros(nindividuals, ncliques); + noalle_clique = zeros(ncliques,1); + data_separator = zeros(nindividuals, nseparators); + noalle_separator = zeros(nseparators,1); + + k = 1; + for i = 1:ncliques + [b,m,n] = unique(data(:,cliques{i}),'rows'); + data_clique(:,k) = n; +% for j = 1:length(cliques{i}) +% sz(j) = size(unique(data(:,cliques{i}(j))),1); +% end +% noalle_clique(k) = prod(sz); + noalle_clique(k) = 4^length(cliques{i}); + k = k+1; +% clear sz; + end + k = 1; + for i = 1:nseparators + [b,m,n] = unique(data(:,separator{ij(i,1),ij(i,2)}),'rows'); + data_separator(:,k) = n; + +% for j = 1:length(separator{ij(i,1),ij(i,2)}) +% sz(j) = size(unique(data(:,separator{ij(i,1),ij(i,2)}(j))),1); +% end +% noalle_separator(k) = prod(sz); + noalle_separator(k) = 4^length(separator{ij(i,1),ij(i,2)}); + k = k+1; +% clear sz; + end +end + +% Data shrinkage step + +if ~isempty(varargin) + informative_clique = varargin{1}; + informative_separator = varargin{2}; +else + informative_clique = find(noalle_clique~=1); + informative_separator = find(noalle_separator~=1); +end + +data_clique = data_clique(:,informative_clique); +data_separator = data_separator(:,informative_separator); + +alphabet_cq = alphabet_cq(informative_clique); +alphabet_sp = alphabet_sp(informative_separator); + +noalle_clique = noalle_clique(informative_clique); +noalle_separator = noalle_separator(informative_separator); +clear data; + + + + + diff --git a/matlab/linkage/xls2structure.m b/matlab/linkage/xls2structure.m new file mode 100644 index 0000000..3e47e20 --- /dev/null +++ b/matlab/linkage/xls2structure.m @@ -0,0 +1,19 @@ +function xls2structure(source, target) +% XLS2STRUCTURE converts a MLST data in xls format into a Structure data + +[data, component_mat, popnames] = processxls(source); + +data = data(:,[1:end-1]); +[ninds, nloci] = size(data); + +fid = fopen(target,'w'); +if (fid ~= -1) + for i = 1:ninds + fprintf(fid,'%s\t', popnames{i,1}{1}); + for j = 1:nloci + fprintf(fid,'%d\t',data(i,j)); + end + fprintf(fid,'\n'); + end +end +fclose(fid); diff --git a/matlab/parallel/admix_parallel.m b/matlab/parallel/admix_parallel.m new file mode 100644 index 0000000..210e297 --- /dev/null +++ b/matlab/parallel/admix_parallel.m @@ -0,0 +1,776 @@ +function admix_parallel(options) +% ADMIX_PARALLEL is the command line version of the baps partition with +% admixture models. +% Input: options is a struct generated by parallel.m + +%-------------------------------------------------------------------------- +%- Syntax check out +%-------------------------------------------------------------------------- +rand('state',0); % used for debugging +outp = [options.outputMat '.txt']; +inp = options.dataFile; +clusters = options.clusters; +fprintf(1,'Parallel computing...\n'); +fprintf(1,'Admixture analysis for cluster(s): %s.\n',num2str(clusters)); + +global COUNTS; global PARTITION; global SUMCOUNTS; +clearGlobalVars; + +struct_array = load(options.dataFile); +if isfield(struct_array,'c') %Matlab versio + c = struct_array.c; + if ~isfield(c,'PARTITION') | ~isfield(c,'rowsFromInd') + disp('*** ERROR: Incorrect data format'); + return + end +elseif isfield(struct_array,'PARTITION') %Mideva versio + c = struct_array; + if ~isfield(c,'rowsFromInd') + disp('*** ERROR: Incorrect data format'); + return + end +else + disp('*** ERROR: Incorrect data format'); + return; +end + + +if isfield(c, 'gene_lengths') && ... + (strcmp(c.mixtureType,'linear_mix') | ... + strcmp(c.mixtureType,'codon_mix')) % if the mixture is from a linkage model + % Redirect the call to the linkage admixture function. + fprintf(1,'Redirecting to Linkage Model Admixture\n'); + c.data = noIndex(c.data,c.noalle); % call function noindex to remove the index column + linkage_admix_parallel(c,options); + return + end + +% This section is disabled, -Jing 27.10.2009 +% if isfield(c, 'gene_lengths') && ... +% (strcmp(c.mixtureType,'linkage_mix') | ... +% strcmp(c.mixtureType,'codon_mix')) % if the mixture is from a linkage model +% % Redirect the call to the linkage admixture function. +% c.data = noIndex(c.data,c.noalle); % call function noindex to remove the index column +% linkage_admix(c); +% return +% end + +PARTITION = c.PARTITION; COUNTS = c.COUNTS; SUMCOUNTS = c.SUMCOUNTS; +alleleCodes = c.alleleCodes; adjprior = c.adjprior; popnames = c.popnames; +rowsFromInd = c.rowsFromInd; data = c.data; npops = c.npops; noalle = c.noalle; + + +% answers = inputdlg({['Input the minimum size of a population that will'... +% ' be taken into account when admixture is estimated.']},... +% 'Input minimum population size',[1],... +% {'5'}); +% if isempty(answers) return; end +% ------------------------------------------- +% NEW: for parallel computing +% ------------------------------------------- +alaRaja = options.minSize; +[npops, clusterIndex] = poistaLiianPienet(npops, rowsFromInd, alaRaja); +if length(clusterIndex)0 + osuusTaulu(PARTITION(ind)) = 1; + else + % Yksilöt, joita ei ole sijoitettu mihinkään koriin. + arvot = zeros(1,npops); + for q=1:npops + osuusTaulu = zeros(1,npops); + osuusTaulu(q) = 1; + arvot(q) = computeIndLogml(omaFreqs, osuusTaulu); + end + [iso_arvo, isoimman_indeksi] = max(arvot); + osuusTaulu = zeros(1,npops); + osuusTaulu(isoimman_indeksi) = 1; + PARTITION(ind)=isoimman_indeksi; + end + logml = computeIndLogml(omaFreqs, osuusTaulu); + logmlAlku = logml; + for osuus = [0.5 0.25 0.05 0.01] + [osuusTaulu, logml] = etsiParas(osuus, osuusTaulu, omaFreqs, logml); + end + logmlLoppu = logml; + likelihood(ind) = logmlLoppu-logmlAlku; + end +end + +% Analyze further only individuals who have log-likelihood ratio larger than 3: +% --------------------------------------- +% PARALLEL COMPUTING IMPLEMENTED HERE +% --------------------------------------- +to_investigate = (find(likelihood>3))'; +admix_populaatiot = unique(PARTITION(to_investigate)); +validCluster = intersect(clusters, admix_populaatiot); % for the chosen clusters + +ix = zeros(length(to_investigate),1); +for i = 1:length(validCluster) + ix = ix | (PARTITION(to_investigate) == validCluster(i)); +end +if ~any(ix) to_investigate = []; +else to_investigate = to_investigate(ix); +end +admix_populaatiot = unique(PARTITION(to_investigate)); + +disp('Possibly admixed individuals in the chosen clusters: '); +if isempty(to_investigate) + disp('none'); + disp('Admixture analysis terminated.'); + return +else + for i = 1:length(to_investigate) + disp(num2str(to_investigate(i))); + end +end + +disp(' '); +disp('clusters for possibly admixed individuals: '); + +for i = 1:length(admix_populaatiot) + disp(num2str(admix_populaatiot(i))); +end + +% THUS, there are two types of individuals, who will not be analyzed with +% simulated allele frequencies: those who belonged to a mini-population +% which was removed, and those who have log-likelihood ratio less than 3. +% The value in the PARTITION for the first kind of individuals is 0. The +% second kind of individuals can be identified, because they do not +% belong to "to_investigate" array. When the results are presented, the +% first kind of individuals are omitted completely, while the second kind +% of individuals are completely put to the population, where they ended up +% in the mixture analysis. These second type of individuals will have a +% unit p-value. + + +% Simulate allele frequencies a given number of times and save the average +% result to "proportionsIt" array. + +proportionsIt = zeros(ninds,npops); +for iterationNum = 1:iterationCount + disp(['Iter: ' num2str(iterationNum)]); + allfreqs = simulateAllFreqs(noalle); % Allele frequencies on this iteration. + + for ind=to_investigate + %disp(num2str(ind)); + omaFreqs = computePersonalAllFreqs(ind, data, allfreqs, rowsFromInd); + osuusTaulu = zeros(1,npops); + if PARTITION(ind)==0 + % Yksil?on outlier + elseif PARTITION(ind)~=0 + if PARTITION(ind)>0 + osuusTaulu(PARTITION(ind)) = 1; + else + % Yksilöt, joita ei ole sijoitettu mihinkään koriin. + arvot = zeros(1,npops); + for q=1:npops + osuusTaulu = zeros(1,npops); + osuusTaulu(q) = 1; + arvot(q) = computeIndLogml(omaFreqs, osuusTaulu); + end + [iso_arvo, isoimman_indeksi] = max(arvot); + osuusTaulu = zeros(1,npops); + osuusTaulu(isoimman_indeksi) = 1; + PARTITION(ind)=isoimman_indeksi; + end + logml = computeIndLogml(omaFreqs, osuusTaulu); + + for osuus = [0.5 0.25 0.05 0.01] + [osuusTaulu, logml] = etsiParas(osuus, osuusTaulu, omaFreqs, logml); + end + end + proportionsIt(ind,:) = proportionsIt(ind,:).*(iterationNum-1) + osuusTaulu; + proportionsIt(ind,:) = proportionsIt(ind,:)./iterationNum; + end +end + +%disp(['Creating ' num2str(nrefIndsInPop) ' reference individuals from ']); +%disp('each population.'); + +%allfreqs = simulateAllFreqs(noalle); % Simuloidaan alleelifrekvenssisetti +allfreqs = computeAllFreqs2(noalle); % Koitetaan tällaista. + + +% Initialize the data structures, which are required in taking the missing +% data into account: +n_missing_levels = zeros(npops,1); % number of different levels of "missingness" in each pop (max 3). +missing_levels = zeros(npops,3); % the mean values for different levels. +missing_level_partition = zeros(ninds,1); % level of each individual (one of the levels of its population). +for i=1:npops + inds = find(PARTITION==i); + % Proportions of non-missing data for the individuals: + non_missing_data = zeros(length(inds),1); + for j = 1:length(inds) + ind = inds(j); + non_missing_data(j) = length(find(data((ind-1)*rowsFromInd+1:ind*rowsFromInd,:)>0)) ./ (rowsFromInd*nloci); + end + if all(non_missing_data>0.9) + n_missing_levels(i) = 1; + missing_levels(i,1) = mean(non_missing_data); + missing_level_partition(inds) = 1; + else + [ordered, ordering] = sort(non_missing_data); + %part = learn_simple_partition(ordered, 0.05); + part = learn_partition_modified(ordered); + aux = sortrows([part ordering],2); + part = aux(:,1); + missing_level_partition(inds) = part; + n_levels = length(unique(part)); + n_missing_levels(i) = n_levels; + for j=1:n_levels + missing_levels(i,j) = mean(non_missing_data(find(part==j))); + end + end +end + +% Create and analyse reference individuals for populations +% with potentially admixed individuals: +refTaulu = zeros(npops,100,3); +for pop = admix_populaatiot' + + for level = 1:n_missing_levels(pop) + + potential_inds_in_this_pop_and_level = ... + find(PARTITION==pop & missing_level_partition==level &... + likelihood>3); % Potential admix individuals here. + + if ~isempty(potential_inds_in_this_pop_and_level) + + %refData = simulateIndividuals(nrefIndsInPop,rowsFromInd,allfreqs); + refData = simulateIndividuals(nrefIndsInPop, rowsFromInd, allfreqs, ... + pop, missing_levels(pop,level)); + + disp(['Analysing the reference individuals from pop ' num2str(pop) ' (level ' num2str(level) ').']); + refProportions = zeros(nrefIndsInPop,npops); + for iter = 1:iterationCountRef + %disp(['Iter: ' num2str(iter)]); + allfreqs = simulateAllFreqs(noalle); + + for ind = 1:nrefIndsInPop + omaFreqs = computePersonalAllFreqs(ind, refData, allfreqs, rowsFromInd); + osuusTaulu = zeros(1,npops); + osuusTaulu(pop)=1; + logml = computeIndLogml(omaFreqs, osuusTaulu); + for osuus = [0.5 0.25 0.05 0.01] + [osuusTaulu, logml] = etsiParas(osuus, osuusTaulu, omaFreqs, logml); + end + refProportions(ind,:) = refProportions(ind,:).*(iter-1) + osuusTaulu; + refProportions(ind,:) = refProportions(ind,:)./iter; + end + end + for ind = 1:nrefIndsInPop + omanOsuus = refProportions(ind,pop); + if round(omanOsuus*100)==0 + omanOsuus = 0.01; + end + if abs(omanOsuus)<1e-5 + omanOsuus = 0.01; + end + refTaulu(pop, round(omanOsuus*100),level) = refTaulu(pop, round(omanOsuus*100),level)+1; + end + end + end +end + +% Rounding of the results: +proportionsIt = proportionsIt.*100; proportionsIt = round(proportionsIt); +proportionsIt = proportionsIt./100; +for ind = 1:ninds + if ~any(to_investigate==ind) + if PARTITION(ind)>0 + proportionsIt(ind,PARTITION(ind))=1; + end + else + % In case of a rounding error, the sum is made equal to unity by + % fixing the largest value. + if (PARTITION(ind)>0) & (sum(proportionsIt(ind,:)) ~= 1) + [isoin,indeksi] = max(proportionsIt(ind,:)); + erotus = sum(proportionsIt(ind,:))-1; + proportionsIt(ind,indeksi) = isoin-erotus; + end + end +end + +% Calculate p-value for each individual: +uskottavuus = zeros(ninds,1); +for ind = 1:ninds + pop = PARTITION(ind); + if pop==0 % Individual is outlier + uskottavuus(ind)=1; + elseif isempty(find(to_investigate==ind)) + % Individual had log-likelihood ratio<3 + uskottavuus(ind)=1; + else + omanOsuus = proportionsIt(ind,pop); + if abs(omanOsuus)<1e-5 + omanOsuus = 0.01; + end + if round(omanOsuus*100)==0 + omanOsuus = 0.01; + end + level = missing_level_partition(ind); + refPienempia = sum(refTaulu(pop, 1:round(100*omanOsuus), level)); + uskottavuus(ind) = refPienempia / nrefIndsInPop; + end +end + +% tulostaAdmixtureTiedot(proportionsIt, uskottavuus, alaRaja, iterationCount); + +% viewPartition(proportionsIt, popnames); + + +c.proportionsIt = proportionsIt; +c.pvalue = uskottavuus; % Added by Jing +c.mixtureType = 'admix'; % Jing +c.admixnpops = npops; + +c.clusters = clusters; % added for parallel computing +c.minsize = alaRaja; +c.iters = iterationCount; +c.refInds = nrefIndsInPop; +c.refIters = iterationCountRef; + + +fprintf(1,'Saving the result...') +try +% save(options.outputMat, 'c'); + save(options.outputMat, 'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + fprintf(1,'Finished.\n'); +catch + display('*** ERROR in saving the result.'); +end + + +%---------------------------------------------------------------------------- + + +function [npops, clusterIndex] = poistaLiianPienet(npops, rowsFromInd, alaraja) +% Muokkaa tulokset muotoon, jossa outlier yksilöt on +% poistettu. Tarkalleen ottaen poistaa ne populaatiot, +% joissa on vähemmän kuin 'alaraja':n verran yksilöit? + +% NEW: clusterIndex is output for parallel computing. - Jing +global PARTITION; +global COUNTS; +global SUMCOUNTS; + +clusterIndex = [1:npops]'; +popSize=zeros(1,npops); +for i=1:npops + popSize(i)=length(find(PARTITION==i)); +end +miniPops = find(popSize0))); +for n=1:length(korit) + kori = korit(n); + yksilot = find(PARTITION==kori); + PARTITION(yksilot) = n; + clusterIndex(kori) = n; +end +COUNTS(:,:,miniPops) = []; +SUMCOUNTS(miniPops,:) = []; + +npops = npops-length(miniPops); + +%------------------------------------------------------------------------ + +function clearGlobalVars + +global COUNTS; COUNTS = []; +global SUMCOUNTS; SUMCOUNTS = []; +global PARTITION; PARTITION = []; +global POP_LOGML; POP_LOGML = []; + +%-------------------------------------------------------- + + +function allFreqs = computeAllFreqs2(noalle) +% Lisää a priori jokaista alleelia +% joka populaation joka lokukseen j 1/noalle(j) verran. + +global COUNTS; +global SUMCOUNTS; + +max_noalle = size(COUNTS,1); +nloci = size(COUNTS,2); +npops = size(COUNTS,3); + +sumCounts = SUMCOUNTS+ones(size(SUMCOUNTS)); +sumCounts = reshape(sumCounts', [1, nloci, npops]); +sumCounts = repmat(sumCounts, [max_noalle, 1 1]); + +prioriAlleelit = zeros(max_noalle,nloci); +for j=1:nloci + prioriAlleelit(1:noalle(j),j) = 1/noalle(j); +end +prioriAlleelit = repmat(prioriAlleelit, [1,1,npops]); +counts = COUNTS + prioriAlleelit; +allFreqs = counts./sumCounts; + + +function allfreqs = simulateAllFreqs(noalle) +% Lisää jokaista alleelia joka populaation joka lokukseen j 1/noalle(j) +% verran. Näin saatuja counts:eja vastaavista Dirichlet-jakaumista +% simuloidaan arvot populaatioiden alleelifrekvensseille. + +global COUNTS; + +max_noalle = size(COUNTS,1); +nloci = size(COUNTS,2); +npops = size(COUNTS,3); + +prioriAlleelit = zeros(max_noalle,nloci); +for j=1:nloci + prioriAlleelit(1:noalle(j),j) = 1/noalle(j); +end +prioriAlleelit = repmat(prioriAlleelit, [1,1,npops]); +counts = COUNTS + prioriAlleelit; +allfreqs = zeros(size(counts)); + +for i=1:npops + for j=1:nloci + simuloidut = randdir(counts(1:noalle(j),j,i) , noalle(j)); + allfreqs(1:noalle(j),j,i) = simuloidut; + end +end + +%-------------------------------------------------------------------------- + + +function refData = simulateIndividuals(n,rowsFromInd,allfreqs,pop, missing_level) +% simulate n individuals from population pop, such that approximately +% proportion "missing_level" of the alleles are present. + +nloci = size(allfreqs,2); + +refData = zeros(n*rowsFromInd,nloci); +counter = 1; % which row will be generated next. + +for ind = 1:n + for loc = 1:nloci + for k=0:rowsFromInd-1 + if randarvo); +all = min(isommat); + + +%-------------------------------------------------------------------------- + + +function omaFreqs = computePersonalAllFreqs(ind, data, allFreqs, rowsFromInd) +% Laskee npops*(rowsFromInd*nloci) taulukon, jonka kutakin saraketta +% vastaa yksilön ind alleeli. Eri rivit ovat alleelin alkuperäfrekvenssit +% eri populaatioissa. Jos yksilölt?puuttuu jokin alleeli, niin vastaavaan +% kohtaa tulee sarake ykkösi? + +global COUNTS; +nloci = size(COUNTS,2); +npops = size(COUNTS,3); + +rows = data(computeRows(rowsFromInd, ind, 1),:); + +omaFreqs = zeros(npops, (rowsFromInd*nloci)); +pointer = 1; +for loc=1:size(rows,2) + for all=1:size(rows,1) + if rows(all,loc)>=0 + try, + omaFreqs(:,pointer) = ... + reshape(allFreqs(rows(all,loc),loc,:), [npops,1]); + catch + a=0; + end + else + omaFreqs(:,pointer) = ones(npops,1); + end + pointer = pointer+1; + end +end + + +%--------------------------------------------------------------------------- + + +function loggis = computeIndLogml(omaFreqs, osuusTaulu) +% Palauttaa yksilön logml:n, kun oletetaan yksilön alkuperät +% määritellyiksi kuten osuusTaulu:ssa. + +apu = repmat(osuusTaulu', [1 size(omaFreqs,2)]); +apu = apu .* omaFreqs; +apu = sum(apu); + +apu = log(apu); + +loggis = sum(apu); + + +%-------------------------------------------------------------------------- + + +function osuusTaulu = suoritaMuutos(osuusTaulu, osuus, indeksi) +% Päivittää osuusTaulun muutoksen jälkeen. + +global COUNTS; +npops = size(COUNTS,3); + +i1 = rem(indeksi,npops); +if i1==0, i1 = npops; end; +i2 = ceil(indeksi / npops); + +osuusTaulu(i1) = osuusTaulu(i1)-osuus; +osuusTaulu(i2) = osuusTaulu(i2)+osuus; + + +%------------------------------------------------------------------------- + + +function [osuusTaulu, logml] = etsiParas(osuus, osuusTaulu, omaFreqs, logml) + +ready = 0; +while ready ~= 1 + muutokset = laskeMuutokset4(osuus, osuusTaulu, omaFreqs, logml); + [maxMuutos, indeksi] = max(muutokset(1:end)); + if maxMuutos>0 + osuusTaulu = suoritaMuutos(osuusTaulu, osuus, indeksi); + logml = logml + maxMuutos; + else + ready = 1; + end +end + + + +%--------------------------------------------------------------------------- + + +function muutokset = laskeMuutokset4(osuus, osuusTaulu, omaFreqs, logml) +% Palauttaa npops*npops taulun, jonka alkio (i,j) kertoo, mik?on +% muutos logml:ss? mikäli populaatiosta i siirretään osuuden verran +% todennäköisyysmassaa populaatioon j. Mikäli populaatiossa i ei ole +% mitään siirrettävää, on vastaavassa kohdassa rivi nollia. + +global COUNTS; +npops = size(COUNTS,3); + +notEmpty = find(osuusTaulu>0.005); +muutokset = zeros(npops); +empties = ~notEmpty; + +for i1=notEmpty + + osuusTaulu(i1) = osuusTaulu(i1)-osuus; + + for i2 = [1:i1-1 i1+1:npops] + osuusTaulu(i2) = osuusTaulu(i2)+osuus; + loggis = computeIndLogml(omaFreqs, osuusTaulu); + muutokset(i1,i2) = loggis-logml; + osuusTaulu(i2) = osuusTaulu(i2)-osuus; + end + + osuusTaulu(i1) = osuusTaulu(i1)+osuus; +end + + + + +function g=randga(a,b) +flag = 0; +if a>1 +c1 = a-1; c2 = (a-(1/(6*a)))/c1; c3 = 2/c1; c4 = c3+2; c5 = 1/sqrt(a); +U1=-1; +while flag == 0, +if a<=2.5, +U1=rand;U2=rand; +else +while ~(U1>0 & U1<1), +U1=rand;U2=rand; +U1 = U2 + c5*(1-1.86*U1); +end %while +end %if +W = c2*U2/U1; +if c3*U1+W+(1/W)<=c4, +flag = 1; +g = c1*W/b; +elseif c3*log(U1)-log(W)+W<1, +flag = 1; +g = c1*W/b; +else +U1=-1; +end %if +end %while flag +elseif a==1 +g=sum(-(1/b)*log(rand(a,1))); +else +while flag == 0, +U = rand(2,1); +if U(1)>exp(1)/(a+exp(1)), +g = -log(((a+exp(1))*(1-U(1)))/(a*exp(1))); +if U(2)<=g^(a-1), +flag = 1; +end %if +else +g = ((a+exp(1))*U(1)/((exp(1))^(1/a))); +if U(2)<=exp(-g), +flag = 1; +end %if +end %if +end %while flag +g=g/b; +end %if; + + +%------------------------------------------------- + +function svar=randdir(counts,nc) +% Käyttöesim randdir([10;30;60],3) + +svar=zeros(nc,1); +for i=1:nc + svar(i,1)=randga(counts(i,1),1); +end +svar=svar/sum(svar); + +%------------------------------------------------------------------------------------- + + +function rows = computeRows(rowsFromInd, inds, ninds) +% Individuals inds have been given. The function returns a vector, +% containing the indices of the rows, which contain data from the +% individuals. + +rows = inds(:, ones(1,rowsFromInd)); +rows = rows*rowsFromInd; +miinus = repmat(rowsFromInd-1 : -1 : 0, [ninds 1]); +rows = rows - miinus; +rows = reshape(rows', [1,rowsFromInd*ninds]); + +%-------------------------------------------------------------------------- +%----- + +function str = ownNum2Str(number) + +absolute = abs(number); + +if absolute < 1000 + str = num2str(number); +elseif absolute < 10000000 + first_three = rem(number,1000); + next_four = (number - first_three) /1000; + first_three = abs(first_three); + if first_three<10 + first_three = ['00' num2str(first_three)]; + elseif first_three<100 + first_three = ['0' num2str(first_three)]; + else + first_three = num2str(first_three); + end; + str = [num2str(next_four) first_three]; +elseif absolute < 100000000 + first_four = rem(number,10000); + next_four = (number - first_four) /10000; + first_four = abs(first_four); + if first_four<10 + first_four = ['000' num2str(first_four)]; + elseif first_four<100 + first_four = ['00' num2str(first_four)]; + elseif first_four<1000 + first_four = ['0' num2str(first_four)]; + else + first_four = num2str(first_four); + end; + str = [num2str(next_four) first_four]; +else + str = num2str(number); +end; + +%------------------------------------------------ + + +function part = learn_partition_modified(ordered) +% This function is called only if some individual has less than 90 per cent +% non-missing data. The function uses fuzzy clustering for the "non-missingness" +% values, finding maximum three clusters. If two of the found clusters are such +% that all the values are >0.9, then those two are further combined. + +part = learn_simple_partition(ordered,0.05); +nclust = length(unique(part)); +if nclust==3 + mini_1 = min(ordered(find(part==1))); + mini_2 = min(ordered(find(part==2))); + mini_3 = min(ordered(find(part==3))); + + if mini_1>0.9 & mini_2>0.9 + part(find(part==2)) = 1; + part(find(part==3)) = 2; + + elseif mini_1>0.9 & mini_3>0.9 + part(find(part==3)) = 1; + + elseif mini_2>0.9 & mini_3>0.9 + % This is the one happening in practice, since the values are + % ordered, leading to mini_1 <= mini_2 <= mini_3 + part(find(part==3)) = 2; + end +end \ No newline at end of file diff --git a/matlab/parallel/compare.m b/matlab/parallel/compare.m new file mode 100644 index 0000000..65958ca --- /dev/null +++ b/matlab/parallel/compare.m @@ -0,0 +1,81 @@ +function compare(varargin) +% COMPARE compares the results from multiple runs. +% input: is a group of result .mat files on the same data. + +% Example: compare('e:\data\result1.mat','e:\data\result2.mat',...) +% or call it from the BAPS menu. + +if nargin == 1 + error('number of input arguments must be >=2'); +end + +if nargin == 0 + out = uipickfiles('FilterSpec','*.mat',... + 'Prompt','Select mixture results: be sure that the underlying data and models are consistent.'); + if isnumeric(out) + return + end + nfiles = length(out); + filesin = out; + +else + nfiles = nargin; + filesin = varargin; +end + +display('---------------------------------------------------'); +fprintf(1,'Comparing results ...\n'); +if nfiles == 1 + disp('*** ERROR: Too few files.'); + return +end +for i = 1:nfiles + struct_array = load(filesin{i}); + if isfield(struct_array,'c') %Matlab versio + c = struct_array.c; + clear struct_array; + if ~isfield(c,'PARTITION') || ~isfield(c,'rowsFromInd') || strcmp(c.mixtureType,'admix') + fprintf(1,'*** ERROR: Incorrect mixture result in file %d\n',i ); + return + end + elseif isfield(struct_array,'PARTITION') %Mideva versio + c = struct_array; + if ~isfield(c,'rowsFromInd') + fprintf(1,'*** ERROR: Incorrect mixture result in file %d\n',i ); + return + end + else + fprintf(1,'*** ERROR: Incorrect mixture result in file %d\n',i ); + return; + end + + try + partitionMat(i,:) = sort_partition(c.PARTITION); + catch + error('*** ERROR: inconsistent results.'); + end + mixtureType{i} = c.mixtureType; + logml(i) = c.logml; + clear c; +end + +len_mixture = length(mixtureType{1}); +for i = 2:nfiles + if len_mixture ~= length(mixtureType{i}); + error('*** ERROR: inconsistent mixture types.'); + end +end + + +% Find the best partition +best = logical(logml == max(logml)); +[uniquepartition, ind1, ind2] = unique(partitionMat(best,:), 'rows'); +fprintf(1,'Best partition was found at ''%s''\n',filesin{best}); + + + + + + + + diff --git a/matlab/parallel/compare_admix.m b/matlab/parallel/compare_admix.m new file mode 100644 index 0000000..defd0bc --- /dev/null +++ b/matlab/parallel/compare_admix.m @@ -0,0 +1,142 @@ +function compare_admix(varargin) +% COMPARE compares the results from multiple runs for admixture results +% input: is a group of result .mat files on the same data. + +% Example: compare('e:\data\result1.mat','e:\data\result2.mat',...) +% or call it from the BAPS menu. + +if nargin == 1 + error('number of input arguments must be >=2'); +end + +if nargin == 0 + out = uipickfiles('FilterSpec','*.mat',... + 'Prompt','Select admixture results: be sure that the underlying data and parameters are consistent.'); + if isnumeric(out) + return + end + nfiles = length(out); + filesin = out; + +else + nfiles = nargin; + filesin = varargin; +end + +display('---------------------------------------------------'); +fprintf(1,'Comparing results ...\n'); +minsize = zeros(nfiles,1); +iters = zeros(nfiles,1); +refInds = zeros(nfiles,1); +refIters = zeros(nfiles,1); +prop = cell(nfiles,1); + +clusters = cell(nfiles,1); +adjprior = []; + +if nfiles == 1 + disp('*** ERROR: Too few files.'); + return +end + +% read admixture files +for i = 1:nfiles + struct_array = load(filesin{i}); + if isfield(struct_array,'c') %Matlab versio + c = struct_array.c; + clear struct_array; + if ~isfield(c,'PARTITION') || ~isfield(c,'rowsFromInd') ... + || ~isfield(c,'proportionsIt') + fprintf(1,'*** ERROR: Incorrect admixture result in file %d\n',i ); + return + end + elseif isfield(struct_array,'PARTITION') %Mideva versio + c = struct_array; + if ~isfield(c,'rowsFromInd') + fprintf(1,'*** ERROR: Incorrect admixture result in file %d\n',i ); + return + end + else + fprintf(1,'*** ERROR: Incorrect admixture result in file %d\n',i ); + return; + end + + prop{i} = c.proportionsIt; + pvalue(:,i) = c.pvalue; + clusters{i} = c.clusters; + popnames = c.popnames; + + % parameters + minsize(i) = c.minsize; + iters(i) = c.iters; + refInds(i) = c.refInds; + refIters(i) = c.refIters; + + if i==1 + adjprior = c.adjprior; + else + if ~isequal(adjprior,c.adjprior) + disp('*** ERROR: incosistent admixture results.'); + return + end + end + + clear c; +end + +if length(unique(minsize))~=1 || length(unique(iters))~=1 ... + || length(unique(refInds))~=1 || length(unique(refIters))~=1 + disp('*** ERROR: inconsistent admixture parameters.'); + return +end + +% now combine the results +prop_combine = prop{1}; +[ninds npops] = size(prop_combine); +[pvalue_combine,index] = min(pvalue,[],2); +for i = 1:ninds +prop_combine(i,:) = prop{index(i)}(i,:); +end + +% display the results +tulostaAdmixtureTiedot(prop_combine, pvalue_combine, minsize(1), iters(1)); + +viewPartition(prop_combine, popnames); + +% save the results +talle = questdlg(['Do you want to save the combined admixture results?'], ... + 'Save results?','Yes','No','Yes'); +if isequal(talle,'Yes') + %waitALittle; + [filename, pathname] = uiputfile('*.mat','Save results as'); + + if (filename == 0) & (pathname == 0) + % Cancel was pressed + return + else % copy 'baps4_output.baps' into the text file with the same name. + if exist('baps4_output.baps','file') + copyfile('baps4_output.baps',[pathname filename '.txt']) + delete('baps4_output.baps') + end + end + + struct_array = load(filesin{1}); + c = struct_array.c; + clear struct_array; + c.proportionsIt = prop_combine; + c.pvalue = pvalue_combine; % Added by Jing + + fprintf(1, 'Saving the results...'); +% save([pathname filename], 'c'); + save([pathname filename], 'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + fprintf(1,'finished.\'); + +end + + + + + + + + diff --git a/matlab/parallel/dispLine.m b/matlab/parallel/dispLine.m new file mode 100644 index 0000000..c7679e5 --- /dev/null +++ b/matlab/parallel/dispLine.m @@ -0,0 +1,3 @@ +%------------------------------------------------------ +function dispLine; +disp('---------------------------------------------------'); \ No newline at end of file diff --git a/matlab/parallel/greedyPopMix_parallel.m b/matlab/parallel/greedyPopMix_parallel.m new file mode 100644 index 0000000..5f8d8d2 --- /dev/null +++ b/matlab/parallel/greedyPopMix_parallel.m @@ -0,0 +1,1622 @@ +function greedyPopMix_parallel(options) + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; +clearGlobalVars; + +% LASKENNAN ALKUARVOJEN MÄÄRITTÄMINEN +outp = [options.outputMat '.txt']; +inp = options.dataFile; + +if isequal(options.dataType,'numeric') %Raakadata + data = load(options.dataFile); + ninds = testaaOnkoKunnollinenBapsData(data); %TESTAUS + if (ninds==0) + disp('*** ERROR: Incorrect Data-file.'); + return; + end + [data, rows, alleleCodes, noalle, adjprior, priorTerm] = handlePopData(data); + rowsFromInd = 0; %Ei tiedet? + + if ~isempty(options.groupname) + popnames = initPopNames(options.groupname); + if (size(popnames,1)~=ninds) + disp('*** ERROR: Incorrect name-file.'); + popnames = []; + end + else + popnames = []; + end + +elseif isequal(options.dataType,'genepop') + kunnossa = testaaGenePopData(options.dataFile); + if kunnossa==0 + return + end + + [data, popnames]=lueGenePopDataPop(options.dataType); + [data, rows, alleleCodes, noalle, adjprior, priorTerm] = handlePopData(data); + rowsFromInd = 2; %Tiedetään GenePop:in tapauksessa. + +end + +if ~isequal(options.dataType, 'matlab') + a_data = data(:,1:end-1); + + npops = size(rows,1); + PARTITION = 1:npops'; %Jokainen "yksil? eli populaatio on oma ryhmäns? + [sumcounts, counts, logml] = ... + initialPopCounts(a_data, npops, rows, noalle, adjprior); + COUNTS = counts; SUMCOUNTS = sumcounts; + POP_LOGML = computePopulationLogml(1:npops, adjprior, priorTerm); + + clear('counts', 'sumcounts','pathname','filename','vast2',... + 'vast3','vast4'); + [Z,dist] = getPopDistancesByKL(adjprior); %Saadaan COUNTS:in avulla. + +% save_preproc = questdlg('Do you wish to save pre-processed data?',... +% 'Save pre-processed data?',... +% 'Yes','No','Yes'); +% if isequal(save_preproc,'Yes'); +% waitALittle; +% [filename, pathname] = uiputfile('*.mat','Save pre-processed data as'); +% kokonimi = [pathname filename]; +% c.data = data; c.rows = rows; c.alleleCodes = alleleCodes; +% c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; +% c.dist = dist; c.Z = Z; c.popnames = popnames; c.rowsFromInd = rowsFromInd; +% c.npops = npops; c.logml = logml; +% save(kokonimi,'c'); +% clear c; +% end; +end + +if isequal(options.dataType, 'matlab') + struct_array = load(options.dataFile); + if isfield(struct_array,'c') %Matlab versio + c = struct_array.c; + if ~isfield(c,'rows') + disp('Incorrect file format'); + return + end + elseif isfield(struct_array,'rows') %Mideva versio + c = struct_array; + else + disp('*** ERROR: Incorrect file format'); + return; + end + data = double(c.data); rows = c.rows; alleleCodes = c.alleleCodes; + noalle = c.noalle; adjprior = c.adjprior; priorTerm = c.priorTerm; + dist = c.dist; Z = c.Z; popnames = c.popnames; rowsFromInd = c.rowsFromInd; + clear c; +end + +if strcmp(options.fixedK, 'yes') + fixedK = 1; +else + fixedK = 0; +end + +npopstext = []; +npopstextExtra = options.initialK; +if length(npopstextExtra)>=255 + npopstextExtra = npopstextExtra(1:255); + npopstext = [npopstext ' ' npopstextExtra]; + teksti = 'The input field length limit (255 characters) was reached. Input more values: '; +else + % ----------------------------------------------------- + % Set the limit of the input value. + % Modified by Jing Tang, 30.12.2005 + if max(npopstextExtra) > size(data,1) + error('Initial K larger than the sample size are not accepted. '); + else + npopstext = [npopstext ' ' num2str(npopstextExtra)]; + end +end + +clear teksti; +if isempty(npopstext) || length(npopstext)==1 + return +else + npopsTaulu = str2num(npopstext); + ykkoset = find(npopsTaulu==1); + npopsTaulu(ykkoset) = []; % Mikäli ykkösi?annettu ylärajaksi, ne poistetaan. + if isempty(npopsTaulu) + return + end + clear ykkoset; +end + + +c.data=data; c.rows = rows; c.alleleCodes = alleleCodes; +c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; +c.dist=dist; c.Z=Z; c.rowsFromInd = rowsFromInd; + +if fixedK + % Only the first value of npopsTaulu is used + npops = npopsTaulu(1); + nruns = length(npopsTaulu); + [logml, npops, partitionSummary]=indMix_fixK(c,npops,nruns,1); +else + [logml, npops, partitionSummary]=indMix(c,npopsTaulu,1); +end + +if logml==1 + return +end + +data = data(:,1:end-1); + +viewPopMixPartition(PARTITION, rows, popnames); +%npops = poistaTyhjatPopulaatiot(npops); +%POP_LOGML = computePopulationLogml(1:npops, adjprior, priorTerm); + +% h0 = findobj('Tag','filename1_text'); inp = get(h0,'String'); +% h0 = findobj('Tag','filename2_text'); +% outp = get(h0,'String'); +changesInLogml = writeMixtureInfoPop(logml, rows, data, adjprior, priorTerm, ... + outp,inp,partitionSummary, popnames, fixedK); + +if exist('baps4_output.baps','file') + copyfile('baps4_output.baps',outp) + delete('baps4_output.baps') +end + +if rowsFromInd==0 + %Käytettiin BAPS-formaattia, eik?rowsFromInd ole tunnettu. + [popnames, rowsFromInd] = findOutRowsFromInd(popnames, rows); +end + +groupPartition = PARTITION; + +fiksaaPartitioYksiloTasolle(rows, rowsFromInd); +% The logml is saved for parallel computing +c.logml = logml; +c.PARTITION = PARTITION; c.COUNTS = COUNTS; c.SUMCOUNTS = SUMCOUNTS; +c.alleleCodes = alleleCodes; c.adjprior = adjprior; +c.rowsFromInd = rowsFromInd; c.popnames = popnames; +c.data = data; c.npops = npops; c.noalle = noalle; +c.mixtureType = 'popMix'; c.groupPartition = groupPartition; +c.rows = rows; c.changesInLogml = changesInLogml; +fprintf(1,'Saving the result...') +try +% save(options.outputMat, 'c'); + save(options.outputMat, 'c', '-v7.3'); % added by Lu Cheng, 08.06.2012 + fprintf(1,'Finished.\n'); +catch + display('*** ERROR in saving the result.'); +end + + +% ------------------------------------------------------------------------- +% - Subfunctions +% ------------------------------------------------------------------------- + +function [newData, rows, alleleCodes, noalle, adjprior, priorTerm] = handlePopData(raw_data) +% Alkuperäisen datan viimeinen sarake kertoo, milt?yksilölt? +% kyseinen rivi on peräisin. Funktio muuttaa alleelikoodit +% siten, ett?yhden lokuksen j koodit saavat arvoja +% välill?1,...,noalle(j). Ennen tät?muutosta alleeli, jonka +% koodi on nolla muutetaan. + + +data = raw_data; +nloci=size(raw_data,2)-1; + +dataApu = data(:,1:nloci); +nollat = find(dataApu==0); +if ~isempty(nollat) + isoinAlleeli = max(max(dataApu)); + dataApu(nollat) = isoinAlleeli+1; + data(:,1:nloci) = dataApu; +end +dataApu = []; nollat = []; isoinAlleeli = []; + +noalle=zeros(1,nloci); +alleelitLokuksessa = cell(nloci,1); +for i=1:nloci + alleelitLokuksessaI = unique(data(:,i)); + alleelitLokuksessa{i,1} = alleelitLokuksessaI(find(alleelitLokuksessaI>=0)); + noalle(i) = length(alleelitLokuksessa{i,1}); +end +alleleCodes = zeros(max(noalle),nloci); +for i=1:nloci + alleelitLokuksessaI = alleelitLokuksessa{i,1}; + puuttuvia = max(noalle)-length(alleelitLokuksessaI); + alleleCodes(:,i) = [alleelitLokuksessaI; zeros(puuttuvia,1)]; +end + +for loc = 1:nloci + for all = 1:noalle(loc) + data(find(data(:,loc)==alleleCodes(all,loc)), loc)=all; + end; +end; + +nind = max(data(:,end)); +%rows = cell(nind,1); +rows = zeros(nind,2); +for i=1:nind + rivit = find(data(:,end)==i)'; + rows(i,1) = min(rivit); + rows(i,2) = max(rivit); +end +newData = data; + +adjprior = zeros(max(noalle),nloci); +priorTerm = 0; +for j=1:nloci + adjprior(:,j) = [repmat(1/noalle(j), [noalle(j),1]) ; ones(max(noalle)-noalle(j),1)]; + priorTerm = priorTerm + noalle(j)*gammaln(1/noalle(j)); +end + + +%---------------------------------------------------------------- + + +function clearGlobalVars + +global COUNTS; COUNTS = []; +global SUMCOUNTS; SUMCOUNTS = []; +global PARTITION; PARTITION = []; +global POP_LOGML; POP_LOGML = []; + + +%-------------------------------------------------------------------- + +function [Z,distances] = getPopDistancesByKL(adjprior) +% Laskee populaatioille etäisyydet +% käyttäen KL-divergenssi? +global COUNTS; +maxnoalle = size(COUNTS,1); +nloci = size(COUNTS,2); +npops = size(COUNTS,3); +distances = zeros(nchoosek(npops,2),1); + +d = zeros(maxnoalle, nloci, npops); +prior = adjprior; +prior(find(prior==1))=0; +nollia = find(all(prior==0)); %Lokukset, joissa oli havaittu vain yht?alleelia. +prior(1,nollia)=1; +for pop1 = 1:npops + d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))+prior) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))+prior),maxnoalle,1); + %dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1); +end +pointer = 1; +for pop1 = 1:npops-1 + for pop2 = pop1+1:npops + dist1 = d(:,:,pop1); dist2 = d(:,:,pop2); + div12 = sum(sum(dist1.*log2((dist1+10^-10) ./ (dist2+10^-10))))/nloci; + div21 = sum(sum(dist2.*log2((dist2+10^-10) ./ (dist1+10^-10))))/nloci; + div = (div12+div21)/2; + distances(pointer) = div; + pointer = pointer+1; + end +end +Z=linkage(distances'); + +%-------------------------------------------------------------------------- + + +function Z = linkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 | m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end + + +%----------------------------------------------------------------------- + + +function [sumcounts, counts, logml] = ... + initialPopCounts(data, npops, rows, noalle, adjprior) + +nloci=size(data,2); +counts = zeros(max(noalle),nloci,npops); +sumcounts = zeros(npops,nloci); + +for i=1:npops + for j=1:nloci + i_rivit = rows(i,1):rows(i,2); + havainnotLokuksessa = find(data(i_rivit,j)>=0); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(i_rivit,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + +logml = laskeLoggis(counts, sumcounts, adjprior); + + +%----------------------------------------------------------------------- + + +function loggis = laskeLoggis(counts, sumcounts, adjprior) +npops = size(counts,3); + +logml2 = sum(sum(sum(gammaln(counts+repmat(adjprior,[1 1 npops]))))) ... + - npops*sum(sum(gammaln(adjprior))) - ... + sum(sum(gammaln(1+sumcounts))); +loggis = logml2; + + +%-------------------------------------------------------------------- + + +function kunnossa = testaaGenePopData(tiedostonNimi) +% kunnossa == 0, jos data ei ole kelvollinen genePop data. +% Muussa tapauksessa kunnossa == 1. + +kunnossa = 0; +fid = fopen(tiedostonNimi); +line1 = fgetl(fid); %ensimmäinen rivi +line2 = fgetl(fid); %toinen rivi +line3 = fgetl(fid); %kolmas + +if (isequal(line1,-1) | isequal(line2,-1) | isequal(line3,-1)) + disp('Incorrect file format'); fclose(fid); + return +end +if (testaaPop(line1)==1 | testaaPop(line2)==1) + disp('Incorrect file format'); fclose(fid); + return +end +if testaaPop(line3)==1 + %2 rivi tällöin lokusrivi + nloci = rivinSisaltamienMjonojenLkm(line2); + line4 = fgetl(fid); + if isequal(line4,-1) + disp('Incorrect file format'); fclose(fid); + return + end + if ~any(line4==',') + % Rivin nelj?täytyy sisältää pilkku. + disp('Incorrect file format'); fclose(fid); + return + end + pointer = 1; + while ~isequal(line4(pointer),',') %Tiedetään, ett?pysähtyy + pointer = pointer+1; + end + line4 = line4(pointer+1:end); %pilkun jälkeinen osa + nloci2 = rivinSisaltamienMjonojenLkm(line4); + if (nloci2~=nloci) + disp('Incorrect file format'); fclose(fid); + return + end +else + line = fgetl(fid); + lineNumb = 4; + while (testaaPop(line)~=1 & ~isequal(line,-1)) + line = fgetl(fid); + lineNumb = lineNumb+1; + end + if isequal(line,-1) + disp('Incorrect file format'); fclose(fid); + return + end + nloci = lineNumb-2; + line4 = fgetl(fid); %Eka rivi pop sanan jälkeen + if isequal(line4,-1) + disp('Incorrect file format'); fclose(fid); + return + end + if ~any(line4==',') + % Rivin täytyy sisältää pilkku. + disp('Incorrect file format'); fclose(fid); + return + end + pointer = 1; + while ~isequal(line4(pointer),',') %Tiedetään, ett?pysähtyy. + pointer = pointer+1; + end + + line4 = line4(pointer+1:end); %pilkun jälkeinen osa + nloci2 = rivinSisaltamienMjonojenLkm(line4); + if (nloci2~=nloci) + disp('Incorrect file format'); fclose(fid); + return + end +end +kunnossa = 1; +fclose(fid); + +%-------------------------------------------------------------------- + + +function [data, popnames] = lueGenePopDataPop(tiedostonNimi) +% Data annetaan muodossa, jossa viimeinen sarake kertoo ryhmän. +% popnames on kuten ennenkin. + +fid = fopen(tiedostonNimi); +line = fgetl(fid); %ensimmäinen rivi +line = fgetl(fid); %toinen rivi +count = rivinSisaltamienMjonojenLkm(line); + +line = fgetl(fid); +lokusRiveja = 1; +while (testaaPop(line)==0) + lokusRiveja = lokusRiveja+1; + line = fgetl(fid); +end + +if lokusRiveja>1 + nloci = lokusRiveja; +else + nloci = count; +end + +popnames = cell(10,2); +data = zeros(100, nloci+1); +nimienLkm=0; +ninds=0; +poimiNimi=1; +digitFormat = -1; +while line ~= -1 + line = fgetl(fid); + + if poimiNimi==1 + %Edellinen rivi oli 'pop' + nimienLkm = nimienLkm+1; + ninds = ninds+1; + if nimienLkm>size(popnames,1); + popnames = [popnames; cell(10,2)]; + end + nimi = lueNimi(line); + if digitFormat == -1 + digitFormat = selvitaDigitFormat(line); + divider = 10^digitFormat; + end + popnames{nimienLkm, 1} = {nimi}; %Näin se on greedyMix:issäkin?!? + popnames{nimienLkm, 2} = ninds; + poimiNimi=0; + + data = addAlleles(data, ninds, line, divider); + + elseif testaaPop(line) + poimiNimi = 1; + + elseif line ~= -1 + ninds = ninds+1; + data = addAlleles(data, ninds, line, divider); + end +end + +fclose(fid); +data = data(1:ninds*2,:); +popnames = popnames(1:nimienLkm,:); +npops = size(popnames,1); +ind = 1; +for pop = 1:npops + if pop=0); + + if length(notEmpty)>0 + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) = ... + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) + 1; + end +end + +%------------------------------------------------------------------------ + + +%------------------------------------------------------------------------------------- + + +function updateGlobalVariables(ind, i2, diffInCounts, ... + adjprior, priorTerm) +% Suorittaa globaalien muuttujien muutokset, kun yksil?ind +% on siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; + +i1 = PARTITION(ind); +PARTITION(ind)=i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + + +%-------------------------------------------------------------------------- +%-- + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = laskeMuutokset2( ... + i1, globalRows, data, adjprior, priorTerm); +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli korin i1 kaikki yksilöt siirretään +% koriin i. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +npops = size(COUNTS,3); +muutokset = zeros(npops,1); + +i1_logml = POP_LOGML(i1); + +inds = find(PARTITION==i1); +ninds = length(inds); + +if ninds==0 + diffInCounts = zeros(size(COUNTS,1), size(COUNTS,2)); + return; +end + +rows = []; +for i = 1:ninds + ind = inds(i); + lisa = globalRows(ind,1):globalRows(ind,2); + rows = [rows; lisa']; + %rows = [rows; globalRows{ind}']; +end + +diffInCounts = computeDiffInCounts(rows', size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +new_i1_logml = computePopulationLogml(i1, adjprior, priorTerm); +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = POP_LOGML(i2); + +COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[npops-1 1]); +new_i2_logml = computePopulationLogml(i2, adjprior, priorTerm); +COUNTS(:,:,i2) = COUNTS(:,:,i2)-repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)-repmat(diffInSumCounts,[npops-1 1]); + +muutokset(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + + +%--------------------------------------------------------------------------------- + + +function updateGlobalVariables2( ... + i1, i2, diffInCounts, adjprior, priorTerm); +% Suorittaa globaalien muuttujien muutokset, kun kaikki +% korissa i1 olevat yksilöt siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; + +inds = find(PARTITION==i1); +PARTITION(inds) = i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML(i1) = 0; +POP_LOGML(i2) = computePopulationLogml(i2, adjprior, priorTerm); + + +%-------------------------------------------------------------------------- +%---- + +function muutokset = laskeMuutokset3(T2, inds2, globalRows, ... + data, adjprior, priorTerm, i1) +% Palauttaa length(unique(T2))*npops taulun, jossa (i,j):s alkio +% kertoo, mik?olisi muutos logml:ss? jos populaation i1 osapopulaatio +% inds2(find(T2==i)) siirretään koriin j. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +npops = size(COUNTS,3); +npops2 = length(unique(T2)); +muutokset = zeros(npops2, npops); + +i1_logml = POP_LOGML(i1); +for pop2 = 1:npops2 + inds = inds2(find(T2==pop2)); + ninds = length(inds); + if ninds>0 + rows = []; + for i = 1:ninds + ind = inds(i); + lisa = globalRows(ind,1):globalRows(ind,2); + rows = [rows; lisa']; + %rows = [rows; globalRows{ind}']; + end + diffInCounts = computeDiffInCounts(rows', size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; + new_i1_logml = computePopulationLogml(i1, adjprior, priorTerm); + COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + + i2 = [1:i1-1 , i1+1:npops]; + i2_logml = POP_LOGML(i2)'; + + COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 npops-1]); + SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[npops-1 1]); + new_i2_logml = computePopulationLogml(i2, adjprior, priorTerm)'; + COUNTS(:,:,i2) = COUNTS(:,:,i2)-repmat(diffInCounts, [1 1 npops-1]); + SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)-repmat(diffInSumCounts,[npops-1 1]); + + muutokset(pop2,i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + end +end + +%------------------------------------------------------------------------------------ + +function muutokset = laskeMuutokset5(inds, globalRows, data, adjprior, ... + priorTerm, i1, i2) + +% Palauttaa length(inds)*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli yksil?i vaihtaisi koria i1:n ja i2:n välill? + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; + +ninds = length(inds); +muutokset = zeros(ninds,1); + +i1_logml = POP_LOGML(i1); +i2_logml = POP_LOGML(i2); + +for i = 1:ninds + ind = inds(i); + if PARTITION(ind)==i1 + pop1 = i1; %mist? + pop2 = i2; %mihin + else + pop1 = i2; + pop2 = i1; + end + rows = globalRows(ind,1):globalRows(ind,2); + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)-diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)-diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)+diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)+diffInSumCounts; + + new_logmls = computePopulationLogml([i1 i2], adjprior, priorTerm); + muutokset(i) = sum(new_logmls); + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)+diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)+diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)-diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)-diffInSumCounts; +end + +muutokset = muutokset - i1_logml - i2_logml; + +%------------------------------------------------------------------------------------ + + +function updateGlobalVariables3(muuttuvat, diffInCounts, ... + adjprior, priorTerm, i2); +% Suorittaa globaalien muuttujien päivitykset, kun yksilöt 'muuttuvat' +% siirretään koriin i2. Ennen siirtoa yksilöiden on kuuluttava samaan +% koriin. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; + +i1 = PARTITION(muuttuvat(1)); +PARTITION(muuttuvat) = i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + + +%---------------------------------------------------------------------------- + + +function dist2 = laskeOsaDist(inds2, dist, ninds) +% Muodostaa dist vektorista osavektorin, joka sisältää yksilöiden inds2 +% väliset etäisyydet. ninds=kaikkien yksilöiden lukumäär? + +ninds2 = length(inds2); +apu = zeros(nchoosek(ninds2,2),2); +rivi = 1; +for i=1:ninds2-1 + for j=i+1:ninds2 + apu(rivi, 1) = inds2(i); + apu(rivi, 2) = inds2(j); + rivi = rivi+1; + end +end +apu = (apu(:,1)-1).*ninds - apu(:,1) ./ 2 .* (apu(:,1)-1) + (apu(:,2)-apu(:,1)); +dist2 = dist(apu); + + +%----------------------------------------------------------------------------------- + + +function npops = poistaTyhjatPopulaatiot(npops) +% Poistaa tyhjentyneet populaatiot COUNTS:ista ja +% SUMCOUNTS:ista. Päivittää npops:in ja PARTITION:in. + +global COUNTS; +global SUMCOUNTS; +global PARTITION; + +notEmpty = find(any(SUMCOUNTS,2)); +COUNTS = COUNTS(:,:,notEmpty); +SUMCOUNTS = SUMCOUNTS(notEmpty,:); + +for n=1:length(notEmpty) + apu = find(PARTITION==notEmpty(n)); + PARTITION(apu)=n; +end +npops = length(notEmpty); + + +%----------------------------------------------------------------------------------- + + +function popnames = initPopNames(nameFile) + +fid = fopen(nameFile); +if fid == -1 + %File didn't exist + msgbox('Loading of the population names was unsuccessful', ... + 'Error', 'error'); + return; +end; +line = fgetl(fid); +counter = 1; +while (line ~= -1) & ~isempty(line) + names{counter} = line; + line = fgetl(fid); + counter = counter + 1; +end; +fclose(fid); + +popnames = cell(length(names), 2); +for i = 1:length(names) + popnames{i,1} = names(i); + popnames{i,2} = 0; +end + + +%------------------------------------------------------------------------- + + +function [popnames2, rowsFromInd] = findOutRowsFromInd(popnames, rows) + +ploidisuus = questdlg('Specify the type of individuals in the data: ',... + 'Individual type?', 'Haploid', 'Diploid', 'Tetraploid', ... + 'Diploid'); + +switch ploidisuus +case 'Haploid' + rowsFromInd = 1; +case 'Diploid' + rowsFromInd = 2; +case 'Tetraploid' + rowsFromInd = 4; +end + +if ~isempty(popnames) + for i = 1:size(rows,1) + popnames2{i,1} = popnames{i,1}; + rivi = rows(i,1):rows(i,2); + popnames2{i,2} = (rivi(rowsFromInd))/rowsFromInd; + end +else + popnames2 = []; +end + +%------------------------------------------------------------------ + +function fiksaaPartitioYksiloTasolle(rows, rowsFromInd) + +global PARTITION; +totalRows = 0; +for ind = 1:size(rows,1) + totalRows = totalRows + (rows(ind,2)-rows(ind,1)+1); +end +partitio2 = zeros(totalRows/rowsFromInd,1); + +for ind = 1:size(rows,1) + kaikkiRivit = rows(ind,1):rows(ind,2); + for riviNumero = rowsFromInd:rowsFromInd:length(kaikkiRivit) + %for riviNumero = rowsFromInd:rowsFromInd:length(rows{ind}) + %rivi = rows{ind}(riviNumero); + rivi = kaikkiRivit(riviNumero); + partitio2(rivi/rowsFromInd) = PARTITION(ind); + end +end +PARTITION = partitio2; + +%--------------------------------------------------------------- + + +%-------------------------------------------------------------------- + + +function newline = takeLine(description,width) +%Returns one line from the description: line ends to the first +%space after width:th mark. +newLine = description(1:width); +n = width+1; +while ~isspace(description(n)) & n0 + fid = fopen(outPutFile,'a'); +else + fid = -1; + diary('baps4_output.baps'); % save in text anyway. +end + +dispLine; +disp('RESULTS OF GROUP LEVEL MIXTURE ANALYSIS:'); +disp(['Data file: ' inputFile]); +disp(['Number of clustered groups: ' ownNum2Str(ninds)]); +disp(['Number of clusters in optimal partition: ' ownNum2Str(npops)]); +disp(['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); +disp(' '); +if (fid ~= -1) + fprintf(fid,'%s \n', ['RESULTS OF GROUP LEVEL MIXTURE ANALYSIS:']); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Data file: ' inputFile]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of clustered groups: ' ownNum2Str(ninds)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of clusters in optimal partition: ' ownNum2Str(npops)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); fprintf(fid,'\n'); + fprintf(fid,'\n'); +end + +cluster_count = length(unique(PARTITION)); +disp(['Best Partition: ']); +if (fid ~= -1) + fprintf(fid,'%s \n',['Best Partition: ']); fprintf(fid,'\n'); +end +for m=1:cluster_count + indsInM = find(PARTITION==m); + length_of_beginning = 11 + floor(log10(m)); + cluster_size = length(indsInM); + + if names + text = ['Cluster ' num2str(m) ': {' char(popnames{indsInM(1)})]; + for k = 2:cluster_size + text = [text ', ' char(popnames{indsInM(k)})]; + end; + else + text = ['Cluster ' num2str(m) ': {' num2str(indsInM(1))]; + for k = 2:cluster_size + text = [text ', ' num2str(indsInM(k))]; + end; + end + text = [text '}']; + while length(text)>58 + %Take one line and display it. + new_line = takeLine(text,58); + text = text(length(new_line)+1:end); + disp(new_line); + if (fid ~= -1) + fprintf(fid,'%s \n',[new_line]); + fprintf(fid,'\n'); + end + if length(text)>0 + text = [blanks(length_of_beginning) text]; + else + text = []; + end; + end; + if ~isempty(text) + disp(text); + if (fid ~= -1) + fprintf(fid,'%s \n',[text]); + fprintf(fid,'\n'); + end + end; +end + +if npops > 1 + disp(' '); + disp(' '); + disp('Changes in log(marginal likelihood) if group i is moved to cluster j:'); + if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['Changes in log(marginal likelihood) if group i is moved to cluster j:']); %fprintf(fid, '\n'); + end + + if names + nameSizes = zeros(ninds,1); + for i = 1:ninds + nimi = char(popnames{i}); + nameSizes(i) = length(nimi); + end + maxSize = max(nameSizes); + maxSize = max(maxSize, 5); + erotus = maxSize - 5; + alku = blanks(erotus); + ekarivi = [alku 'group' blanks(6+erotus)]; + else + ekarivi = 'group '; + end + for i = 1:cluster_count + ekarivi = [ekarivi ownNum2Str(i) blanks(8-floor(log10(i)))]; + end + disp(ekarivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [ekarivi]); %fprintf(fid, '\n'); + end + + changesInLogml = LOGDIFF'; + for ind = 1:ninds + %[muutokset, diffInCounts] = laskeMuutokset(ind, rows, data, ... + % adjprior, priorTerm); + muutokset = changesInLogml(:,ind); + if names + nimi = char(popnames{ind}); + rivi = [blanks(maxSize - length(nimi)) nimi ':']; + else + rivi = [blanks(4-floor(log10(ind))) ownNum2Str(ind) ':']; + end + for j = 1:npops + rivi = [rivi ' ' logml2String(omaRound(muutokset(j)))]; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [rivi]); fprintf(fid, '\n'); + end + end + + disp(' '); disp(' '); + disp('KL-divergence matrix in PHYLIP format:'); + dist_mat = zeros(npops, npops); + if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['KL-divergence matrix in PHYLIP format:']); %fprintf(fid, '\n'); + end + + maxnoalle = size(COUNTS,1); + nloci = size(COUNTS,2); + d = zeros(maxnoalle, nloci, npops); + prior = adjprior; + prior(find(prior==1))=0; + nollia = find(all(prior==0)); %Lokukset, joissa oli havaittu vain yht?alleelia. + prior(1,nollia)=1; + for pop1 = 1:npops + d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))+prior) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))+prior),maxnoalle,1); + %dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1); + end +% ekarivi = blanks(7); +% for pop = 1:npops +% ekarivi = [ekarivi num2str(pop) blanks(7-floor(log10(pop)))]; +% end + ekarivi = num2str(npops); + disp(ekarivi); + + if (fid ~= -1) + fprintf(fid, '%s \n', [ekarivi]); %fprintf(fid, '\n'); + end + + for pop1 = 1:npops + rivi = [blanks(2-floor(log10(pop1))) num2str(pop1) ' ']; + for pop2 = 1:pop1-1 + dist1 = d(:,:,pop1); dist2 = d(:,:,pop2); + div12 = sum(sum(dist1.*log2((dist1+10^-10) ./ (dist2+10^-10))))/nloci; + div21 = sum(sum(dist2.*log2((dist2+10^-10) ./ (dist1+10^-10))))/nloci; + div = (div12+div21)/2; + % rivi = [rivi kldiv2str(div) ' ']; + dist_mat(pop1,pop2) = div; + end +% disp(rivi); +% if (fid ~= -1) +% fprintf(fid, '%s \n', [rivi]); fprintf(fid, '\n'); +% end + end + +else + changesInLogml = []; +end + +dist_mat = dist_mat + dist_mat'; % make it symmetric +for pop1 = 1:npops + rivi = ['Cluster_' num2str(pop1) ' ']; + for pop2 = 1:npops + rivi = [rivi kldiv2str(dist_mat(pop1,pop2)) ' ']; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [rivi]); %fprintf(fid, '\n'); + end +end + +disp(' '); +disp(' '); +disp('List of sizes of 10 best visited partitions and corresponding log(ml) values'); + +if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['List of sizes of 10 best visited partitions and corresponding log(ml) values']); fprintf(fid, '\n'); +end + +partitionSummary = sortrows(partitionSummary,2); +partitionSummary = partitionSummary(size(partitionSummary,1):-1:1 , :); +partitionSummary = partitionSummary(find(partitionSummary(:,2)>-1e49),:); +if size(partitionSummary,1)>10 + vikaPartitio = 10; +else + vikaPartitio = size(partitionSummary,1); +end +for part = 1:vikaPartitio + line = [num2str(partitionSummary(part,1)) ' ' num2str(partitionSummary(part,2))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', [line]); fprintf(fid, '\n'); + end +end + +if ~fixedK + + disp(' '); + disp(' '); + disp('Probabilities for number of clusters'); + + if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['Probabilities for number of clusters']); fprintf(fid, '\n'); + end + + npopsTaulu = unique(partitionSummary(:,1)); + len = length(npopsTaulu); + probs = zeros(len,1); + partitionSummary(:,2) = partitionSummary(:,2)-max(partitionSummary(:,2)); + sumtn = sum(exp(partitionSummary(:,2))); + for i=1:len + npopstn = sum(exp(partitionSummary(find(partitionSummary(:,1)==npopsTaulu(i)),2))); + probs(i) = npopstn / sumtn; + end + for i=1:len + if probs(i)>1e-5 + line = [num2str(npopsTaulu(i)) ' ' num2str(probs(i))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', [line]); fprintf(fid, '\n'); + end + end + end +end + +if (fid ~= -1) + fclose(fid); +else + diary off +end + + +%--------------------------------------------------------------- + + +function dispLine; +disp('---------------------------------------------------'); + +%-------------------------------------------------------------- + +function num2 = omaRound(num) +% Pyöristää luvun num 1 desimaalin tarkkuuteen +num = num*10; +num = round(num); +num2 = num/10; + +%--------------------------------------------------------- + +function digit = palautaYks(num,yks) +% palauttaa luvun num 10^yks termin kertoimen +% string:in? +% yks täytyy olla kokonaisluku, joka on +% vähintään -1:n suuruinen. Pienemmill? +% luvuilla tapahtuu jokin pyöristysvirhe. + +if yks>=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); + + +function mjono = kldiv2str(div) +mjono = ' '; +if abs(div)<100 + %Ei tarvita e-muotoa + mjono(6) = num2str(rem(floor(div*1000),10)); + mjono(5) = num2str(rem(floor(div*100),10)); + mjono(4) = num2str(rem(floor(div*10),10)); + mjono(3) = '.'; + mjono(2) = num2str(rem(floor(div),10)); + arvo = rem(floor(div/10),10); + if arvo>0 + mjono(1) = num2str(arvo); + end + +else + suurinYks = floor(log10(div)); + mjono(6) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(div),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(div),suurinYks); +end + + +%----------------------------------------------- + + +function ninds = testaaOnkoKunnollinenBapsData(data) +%Tarkastaa onko viimeisess?sarakkeessa kaikki +%luvut 1,2,...,n johonkin n:ään asti. +%Tarkastaa lisäksi, ett?on vähintään 2 saraketta. +if size(data,1)<2 + ninds = 0; return; +end +lastCol = data(:,end); +ninds = max(lastCol); +if ~isequal((1:ninds)',unique(lastCol)) + ninds = 0; return; +end + +%-------------------------------------------------------------------------- +%Seuraavat kolme funktiota liittyvat alkupartition muodostamiseen. + +function initial_partition=admixture_initialization(data_matrix,nclusters, Z) + +size_data=size(data_matrix); +nloci=size_data(2)-1; +n=max(data_matrix(:,end)); +T=cluster_own(Z,nclusters); +initial_partition=zeros(size_data(1),1); +for i=1:n + kori=T(i); + here=find(data_matrix(:,end)==i); + for j=1:length(here) + initial_partition(here(j),1)=kori; + end +end + +function T = cluster_own(Z,nclust) +true=logical(1); +false=logical(0); +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); + % maximum number of clusters based on inconsistency + if m <= maxclust + T = (1:m)'; + elseif maxclust==1 + T = ones(m,1); + else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end + end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + +%-------------------------------------------------------------------------- + +function [sumcounts, counts, logml] = ... + initialCounts(partition, data, npops, rows, noalle, adjprior) + +nloci=size(data,2); +ninds = size(rows, 1); + +%koot = rows(:,1) - rows(:,2) + 1; +%maxSize = max(koot); + +counts = zeros(max(noalle),nloci,npops); +sumcounts = zeros(npops,nloci); +for i=1:npops + for j=1:nloci + havainnotLokuksessa = find(partition==i & data(:,j)>=0); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(havainnotLokuksessa,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + +%initializeGammaln(ninds, maxSize, max(noalle)); + +logml = laskeLoggis(counts, sumcounts, adjprior); + +%-------------------------------------------------------------------------- + + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex) +% Tiedetään, ett?annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ss?ei viel?ole +% annettua logml arvoa, niin lisätään worstIndex:in kohtaan uusi logml ja +% nykyist?partitiota vastaava nclusters:in arvo. Muutoin ei tehd?mitään. + +apu = find(abs(partitionSummary(:,2)-logml)<1e-5); +if isempty(apu) + % Nyt löydetty partitio ei ole viel?kirjattuna summaryyn. + global PARTITION; + npops = length(unique(PARTITION)); + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end + +%-------------------------------------------------------------------------- + +function inds = returnInOrder(inds, pop, globalRows, data, ... + adjprior, priorTerm) +% Palauttaa yksilöt järjestyksess?siten, ett?ensimmäisen?on +% se, jonka poistaminen populaatiosta pop nostaisi logml:n +% arvoa eniten. + +global COUNTS; global SUMCOUNTS; +ninds = length(inds); +apuTaulu = [inds, zeros(ninds,1)]; + +for i=1:ninds + ind =inds(i); + rows = globalRows(i,1):globalRows(i,2); + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,pop) = COUNTS(:,:,pop)-diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)-diffInSumCounts; + apuTaulu(i, 2) = computePopulationLogml(pop, adjprior, priorTerm); + COUNTS(:,:,pop) = COUNTS(:,:,pop)+diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)+diffInSumCounts; +end +apuTaulu = sortrows(apuTaulu,2); +inds = apuTaulu(ninds:-1:1,1); + +%-------------------------------------------------------------------------- + +function [emptyPop, pops] = findEmptyPop(npops) +% Palauttaa ensimmäisen tyhjän populaation indeksin. Jos tyhji? +% populaatioita ei ole, palauttaa -1:n. + +global PARTITION; +pops = unique(PARTITION)'; +if (length(pops) ==npops) + emptyPop = -1; +else + popDiff = diff([0 pops npops+1]); + emptyPop = min(find(popDiff > 1)); +end diff --git a/matlab/parallel/independent_parallel.m b/matlab/parallel/independent_parallel.m new file mode 100644 index 0000000..21fc9f7 --- /dev/null +++ b/matlab/parallel/independent_parallel.m @@ -0,0 +1,1674 @@ +function independent_parallel(options) +% INDEPENDENT_PARALLEL is the command line version of the baps partition with +% independent models. +% Input: options is a struct generated by parallel.m + +%-------------------------------------------------------------------------- +%- Syntax check out +%-------------------------------------------------------------------------- +outp = [options.outputMat '.txt']; +inp = options.dataFile; + +if strcmp(options.fixedK, 'yes') + fixedK = 1; +else + fixedK = 0; +end + +%-------------------------------------------------------------------------- +%- Get data file location +%-------------------------------------------------------------------------- +switch options.dataType + case 'numeric' + %------------------------------------------------------------------ + %- Get name and index file location + %------------------------------------------------------------------ + try + data = load(options.dataFile); + catch + disp('*** ERROR: Incorrect numerical text data.'); + return + end + ninds = testaaOnkoKunnollinenBapsData(data); %TESTAUS + if (ninds==0) + disp('*** ERROR: Incorrect numerical text data.'); + return; + end + + if ~isempty(options.nameFile) && ~isempty(options.indexFile) + popnames = initPopNames(options.nameFile{1}, options.indexFile{1}); + else + popnames = []; + end + + [data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(data); + [Z,dist] = newGetDistances(data,rowsFromInd); + + case 'genepop' + kunnossa = testaaGenePopData(options.dataFile); + if kunnossa == 0 + return + end + [data,popnames] = lueGenePopData(options.dataFile); + [data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(data); + [Z,dist] = newGetDistances(data,rowsFromInd); + case 'matlab' + struct_array = load(options.dataFile); + if isfield(struct_array,'c') %Matlab versio + c = struct_array.c; + if ~isfield(c,'dist') + disp('*** ERROR: Incorrect matlab format'); + return + end + elseif isfield(struct_array,'dist') %Mideva versio + c = struct_array; + else + disp('*** ERROR: Incorrect matlab format'); + return; + end + data = double(c.data); rowsFromInd = c.rowsFromInd; alleleCodes = c.alleleCodes; + noalle = c.noalle; adjprior = c.adjprior; priorTerm = c.priorTerm; + dist = c.dist; popnames = c.popnames; Z = c.Z; + clear c; + otherwise + error('*** ERROR: data type is not specified or unknown.'); +end + +% --------------------------------------------------------- +% - Stochastic search algorithm +% --------------------------------------------------------- +global PARTITION; global COUNTS; +global SUMCOUNTS; global POP_LOGML; +clearGlobalVars; + +c.data=data; +c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; +c.dist=dist; c.Z=Z; c.rowsFromInd = rowsFromInd; + +ninds = length(unique(data(:,end))); +ekat = (1:rowsFromInd:ninds*rowsFromInd)'; +c.rows = [ekat ekat+rowsFromInd-1]; + +npopstext = []; +npopstextExtra = options.initialK; +if length(npopstextExtra)>=255 + npopstextExtra = npopstextExtra(1:255); + npopstext = [npopstext ' ' npopstextExtra]; + teksti = 'The input field length limit (255 characters) was reached. Input more values: '; +else + % ----------------------------------------------------- + % Set the limit of the input value. + % Modified by Jing Tang, 30.12.2005 + if max(npopstextExtra) > size(data,1) + error('Initial K larger than the sample size are not accepted. '); + else + npopstext = [npopstext ' ' num2str(npopstextExtra)]; + end +end + +clear teksti; +if isempty(npopstext) || length(npopstext)==1 + return +else + npopsTaulu = str2num(npopstext); + ykkoset = find(npopsTaulu==1); + npopsTaulu(ykkoset) = []; % Mikäli ykkösi?annettu ylärajaksi, ne poistetaan. + if isempty(npopsTaulu) + return + end + clear ykkoset; +end + +if fixedK + % Only the first value of npopsTaulu is used + npops = npopsTaulu(1); + nruns = length(npopsTaulu); + [logml, npops, partitionSummary]=indMix_fixK(c,npops,nruns,1); +else + [logml, npops, partitionSummary]=indMix(c,npopsTaulu,1); +end + +if logml==1 + return +end + +data = noIndex(data,noalle); + +changesInLogml = writeMixtureInfo(logml, rowsFromInd, data, adjprior, priorTerm, ... + outp,inp,partitionSummary, popnames, fixedK); + +viewMixPartition(PARTITION, popnames); + +if exist('baps4_output.baps','file') + copyfile('baps4_output.baps',outp) + delete('baps4_output.baps') +end + +% The logml is saved for parallel computing +c.logml = logml; +c.changesInLogml = changesInLogml; % this variable stores the change of likelihoods. + +c.PARTITION = PARTITION; c.COUNTS = COUNTS; c.SUMCOUNTS = SUMCOUNTS; +c.alleleCodes = alleleCodes; c.adjprior = adjprior; c.popnames = popnames; +c.rowsFromInd = rowsFromInd; c.data = data; c.npops = npops; +c.noalle = noalle; c.mixtureType = 'mix'; +fprintf(1,'Saving the result...') +try +% save(options.outputMat, 'c'); + save(options.outputMat, 'c', '-v7.3'); % added by Lu Cheng, 08.06.2012 + fprintf(1,'Finished.\n'); +catch + display('*** ERROR in saving the result.'); +end + + +% --------------------------------------------------------- +% - Subfunctions +% --------------------------------------------------------- + +%------------------------------------------------------------------------------------- + +function clearGlobalVars + +global COUNTS; COUNTS = []; +global SUMCOUNTS; SUMCOUNTS = []; +global PARTITION; PARTITION = []; +global POP_LOGML; POP_LOGML = []; + + +%------------------------------------------------------------------------------------- + + +function rows = computeRows(rowsFromInd, inds, ninds) +% On annettu yksilöt inds. Funktio palauttaa vektorin, joka +% sisältää niiden rivien numerot, jotka sisältävät yksilöiden +% dataa. + +rows = inds(:, ones(1,rowsFromInd)); +rows = rows*rowsFromInd; +miinus = repmat(rowsFromInd-1 : -1 : 0, [ninds 1]); +rows = rows - miinus; +rows = reshape(rows', [1,rowsFromInd*ninds]); + + +%-------------------------------------------------------------------------- + + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex) +% Tiedetään, ett?annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ss?ei viel?ole +% annettua logml arvoa, niin lisätään worstIndex:in kohtaan uusi logml ja +% nykyist?partitiota vastaava nclusters:in arvo. Muutoin ei tehd?mitään. +global PARTITION; +apu = find(abs(partitionSummary(:,2)-logml)<1e-5); +if isempty(apu) + % Nyt löydetty partitio ei ole viel?kirjattuna summaryyn. + npops = length(unique(PARTITION)); + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end + + +%-------------------------------------------------------------------------- + + +function [suurin, i2] = arvoSeuraavaTila(muutokset, logml) +% Suorittaa yksilön seuraavan tilan arvonnan + +y = logml + muutokset; % siirron jälkeiset logml:t +y = y - max(y); +y = exp(y); +summa = sum(y); +y = y/summa; +y = cumsum(y); + +i2 = rand_disc(y); % uusi kori +suurin = muutokset(i2); + + +%-------------------------------------------------------------------------------------- + + +function svar=rand_disc(CDF) +%returns an index of a value from a discrete distribution using inversion method +slump=rand; +har=find(CDF>slump); +svar=har(1); + + +%------------------------------------------------------------------------------------- + + +function updateGlobalVariables(ind, i2, rowsFromInd, diffInCounts, ... + adjprior, priorTerm) +% Suorittaa globaalien muuttujien muutokset, kun yksil?ind +% on siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; + +i1 = PARTITION(ind); +PARTITION(ind)=i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + + +%--------------------------------------------------------------------------------- + + +function updateGlobalVariables2( ... + i1, i2, rowsFromInd, diffInCounts, adjprior, priorTerm) +% Suorittaa globaalien muuttujien muutokset, kun kaikki +% korissa i1 olevat yksilöt siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; + +inds = find(PARTITION==i1); +PARTITION(inds) = i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML(i1) = 0; +POP_LOGML(i2) = computePopulationLogml(i2, adjprior, priorTerm); + + +%------------------------------------------------------------------------------------ + + +function updateGlobalVariables3(muuttuvat, rowsFromInd, diffInCounts, ... + adjprior, priorTerm, i2) +% Suorittaa globaalien muuttujien päivitykset, kun yksilöt 'muuttuvat' +% siirretään koriin i2. Ennen siirtoa yksilöiden on kuuluttava samaan +% koriin. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; + +i1 = PARTITION(muuttuvat(1)); +PARTITION(muuttuvat) = i2; + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + + +%---------------------------------------------------------------------- + + +function inds = returnInOrder(inds, pop, rowsFromInd, data, adjprior, priorTerm) +% Palauttaa yksilöt järjestyksess?siten, ett?ensimmäisen?on +% se, jonka poistaminen populaatiosta pop nostaisi logml:n +% arvoa eniten. + +global COUNTS; global SUMCOUNTS; +ninds = length(inds); +apuTaulu = [inds, zeros(ninds,1)]; + +for i=1:ninds + ind = inds(i); + rows = (ind-1)*rowsFromInd+1 : ind*rowsFromInd; + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,pop) = COUNTS(:,:,pop)-diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)-diffInSumCounts; + apuTaulu(i, 2) = computePopulationLogml(pop, adjprior, priorTerm); + COUNTS(:,:,pop) = COUNTS(:,:,pop)+diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)+diffInSumCounts; +end +apuTaulu = sortrows(apuTaulu,2); +inds = apuTaulu(ninds:-1:1,1); + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = ... + laskeMuutokset(ind, rowsFromInd, data, adjprior, priorTerm) +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli yksil?ind siirretään koriin i. +% diffInCounts on poistettava COUNTS:in siivusta i1 ja lisättäv? +% COUNTS:in siivuun i2, mikäli muutos toteutetaan. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +npops = size(COUNTS,3); +muutokset = zeros(npops,1); + +i1 = PARTITION(ind); +i1_logml = POP_LOGML(i1); + +rows = (ind-1)*rowsFromInd+1 : ind*rowsFromInd; +diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +new_i1_logml = computePopulationLogml(i1, adjprior, priorTerm); +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = POP_LOGML(i2); + +COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[npops-1 1]); +new_i2_logml = computePopulationLogml(i2, adjprior, priorTerm); +COUNTS(:,:,i2) = COUNTS(:,:,i2)-repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)-repmat(diffInSumCounts,[npops-1 1]); + +muutokset(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = laskeMuutokset2( ... + i1, rowsFromInd, data, adjprior, priorTerm) +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli korin i1 kaikki yksilöt siirretään +% koriin i. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +npops = size(COUNTS,3); +muutokset = zeros(npops,1); + +i1_logml = POP_LOGML(i1); + +inds = find(PARTITION==i1); +ninds = length(inds); + +if ninds==0 + diffInCounts = zeros(size(COUNTS,1), size(COUNTS,2)); + return; +end + +rows = computeRows(rowsFromInd, inds, ninds); + +diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +new_i1_logml = computePopulationLogml(i1, adjprior, priorTerm); +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = POP_LOGML(i2); + +COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[npops-1 1]); +new_i2_logml = computePopulationLogml(i2, adjprior, priorTerm); +COUNTS(:,:,i2) = COUNTS(:,:,i2)-repmat(diffInCounts, [1 1 npops-1]); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)-repmat(diffInSumCounts,[npops-1 1]); + +muutokset(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + + + +%------------------------------------------------------------------------------------ + + +function muutokset = laskeMuutokset3(T2, inds2, rowsFromInd, ... + data, adjprior, priorTerm, i1) +% Palauttaa length(unique(T2))*npops taulun, jossa (i,j):s alkio +% kertoo, mik?olisi muutos logml:ss? jos populaation i1 osapopulaatio +% inds2(find(T2==i)) siirretään koriin j. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +npops = size(COUNTS,3); +npops2 = length(unique(T2)); +muutokset = zeros(npops2, npops); + +i1_logml = POP_LOGML(i1); + +for pop2 = 1:npops2 + inds = inds2(find(T2==pop2)); + ninds = length(inds); + if ninds>0 + rows = computeRows(rowsFromInd, inds, ninds); + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; + new_i1_logml = computePopulationLogml(i1, adjprior, priorTerm); + COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + + i2 = [1:i1-1 , i1+1:npops]; + i2_logml = POP_LOGML(i2)'; + + COUNTS(:,:,i2) = COUNTS(:,:,i2)+repmat(diffInCounts, [1 1 npops-1]); + SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)+repmat(diffInSumCounts,[npops-1 1]); + new_i2_logml = computePopulationLogml(i2, adjprior, priorTerm)'; + COUNTS(:,:,i2) = COUNTS(:,:,i2)-repmat(diffInCounts, [1 1 npops-1]); + SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:)-repmat(diffInSumCounts,[npops-1 1]); + + muutokset(pop2,i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + end +end + + +%------------------------------------------------------------------------------------ + +function muutokset = laskeMuutokset5(inds, rowsFromInd, data, adjprior, ... + priorTerm, i1, i2) + +% Palauttaa length(inds)*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli yksil?i vaihtaisi koria i1:n ja i2:n välill? + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; + +ninds = length(inds); +muutokset = zeros(ninds,1); + +i1_logml = POP_LOGML(i1); +i2_logml = POP_LOGML(i2); + +for i = 1:ninds + ind = inds(i); + if PARTITION(ind)==i1 + pop1 = i1; %mist? + pop2 = i2; %mihin + else + pop1 = i2; + pop2 = i1; + end + rows = (ind-1)*rowsFromInd+1 : ind*rowsFromInd; + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)-diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)-diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)+diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)+diffInSumCounts; + + PARTITION(ind) = pop2; + + new_logmls = computePopulationLogml([i1 i2], adjprior, priorTerm); + + muutokset(i) = sum(new_logmls); + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)+diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)+diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)-diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)-diffInSumCounts; + + PARTITION(ind) = pop1; +end + +muutokset = muutokset - i1_logml - i2_logml; + +%-------------------------------------------------------------------------- + + + +function diffInCounts = computeDiffInCounts(rows, max_noalle, nloci, data) +% Muodostaa max_noalle*nloci taulukon, jossa on niiden alleelien +% lukumäärät (vastaavasti kuin COUNTS:issa), jotka ovat data:n +% riveill?rows. + +diffInCounts = zeros(max_noalle, nloci); +for i=rows + row = data(i,:); + notEmpty = find(row>=0); + + if length(notEmpty)>0 + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) = ... + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) + 1; + end +end + + + +%------------------------------------------------------------------------------------ + + +function popLogml = computePopulationLogml(pops, adjprior, priorTerm) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + +global COUNTS; +global SUMCOUNTS; +x = size(COUNTS,1); +y = size(COUNTS,2); +z = length(pops); + +popLogml = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior,[1 1 length(pops)]) + COUNTS(:,:,pops)) ... + ,[x y z]),1),2)) - sum(gammaln(1+SUMCOUNTS(pops,:)),2) - priorTerm; + + +%----------------------------------------------------------------------------------- + + +function npops = poistaTyhjatPopulaatiot(npops) +% Poistaa tyhjentyneet populaatiot COUNTS:ista ja +% SUMCOUNTS:ista. Päivittää npops:in ja PARTITION:in. + +global COUNTS; +global SUMCOUNTS; +global PARTITION; + +notEmpty = find(any(SUMCOUNTS,2)); +COUNTS = COUNTS(:,:,notEmpty); +SUMCOUNTS = SUMCOUNTS(notEmpty,:); + +for n=1:length(notEmpty) + apu = find(PARTITION==notEmpty(n)); + PARTITION(apu)=n; +end +npops = length(notEmpty); + + +%---------------------------------------------------------------------------------- +%Seuraavat kolme funktiota liittyvat alkupartition muodostamiseen. + +function initial_partition=admixture_initialization(data_matrix,nclusters,Z) +size_data=size(data_matrix); +nloci=size_data(2)-1; +n=max(data_matrix(:,end)); +T=cluster_own(Z,nclusters); +initial_partition=zeros(size_data(1),1); +for i=1:n + kori=T(i); + here=find(data_matrix(:,end)==i); + for j=1:length(here) + initial_partition(here(j),1)=kori; + end +end + +function T = cluster_own(Z,nclust) +true=logical(1); +false=logical(0); +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); + % maximum number of clusters based on inconsistency + if m <= maxclust + T = (1:m)'; + elseif maxclust==1 + T = ones(m,1); + else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end + end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + + +%--------------------------------------------------------------------------------------- + + +function [newData, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = ... + handleData(raw_data) +% Alkuperäisen datan viimeinen sarake kertoo, milt?yksilölt? +% kyseinen rivi on peräisin. Funktio tutkii ensin, ett?montako +% rivi?maksimissaan on peräisin yhdelt?yksilölt? jolloin saadaan +% tietää onko kyseess?haploidi, diploidi jne... Tämän jälkeen funktio +% lisää tyhji?rivej?niille yksilöille, joilta on peräisin vähemmän +% rivej?kuin maksimimäär? +% Mikäli jonkin alleelin koodi on =0, funktio muuttaa tämän alleelin +% koodi pienimmäksi koodiksi, joka isompi kuin mikään käytöss?oleva koodi. +% Tämän jälkeen funktio muuttaa alleelikoodit siten, ett?yhden lokuksen j +% koodit saavat arvoja välill?1,...,noalle(j). +data = raw_data; +nloci=size(raw_data,2)-1; + +dataApu = data(:,1:nloci); +nollat = find(dataApu==0); +if ~isempty(nollat) + isoinAlleeli = max(max(dataApu)); + dataApu(nollat) = isoinAlleeli+1; + data(:,1:nloci) = dataApu; +end +dataApu = []; nollat = []; isoinAlleeli = []; + +noalle=zeros(1,nloci); +alleelitLokuksessa = cell(nloci,1); +for i=1:nloci + alleelitLokuksessaI = unique(data(:,i)); + alleelitLokuksessa{i,1} = alleelitLokuksessaI(find(alleelitLokuksessaI>=0)); + noalle(i) = length(alleelitLokuksessa{i,1}); +end +alleleCodes = zeros(max(noalle),nloci); +for i=1:nloci + alleelitLokuksessaI = alleelitLokuksessa{i,1}; + puuttuvia = max(noalle)-length(alleelitLokuksessaI); + alleleCodes(:,i) = [alleelitLokuksessaI; zeros(puuttuvia,1)]; +end + +for loc = 1:nloci + for all = 1:noalle(loc) + data(find(data(:,loc)==alleleCodes(all,loc)), loc)=all; + end; +end; + +nind = max(data(:,end)); +nrows = size(data,1); +ncols = size(data,2); +rowsFromInd = zeros(nind,1); +for i=1:nind + rowsFromInd(i) = length(find(data(:,end)==i)); +end +maxRowsFromInd = max(rowsFromInd); +a = -999; +emptyRow = repmat(a, 1, ncols); +lessThanMax = find(rowsFromInd < maxRowsFromInd); +missingRows = maxRowsFromInd*nind - nrows; +data = [data; zeros(missingRows, ncols)]; +pointer = 1; +for ind=lessThanMax' %Käy läpi ne yksilöt, joilta puuttuu rivej? + miss = maxRowsFromInd-rowsFromInd(ind); % Tält?yksilölt?puuttuvien lkm. + for j=1:miss + rowToBeAdded = emptyRow; + rowToBeAdded(end) = ind; + data(nrows+pointer, :) = rowToBeAdded; + pointer = pointer+1; + end +end +data = sortrows(data, ncols); % Sorttaa yksilöiden mukaisesti +newData = data; +rowsFromInd = maxRowsFromInd; + +adjprior = zeros(max(noalle),nloci); +priorTerm = 0; +for j=1:nloci + adjprior(:,j) = [repmat(1/noalle(j), [noalle(j),1]) ; ones(max(noalle)-noalle(j),1)]; + priorTerm = priorTerm + noalle(j)*gammaln(1/noalle(j)); +end + + +%---------------------------------------------------------------------------------------- + + +function [Z, dist] = newGetDistances(data, rowsFromInd) + +ninds = max(data(:,end)); +nloci = size(data,2)-1; +riviLkm = nchoosek(ninds,2); + +empties = find(data<0); +data(empties)=0; +data = uint8(data); % max(noalle) oltava <256 + +pariTaulu = zeros(riviLkm,2); +aPointer=1; +for a=1:ninds-1 + pariTaulu(aPointer:aPointer+ninds-1-a,1) = ones(ninds-a,1)*a; + pariTaulu(aPointer:aPointer+ninds-1-a,2) = (a+1:ninds)'; + aPointer = aPointer+ninds-a; +end + +eka = pariTaulu(:,ones(1,rowsFromInd)); +eka = eka * rowsFromInd; +miinus = repmat(rowsFromInd-1 : -1 : 0, [riviLkm 1]); +eka = eka - miinus; + +toka = pariTaulu(:,ones(1,rowsFromInd)*2); +toka = toka * rowsFromInd; +toka = toka - miinus; + +%eka = uint16(eka); +%toka = uint16(toka); + +summa = zeros(riviLkm,1); +vertailuja = zeros(riviLkm,1); + +clear pariTaulu; clear miinus; + +x = zeros(size(eka)); x = uint8(x); +y = zeros(size(toka)); y = uint8(y); + +for j=1:nloci; + + for k=1:rowsFromInd + x(:,k) = data(eka(:,k),j); + y(:,k) = data(toka(:,k),j); + end + + for a=1:rowsFromInd + for b=1:rowsFromInd + vertailutNyt = double(x(:,a)>0 & y(:,b)>0); + vertailuja = vertailuja + vertailutNyt; + lisays = (x(:,a)~=y(:,b) & vertailutNyt); + summa = summa+double(lisays); + end + end +end + +clear x; clear y; clear vertailutNyt; +nollat = find(vertailuja==0); +dist = zeros(length(vertailuja),1); +dist(nollat) = 1; +muut = find(vertailuja>0); +dist(muut) = summa(muut)./vertailuja(muut); +clear summa; clear vertailuja; + +Z = linkage(dist'); + + +%---------------------------------------------------------------------------------------- + + +function [Z, distances]= getDistances(data_matrix,nclusters) + +%finds initial admixture clustering solution with nclusters clusters, uses simple mean Hamming distance +%gives partition in 8-bit format +%allocates all alleles of a single individual into the same basket +%data_matrix contains #Loci+1 columns, last column indicate whose alleles are placed in each row, +%i.e. ranges from 1 to #individuals. For diploids there are 2 rows per individual, for haploids only a single row +%missing values are indicated by zeros in the partition and by negative integers in the data_matrix. + +size_data=size(data_matrix); +nloci=size_data(2)-1; +n=max(data_matrix(:,end)); +distances=zeros(nchoosek(n,2),1); +pointer=1; +for i=1:n-1 + i_data=data_matrix(find(data_matrix(:,end)==i),1:nloci); + for j=i+1:n + d_ij=0; + j_data=data_matrix(find(data_matrix(:,end)==j),1:nloci); + vertailuja = 0; + for k=1:size(i_data,1) + for l=1:size(j_data,1) + here_i=find(i_data(k,:)>=0); + here_j=find(j_data(l,:)>=0); + here_joint=intersect(here_i,here_j); + vertailuja = vertailuja + length(here_joint); + d_ij = d_ij + length(find(i_data(k,here_joint)~=j_data(l,here_joint))); + end + end + d_ij = d_ij / vertailuja; + distances(pointer)=d_ij; + pointer=pointer+1; + end +end + +Z=linkage(distances'); + + + +%---------------------------------------------------------------------------------------- + + +function Z = linkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 || m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end + + +%----------------------------------------------------------------------------------- + + +function popnames = initPopNames(nameFile, indexFile) +%Palauttaa tyhjän, mikäli nimitiedosto ja indeksitiedosto +% eivät olleet yht?pitki? + +popnames = []; +indices = load(indexFile); + +fid = fopen(nameFile); +if fid == -1 + %File didn't exist + msgbox('Loading of the population names was unsuccessful', ... + 'Error', 'error'); + return; +end; +line = fgetl(fid); +counter = 1; +while (line ~= -1) && ~isempty(line) + names{counter} = line; + line = fgetl(fid); + counter = counter + 1; +end; +fclose(fid); + +if length(names) ~= length(indices) + disp('The number of population names must be equal to the number of '); + disp('entries in the file specifying indices of the first individuals of '); + disp('each population.'); + return; +end + +popnames = cell(length(names), 2); +for i = 1:length(names) + popnames{i,1} = names(i); + popnames{i,2} = indices(i); +end + + +%----------------------------------------------------------------------------------- + + +function [sumcounts, counts, logml] = ... + initialCounts(partition, data, npops, rowsFromInd, noalle) + +nloci=size(data,2); +ninds = size(data,1)/rowsFromInd; + +counts = zeros(max(noalle),nloci,npops); +sumcounts = zeros(npops,nloci); +for i=1:npops + for j=1:nloci + havainnotLokuksessa = find(partition==i & data(:,j)>=0); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(havainnotLokuksessa,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + +initializeGammaln(ninds, rowsFromInd, max(noalle)); + +logml = computeLogml(counts, sumcounts, noalle, data, rowsFromInd); + + +%----------------------------------------------------------------------- + + +function logml=computeLogml(counts, sumcounts, noalle, data, rowsFromInd) +nloci = size(counts,2); +npops = size(counts,3); +adjnoalle = zeros(max(noalle),nloci); +for j=1:nloci + adjnoalle(1:noalle(j),j)=noalle(j); + if noalle(j)0 + fid = fopen(outPutFile,'w'); +else + fid = -1; + diary('baps4_output.baps'); % save in text anyway. +end + +dispLine; +disp('RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:'); +disp(['Data file: ' inputFile]); +disp(['Model: independent']); +disp(['Number of clustered individuals: ' ownNum2Str(ninds)]); +disp(['Number of groups in optimal partition: ' ownNum2Str(npops)]); +disp(['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); +disp(' '); +if (fid ~= -1) + fprintf(fid,'%s \n', ['RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:']); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Data file: ' inputFile]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of clustered individuals: ' ownNum2Str(ninds)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of groups in optimal partition: ' ownNum2Str(npops)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); fprintf(fid,'\n'); +end + +cluster_count = length(unique(PARTITION)); +disp(['Best Partition: ']); +if (fid ~= -1) + fprintf(fid,'%s \n',['Best Partition: ']); fprintf(fid,'\n'); +end +for m=1:cluster_count + indsInM = find(PARTITION==m); + length_of_beginning = 11 + floor(log10(m)); + cluster_size = length(indsInM); + + if names + text = ['Cluster ' num2str(m) ': {' char(popnames{indsInM(1)})]; + for k = 2:cluster_size + text = [text ', ' char(popnames{indsInM(k)})]; + end; + else + text = ['Cluster ' num2str(m) ': {' num2str(indsInM(1))]; + for k = 2:cluster_size + text = [text ', ' num2str(indsInM(k))]; + end; + end + text = [text '}']; + while length(text)>58 + %Take one line and display it. + new_line = takeLine(text,58); + text = text(length(new_line)+1:end); + disp(new_line); + if (fid ~= -1) + fprintf(fid,'%s \n',[new_line]); + fprintf(fid,'\n'); + end + if length(text)>0 + text = [blanks(length_of_beginning) text]; + else + text = []; + end; + end; + if ~isempty(text) + disp(text); + if (fid ~= -1) + fprintf(fid,'%s \n',[text]); + fprintf(fid,'\n'); + end + end; +end + +if npops > 1 + disp(' '); + disp(' '); + disp('Changes in log(marginal likelihood) if indvidual i is moved to group j:'); + if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['Changes in log(marginal likelihood) if indvidual i is moved to group j:']); fprintf(fid, '\n'); + end + + if names + nameSizes = zeros(ninds,1); + for i = 1:ninds + nimi = char(popnames{i}); + nameSizes(i) = length(nimi); + end + maxSize = max(nameSizes); + maxSize = max(maxSize, 5); + erotus = maxSize - 5; + alku = blanks(erotus); + ekarivi = [alku ' ind' blanks(6+erotus)]; + else + ekarivi = ' ind '; + end + for i = 1:cluster_count + ekarivi = [ekarivi ownNum2Str(i) blanks(8-floor(log10(i)))]; + end + disp(ekarivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [ekarivi]); fprintf(fid, '\n'); + end + + %ninds = size(data,1)/rowsFromInd; + changesInLogml = LOGDIFF'; + for ind = 1:ninds + %[muutokset, diffInCounts] = laskeMuutokset(ind, rowsFromInd, data, ... + % adjprior, priorTerm); + %changesInLogml(:,ind) = muutokset; + muutokset = changesInLogml(:,ind); + if names + nimi = char(popnames{ind}); + rivi = [blanks(maxSize - length(nimi)) nimi ':']; + else + rivi = [blanks(4-floor(log10(ind))) ownNum2Str(ind) ':']; + end + for j = 1:npops + rivi = [rivi ' ' logml2String(omaRound(muutokset(j)))]; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [rivi]); fprintf(fid, '\n'); + end + end + + disp(' '); disp(' '); + disp('KL-divergence matrix in PHYLIP format:'); + + dist_mat = zeros(npops, npops); + if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['KL-divergence matrix in PHYLIP format:']); %fprintf(fid, '\n'); + end + + maxnoalle = size(COUNTS,1); + nloci = size(COUNTS,2); + d = zeros(maxnoalle, nloci, npops); + prior = adjprior; + prior(find(prior==1))=0; + nollia = find(all(prior==0)); %Lokukset, joissa oli havaittu vain yht?alleelia. + prior(1,nollia)=1; + for pop1 = 1:npops + d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))+prior) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))+prior),maxnoalle,1); + %dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1); + end +% ekarivi = blanks(7); +% for pop = 1:npops +% ekarivi = [ekarivi num2str(pop) blanks(7-floor(log10(pop)))]; +% end + ekarivi = num2str(npops); + disp(ekarivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [ekarivi]); %fprintf(fid, '\n'); + end + + for pop1 = 1:npops + % rivi = [blanks(2-floor(log10(pop1))) num2str(pop1) ' ']; + for pop2 = 1:pop1-1 + dist1 = d(:,:,pop1); dist2 = d(:,:,pop2); + div12 = sum(sum(dist1.*log2((dist1+10^-10) ./ (dist2+10^-10))))/nloci; + div21 = sum(sum(dist2.*log2((dist2+10^-10) ./ (dist1+10^-10))))/nloci; + div = (div12+div21)/2; + % rivi = [rivi kldiv2str(div) ' ']; + dist_mat(pop1,pop2) = div; + end + % disp(rivi); + % if (fid ~= -1) + % fprintf(fid, '%s \n', [rivi]); fprintf(fid, '\n'); + % end + end + +end +dist_mat = dist_mat + dist_mat'; % make it symmetric +for pop1 = 1:npops + rivi = ['Cluster_' num2str(pop1) ' ']; + for pop2 = 1:npops + rivi = [rivi kldiv2str(dist_mat(pop1,pop2)) ' ']; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [rivi]); %fprintf(fid, '\n'); + end +end + +disp(' '); +disp(' '); +disp('List of sizes of 10 best visited partitions and corresponding log(ml) values'); + +if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['List of sizes of 10 best visited partitions and corresponding log(ml) values']); fprintf(fid, '\n'); +end + +partitionSummary = sortrows(partitionSummary,2); +partitionSummary = partitionSummary(size(partitionSummary,1):-1:1 , :); +partitionSummary = partitionSummary(find(partitionSummary(:,2)>-1e49),:); +if size(partitionSummary,1)>10 + vikaPartitio = 10; +else + vikaPartitio = size(partitionSummary,1); +end +for part = 1:vikaPartitio + line = [num2str(partitionSummary(part,1)) ' ' num2str(partitionSummary(part,2))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', [line]); fprintf(fid, '\n'); + end +end + +if ~fixedK + disp(' '); + disp(' '); + disp('Probabilities for number of clusters'); + + if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['Probabilities for number of clusters']); fprintf(fid, '\n'); + end + + npopsTaulu = unique(partitionSummary(:,1)); + len = length(npopsTaulu); + probs = zeros(len,1); + partitionSummary(:,2) = partitionSummary(:,2)-max(partitionSummary(:,2)); + sumtn = sum(exp(partitionSummary(:,2))); + for i=1:len + npopstn = sum(exp(partitionSummary(find(partitionSummary(:,1)==npopsTaulu(i)),2))); + probs(i) = npopstn / sumtn; + end + for i=1:len + if probs(i)>1e-5 + line = [num2str(npopsTaulu(i)) ' ' num2str(probs(i))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', [line]); fprintf(fid, '\n'); + end + end + end +end + +if (fid ~= -1) + fclose(fid); +else + diary off +end + + +%--------------------------------------------------------------- + + +function dispLine +disp('---------------------------------------------------'); + +%-------------------------------------------------------------- + +function num2 = omaRound(num) +% Pyöristää luvun num 1 desimaalin tarkkuuteen +num = num*10; +num = round(num); +num2 = num/10; + +%--------------------------------------------------------- + +function digit = palautaYks(num,yks) +% palauttaa luvun num 10^yks termin kertoimen +% string:in? +% yks täytyy olla kokonaisluku, joka on +% vähintään -1:n suuruinen. Pienemmill? +% luvuilla tapahtuu jokin pyöristysvirhe. + +if yks>=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); + + +function mjono = kldiv2str(div) +mjono = ' '; +if abs(div)<100 + %Ei tarvita e-muotoa + mjono(6) = num2str(rem(floor(div*1000),10)); + mjono(5) = num2str(rem(floor(div*100),10)); + mjono(4) = num2str(rem(floor(div*10),10)); + mjono(3) = '.'; + mjono(2) = num2str(rem(floor(div),10)); + arvo = rem(floor(div/10),10); + if arvo>0 + mjono(1) = num2str(arvo); + end + +else + suurinYks = floor(log10(div)); + mjono(6) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(div),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(div),suurinYks); +end + +%-------------------------------------------------------------------- + + +function newline = takeLine(description,width) +%Returns one line from the description: line ends to the first +%space after width:th mark. +newLine = description(1:width); +n = width+1; +while ~isspace(description(n)) && n 1)); +end + +%-------------------------------------------------------------------- + + +function kunnossa = testaaGenePopData(tiedostonNimi) +% kunnossa == 0, jos data ei ole kelvollinen genePop data. +% Muussa tapauksessa kunnossa == 1. + +kunnossa = 0; +fid = fopen(tiedostonNimi); +line1 = fgetl(fid); %ensimmäinen rivi +line2 = fgetl(fid); %toinen rivi +line3 = fgetl(fid); %kolmas + +if (isequal(line1,-1) || isequal(line2,-1) || isequal(line3,-1)) + disp('Incorrect file format 1168'); fclose(fid); + return +end +if (testaaPop(line1)==1 || testaaPop(line2)==1) + disp('Incorrect file format 1172'); fclose(fid); + return +end +if testaaPop(line3)==1 + %2 rivi tällöin lokusrivi + nloci = rivinSisaltamienMjonojenLkm(line2); + line4 = fgetl(fid); + if isequal(line4,-1) + disp('Incorrect file format 1180'); fclose(fid); + return + end + if ~any(line4==',') + % Rivin nelj?täytyy sisältää pilkku. + disp('Incorrect file format 1185'); fclose(fid); + return + end + pointer = 1; + while ~isequal(line4(pointer),',') %Tiedetään, ett?pysähtyy + pointer = pointer+1; + end + line4 = line4(pointer+1:end); %pilkun jälkeinen osa + nloci2 = rivinSisaltamienMjonojenLkm(line4); + if (nloci2~=nloci) + disp('Incorrect file format 1195'); fclose(fid); + return + end +else + line = fgetl(fid); + lineNumb = 4; + while (testaaPop(line)~=1 && ~isequal(line,-1)) + line = fgetl(fid); + lineNumb = lineNumb+1; + end + if isequal(line,-1) + disp('Incorrect file format 1206'); fclose(fid); + return + end + nloci = lineNumb-2; + line4 = fgetl(fid); %Eka rivi pop sanan jälkeen + if isequal(line4,-1) + disp('Incorrect file format 1212'); fclose(fid); + return + end + if ~any(line4==',') + % Rivin täytyy sisältää pilkku. + disp('Incorrect file format 1217'); fclose(fid); + return + end + pointer = 1; + while ~isequal(line4(pointer),',') %Tiedetään, ett?pysähtyy. + pointer = pointer+1; + end + + line4 = line4(pointer+1:end); %pilkun jälkeinen osa + nloci2 = rivinSisaltamienMjonojenLkm(line4); + if (nloci2~=nloci) + disp('Incorrect file format 1228'); fclose(fid); + return + end +end +kunnossa = 1; +fclose(fid); + +%------------------------------------------------------ + + +function [data, popnames] = lueGenePopData(tiedostonNimi) + +fid = fopen(tiedostonNimi); +line = fgetl(fid); %ensimmäinen rivi +line = fgetl(fid); %toinen rivi +count = rivinSisaltamienMjonojenLkm(line); + +line = fgetl(fid); +lokusRiveja = 1; +while (testaaPop(line)==0) + lokusRiveja = lokusRiveja+1; + line = fgetl(fid); +end + +if lokusRiveja>1 + nloci = lokusRiveja; +else + nloci = count; +end + +popnames = cell(10,2); +data = zeros(100, nloci+1); +nimienLkm=0; +ninds=0; +poimiNimi=1; +digitFormat = -1; +while line ~= -1 + line = fgetl(fid); + + if poimiNimi==1 + %Edellinen rivi oli 'pop' + nimienLkm = nimienLkm+1; + ninds = ninds+1; + if nimienLkm>size(popnames,1); + popnames = [popnames; cell(10,2)]; + end + nimi = lueNimi(line); + if digitFormat == -1 + digitFormat = selvitaDigitFormat(line); + divider = 10^digitFormat; + end + popnames{nimienLkm, 1} = {nimi}; %Näin se on greedyMix:issäkin?!? + popnames{nimienLkm, 2} = ninds; + poimiNimi=0; + + data = addAlleles(data, ninds, line, divider); + + elseif testaaPop(line) + poimiNimi = 1; + + elseif line ~= -1 + ninds = ninds+1; + data = addAlleles(data, ninds, line, divider); + end +end + +data = data(1:ninds*2,:); +popnames = popnames(1:nimienLkm,:); +fclose(fid); + +%-------------------------------------------------------- + + +function data = addAlleles(data, ind, line, divider) +% Lisaa BAPS-formaatissa olevaan datataulukkoon +% yksilöä ind vastaavat rivit. Yksilön alleelit +% luetaan genepop-formaatissa olevasta rivist? +% line. Jos data on 3 digit formaatissa on divider=1000. +% Jos data on 2 digit formaatissa on divider=100. + +nloci = size(data,2)-1; +if size(data,1) < 2*ind + data = [data; zeros(100,nloci+1)]; +end + +k=1; +merkki=line(k); +while ~isequal(merkki,',') + k=k+1; + merkki=line(k); +end +line = line(k+1:end); +clear k; clear merkki; + +alleeliTaulu = sscanf(line,'%d'); + +if length(alleeliTaulu)~=nloci + disp('Incorrect data format.'); +end + +for j=1:nloci + ekaAlleeli = floor(alleeliTaulu(j)/divider); + if ekaAlleeli==0 + ekaAlleeli=-999; + end + tokaAlleeli = rem(alleeliTaulu(j),divider); + if tokaAlleeli==0 + tokaAlleeli=-999; + end + + data(2*ind-1,j) = ekaAlleeli; + data(2*ind,j) = tokaAlleeli; +end + +data(2*ind-1,end) = ind; +data(2*ind,end) = ind; + +%------------------------------------------------------ + + +function count = rivinSisaltamienMjonojenLkm(line) +% Palauttaa line:n sisältämien mjonojen lukumäärän. +% Mjonojen väliss?täytyy olla välilyönti. +count = 0; +pit = length(line); +tila = 0; %0, jos odotetaan välilyöntej? 1 jos odotetaan muita merkkej? +for i=1:pit + merkki = line(i); + if (isspace(merkki) && tila==0) + %Ei tehd?mitään. + elseif (isspace(merkki) && tila==1) + tila = 0; + elseif (~isspace(merkki) && tila==0) + tila = 1; + count = count+1; + elseif (~isspace(merkki) && tila==1) + %Ei tehd?mitään + end +end + + +%------------------------------------------------------- + +function nimi = lueNimi(line) +%Palauttaa line:n alusta sen osan, joka on ennen pilkkua. +n = 1; +merkki = line(n); +nimi = ''; +while ~isequal(merkki,',') + nimi = [nimi merkki]; + n = n+1; + merkki = line(n); +end + +%------------------------------------------------------- + +function df = selvitaDigitFormat(line) +% line on ensimmäinen pop-sanan jälkeinen rivi +% Genepop-formaatissa olevasta datasta. funktio selvittää +% rivin muodon perusteella, ovatko datan alleelit annettu +% 2 vai 3 numeron avulla. + +n = 1; +merkki = line(n); +while ~isequal(merkki,',') + n = n+1; + merkki = line(n); +end + +while ~any(merkki == '0123456789'); + n = n+1; + merkki = line(n); +end +numeroja = 0; +while any(merkki == '0123456789'); + numeroja = numeroja+1; + n = n+1; + merkki = line(n); +end + +df = numeroja/2; \ No newline at end of file diff --git a/matlab/parallel/initPopNames.m b/matlab/parallel/initPopNames.m new file mode 100644 index 0000000..6b6adb1 --- /dev/null +++ b/matlab/parallel/initPopNames.m @@ -0,0 +1,35 @@ +function popnames = initPopNames(nameFile, indexFile) +%Palauttaa tyhjän, mikäli nimitiedosto ja indeksitiedosto +% eivät olleet yhtä pitkiä. + +popnames = []; +indices = load(indexFile); + +fid = fopen(nameFile); +if fid == -1 + %File didn't exist + msgbox('Loading of the population names was unsuccessful', ... + 'Error', 'error'); + return; +end; +line = fgetl(fid); +counter = 1; +while (line ~= -1) && ~isempty(line) + names{counter} = line; + line = fgetl(fid); + counter = counter + 1; +end; +fclose(fid); + +if length(names) ~= length(indices) + disp('The number of population names must be equal to the number of '); + disp('entries in the file specifying indices of the first individuals of '); + disp('each population.'); + return; +end + +popnames = cell(length(names), 2); +for i = 1:length(names) + popnames{i,1} = names(i); + popnames{i,2} = indices(i); +end diff --git a/matlab/parallel/linkageMixture_speed.m b/matlab/parallel/linkageMixture_speed.m new file mode 100644 index 0000000..9201aaf --- /dev/null +++ b/matlab/parallel/linkageMixture_speed.m @@ -0,0 +1,2138 @@ +function linkageMixture_speed +base = findobj('Tag','base_figure'); + + +% check whether fixed k mode is selected +h0 = findobj('Tag','fixk_menu'); +fixedK = get(h0, 'userdata'); + +if fixedK + if ~(fixKWarning == 1) % call function fixKWarning + return + end +end + +% Data handling +input_type = questdlg('Specify the format of your data: ',... + 'Specify Data Format', ... + 'MLST-format', 'BAPS-format','Pre-processed data', 'MLST-format'); + +switch input_type + + case 'MLST-format' + + %waitALittle; + mlst_type = questdlg('Choose data type: ',... + 'Specify MLST format', ... + 'Separate allelic profiles(TXT)', ... + 'Concatenate allelic sequences(EXCEL)','Separate allelic profiles(TXT)'); + switch mlst_type + case 'Concatenate allelic sequences(EXCEL)' + %waitALittle + setWindowOnTop(base,'false') + [filename,pathname] = uigetfile('*.xls','Load new concatenate sequence profile(*.xls)'); + if (sum(filename)==0) || (sum(pathname)==0) + return; + end + display('---------------------------------------------------'); + display(['Reading sequence profile from: ',[pathname filename],'...']); + + h0 = findobj('Tag','filename1_text'); + set(h0,'String',filename); + [data, component_mat, popnames] = processxls([pathname filename]); + + if isempty(data) + display('*** ERROR: Failed in loading the data'); + return; + end + + case 'Separate allelic profiles(TXT)' + % Ask the allelic profile + setWindowOnTop(base,'false') + [filename,pathname] = uigetfile('*.pl;*.txt','Load new allelic profile(*.pl, *.txt)'); + if (sum(filename)==0) || (sum(pathname)==0) + return; + end + display('---------------------------------------------------'); + display(['Reading allelic profile from: ',[pathname filename],'...']); + + h0 = findobj('Tag','filename1_text'); + set(h0,'String',filename); + + % Preprocess the profile + output = processprofile([pathname filename]); + headercount = size(output,2); + flag = zeros(1,headercount); + for i = 1:headercount + if strcmpi('ST',output{1,i}) + flag(i) = 1; + else + if strcmpi('Isolate', output{1,i}) || strcmpi('Strain',output{1,i}) + flag(i) = 2; + else + if strcmpi('Species', output{1,i}) + flag(i) = 3; + end + end + end + end + + if ~any(flag) + h = errordlg(['Loading of the specified file was unsuccessful. ' ... + 'Please see the tutorial to find out the correct file ' ... + 'format.'] ,'Error','modal'); + handle = [h,base]; + setWindowOnTop(handle,{'true','true'}) + + uiwait(h); + fprintf(1,'\n*** ERROR: Failed in loading the allelic profile.\n'); + return; + end + + index = (1:size(output,1)-1)'; + + + % Species selection + species_loc = find(flag==3); + if ~isempty(species_loc) + species_col = output((2:end), species_loc); + [species_str, m, n] = unique(species_col); + + [s1,v1] = listdlg('PromptString','Select species:',... + 'SelectionMode','multiple',... + 'Name','Select Species',... + 'ListString',species_str'); + removed = setdiff(n,s1); + + if ~v1 || isempty(s1) + dispCancel;return + end + else + n = index; + removed = 0; + end + + % Isolate/Strain selection + isolate_loc = find(flag==2); + if ~isempty(isolate_loc) + isolate_col = output((2:end), isolate_loc); + isolate_str = isolate_col(logical(~ismember(n,removed))); + index(ismember(n,removed)) = []; + + [s2,v2] = choosebox('Name','Select Isolates','PromptString',... + 'Isolates in the sample(Ctrl+A to select all):','SelectString', ... + 'Isolates you have selected','ListString', isolate_str', ... + 'InitialValue',1); + if isempty(s2) || ~v2 + dispCancel; return + end + else + % ST selection + isolate_loc = find(flag==1); + isolate_col = output((2:end), isolate_loc); + % isolate_str = isolate_col(find(~ismember(n,removed))); + isolate_str = isolate_col(logical(~ismember(n,removed))); + index(ismember(n,removed)) = []; + + [s2,v2] = choosebox('Name','Select STs','PromptString',... + 'STs in the sample (Ctrl+A to select all):','SelectString', ... + 'STs you have selected','ListString', isolate_str'); + if isempty(s2) || ~v2 + dispCancel; return + end + end + + % Read the data + isolate_index = index(s2); + if ~isempty(isolate_loc) + popnames(:,1) = isolate_str(s2); + else + popnames(:,1) = num2cell(index(s2)); + end + popnames(:,2) = num2cell([1:length(isolate_index)]'); + + % remove empty elements + strmat = output([isolate_index+1]',find(flag==0)); + [i,j] = find(cellfun('isempty',strmat)); + ij = [i j]; + for k=1:length(i) + strmat(ij(k,1),ij(k,2))={''}; + end + [i,j] = find(strcmp(strmat,'')); + ij = [i j]; + for k=1:length(i) + strmat(ij(k,1),ij(k,2))={'0'}; + end + + data_allele = zeros(size(strmat)); + for i = 1:size(strmat,1) + for j = 1:size(strmat,2) + data_allele(i,j) = str2num(char(strmat(i,j))); + end + end + + % remove columns containing empty values + realgene = find(all(data_allele)); + data_allele = data_allele(:,realgene); + partition_index = 1:size(strmat,1); + data_allele = [data_allele partition_index']; + genename = output(1,find(flag==0)); + genename = genename(realgene); + + + if isempty(genename) || isempty(data_allele) + msgbox(['Loading of the specified file was unsuccessful. ' ... + 'Please see the tutorial to find out the correct file ' ... + 'format.'] ,'Error', ... + 'error'); + fprintf(1,'\n*** ERROR: Failed in loading the allelic profile.\n'); + return; + else + display('---------------------------------------------------'); + display(['# of strains: ', num2str(size(data_allele,1))]); + display(['# of genes: ', num2str(size(data_allele,2)-1)]); + end + + %waitALittle; + % Ask the individual gene sequence + % [isOK,returnvalue] = askSeq; + % if (isOK & returnvalue~= 1) % allelic profile loaded only + % data = data_allele; + % component_mat = [1:size(data,2)-1]'; % assume independence + % elseif + % isOK & returnvalue == 1, + [s3,v3] = listdlg('PromptString','Select genes:',... + 'SelectionMode','multiple',... + 'Name','Select Genes',... + 'ListString',genename); + + if isempty(s3) || ~v3 + dispCancel; + return + else + m = size(s3,2); % number of genes + % data_seq = cell(m,1); + data = []; + genesize = zeros(1,m); + for i=1:m + %waitALittle; + data_gene = readfasta(genename{s3(i)}); % read fasta + if (isempty(data_gene)) + display('*** ERROR: Failed in loading the sequence data'); + return; + else + data_gene(:,find(sum(data_gene)==0)) = []; % NB! remove all the gaps + % data_seq{i} = data_gene; + selected_data = data_gene(data_allele(:,s3(i))',:); % Store only those selected strains + + emptyloci = find(all(selected_data(:,[1:end])<0)); + if ~isempty(emptyloci) + disp('Removing empty loci...'); + end + selected_data(:,emptyloci) = []; % remove empty loci + data = [data selected_data]; + end + genesize(i) = size(selected_data,2); % NB! could be different than the original gene length + end + data = [data data_allele(:,end)]; % add the index + % determine the component matrix + component_mat = zeros(m,max(genesize)); + cum = cumsum(genesize); + component_mat(1,1:genesize(1)) = 1:cum(1); + for i=2:m + component_mat(i,1:genesize(i)) = (cum(i-1)+1):cum(i); + end + end + % elseif isOK ==0 + % return + % end + otherwise + return + end + + %waitALittle; + display('---------------------------------------------------'); + fprintf(1,'Preprocessing the data ...'); + + % Make the missing data complete + % missing values are denoted as -999 + data = uint16(data); + % data = uint8(data); + data = makecomplete(data); + if isempty(data) + display('*** ERROR: Failed in completing the missing data'); + return; + end + + isRational = isTheLoadedNewDataRational(data); + if isRational == 0 + return; + end + + [data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = ... + handleData(data); + + + % Distance between individuals is computed as if the loci are + % independent. + [Z,dist] = newGetDistances(data,rowsFromInd); + fprintf(1,'Finished.\n'); + ninds = max(data(:,end)); + popnames = fixPopnames(popnames, ninds); + + c.data = uint16(data); c.rowsFromInd = rowsFromInd; c.alleleCodes = alleleCodes; + c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; + c.popnames = popnames; c.component_mat = component_mat; + c.dist = dist; c.Z = Z; + + %waitALittle; + save_preproc = questdlg('Do you wish to save the pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + %waitALittle; + [filename, pathname] = uiputfile('*.mat','Save pre-processed data as'); + if (sum(filename)==0) || (sum(pathname)==0) + return; + else + kokonimi = [pathname filename]; +% save(kokonimi,'c'); + save(kokonimi,'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + end + + end; + + %waitALittle; + linkage_model = questdlg('Specify the linkage model',... + 'Specify the linkage model?',... + 'Linear','Codon', 'Independent', 'Linear'); + if isequal(linkage_model,'Linear') + linkage_model = 'linear'; + display('Linear model was selected.'); + elseif isequal(linkage_model,'Codon') + linkage_model = 'codon'; + display('Codon model was selected.'); + elseif isequal(linkage_model,'Independent') + display('Independent model was selected.'); + c.data = double(c.data); + greedyMix(c); + return; + else + dispCancel; + return; + end; + + % Data transformation + fprintf(1,'Transforming the data ...'); + index = data(:,end); + [data_clique, data_separator, noalle_clique, noalle_separator] = ... + transform4(data, component_mat, linkage_model); + data_clique = [data_clique index]; + data_separator = [data_separator index]; + + % Count the data + [counts_cq, nalleles_cq, prior_cq, adjprior_cq, genotypes_cq]... + = allfreqsnew2(data_clique, double(noalle_clique)); + [counts_sp, nalleles_sp, prior_sp, adjprior_sp, genotypes_sp]... + = allfreqsnew2(data_separator, double(noalle_separator)); + + counts_cq = uint8(counts_cq); + counts_sp = uint8(counts_sp); + fprintf(1,'Finished.\n'); + + clear data_clique data_separator + + save_preproc = questdlg('Do you wish to save the fully pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + %waitALittle; + [filename, pathname] = uiputfile('*.mat','Save fully pre-processed data as'); + if (sum(filename)==0) || (sum(pathname)==0) + return; + else + kokonimi = [pathname filename]; + c.counts_cq = counts_cq; c.adjprior_cq = adjprior_cq; + c.counts_sp = counts_sp; c.adjprior_sp = adjprior_sp; + c.linkage_model = linkage_model; +% save(kokonimi,'c'); + save(kokonimi,'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + end + end; + clear c; + + case 'BAPS-format' + + input_type = questdlg('Specify the format of your data: ',... + 'Specify BAPS Format', ... + 'BAPS sequence data', 'BAPS numeric data', 'BAPS sequence data'); + switch input_type + case 'BAPS numeric data' + %waitALittle; + setWindowOnTop(base,'false') + [filename,pathname] = uigetfile('*.txt', 'Load BAPS numeric data'); + if (sum(filename)==0) || (sum(pathname)==0) + %cancel was pressed; do nothing. + return; + end; + + display('---------------------------------------------------'); + display(['Reading BAPS numeric data from: ',[pathname filename],'...']); + + try + data = load([pathname filename]); + catch + disp('*** ERROR: Incorrect BAPS numerical data.'); + return + end + case 'BAPS sequence data' + waitALittle;%waitALittle; + [data, filename] = readbaps; + if isempty(data) + return + end + otherwise + return; + end + + h0 = findobj('Tag','filename1_text'); + set(h0,'String',filename); clear h0; + waitALittle;%waitALittle; + input_pops = questdlg(['When using data which are in BAPS-format, '... + 'you can specify the sampling populations of the individuals by '... + 'giving two additional files: one containing the names of the '... + 'populations, the other containing the indices of the first '... + 'individuals of the populations. Do you wish to specify the '... + 'sampling populations?'], ... + 'Specify sampling populations?',... + 'Yes', 'No', 'No'); + if isequal(input_pops,'Yes') + %waitALittle; + display('Reading name and index files...'); + setWindowOnTop(base,'false') + [namefile, namepath] = uigetfile('*.txt', 'Load population names'); + if namefile==0 + kysyToinen = 0; + else + kysyToinen = 1; + end + if kysyToinen==1 + %waitALittle; + setWindowOnTop(base,'false') + [indicesfile, indicespath] = uigetfile('*.txt', 'Load population indices'); + if indicesfile==0 + % popnames = []; + dispCancel; + return + else + popnames = initPopNames([namepath namefile],[indicespath indicesfile]); + end + else + % popnames = []; + dispCancel; + return + end + else + % popnames = []; + popnames(:,1) = num2cell(unique(data(:,end))); + popnames(:,2) = popnames(:,1); + ninds = max(data(:,end)); + popnames = fixPopnames(popnames, ninds); + end + + % check that popnames is correct + if isempty(popnames) + display('*** ERROR: error in reading popnames.') + return + end + + data = uint16(data); + % Check that the data is rational: + isRational = isTheLoadedNewDataRational(data); + if isRational == 0 + msgbox(['Loaded file contained incorrect data. The last column of the ' ... + 'data file must contain sampling unit identifiers. Identifier specifies the ' ... + 'unit from which the genetic data on that particular row was collected. ' ... + 'Identifiers must be positive integers. If the biggest unit identifier is i.e. 27, there must ' ... + 'be at least one row for each unit 1-27'] ,'Error', ... + 'error'); + disp('*** ERROR: Failed in loading the BAPS data.'); + return; + else + display(['# of haplotypes: ', num2str(size(data,1))]); + display(['# of loci: ', num2str(size(data,2)-1)]); + % display('Finished.'); + end; + + % Check if the data is discrete or continuous + if any(any(fix(data)~=data)) + disp('Found decimal numbers. Continuous model will be used.'); +% input_type = questdlg('Choose the method for the continuous data: ',... +% 'Specify the method', ... +% 'BEC', 'Gibbs sampling with WINBUGS','BEC'); +% switch input_type +% case 'BEC' +% disp('Using the BEC mixture model...'); +% becMixture(data, popnames); +% case 'Gibbs sampling with WINBUGS' +% disp('Preparing the WINBUGS code...'); +% isok = makeBUGS(data); +% if isok disp('Finished.'); +% +% end +% otherwise +% return +% end + disp('** CANCELLED: continuous model is under construction.'); + return; + end + + display('---------------------------------------------------'); + fprintf(1,'Preprocessing the data ...'); + % Make the missing data complete + data = makecomplete(data); + if isempty(data) + display('*** ERROR: Failed in completing the missing data'); + return; + end + + [data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = ... + handleData(data); + + % Distance between individuals is computed as if the loci are + % independent. + [Z,dist] = newGetDistances(data,rowsFromInd); + fprintf(1,'Finished.\n'); + + c.data = uint16(data); c.rowsFromInd = rowsFromInd; c.alleleCodes = alleleCodes; + c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; + c.dist = dist; c.popnames = popnames; + c.Z = Z; + + input_linkage = questdlg('Do you wish to load the linkage map?',... + 'Load Linkage Map',... + 'Yes','No','Yes'); + if isequal(input_linkage,'Yes'); + display('---------------------------------------------------'); + %%waitALittle; + setWindowOnTop(base,'false') + [linkage_filename, linkage_pathname] = uigetfile('*.txt', 'Load Linkage Map'); + + if isempty(linkage_filename) && isempty(linkage_pathname) + return; + else + display(['Reading linkage map from: ',[linkage_pathname linkage_filename],'...']); + end; + + try + component_mat = load([linkage_pathname linkage_filename]); + catch + disp('*** ERROR: Incorrect linkage map.'); + return; + end + + % Check if the linkage map matches the data + if (size(data,2)-1) ~= max(component_mat(:)) + msgbox(['Loading of the specified file was unsuccessful. ' ... + 'The linkage map dose not match with the data.'] ,'Error', ... + 'error'); + disp('*** ERROR: Failed in loading the linkage map.'); + return; + else + display(['# of linkage groups: ', num2str(size(component_mat,1))]); + end; + display('---------------------------------------------------'); + h0 = findobj('Tag','filename1_text'); + set(h0,'String',[filename '/' linkage_filename]); clear h0; + c.component_mat = component_mat; + else + display('Independent model was selected.'); + c.data = double(c.data); + greedyMix(c); + return; + end + + + save_preproc = questdlg('Do you wish to save the pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + %%waitALittle; + [filename, pathname] = uiputfile('*.mat','Save pre-processed data as'); + if (sum(filename)==0) || (sum(pathname)==0) + return; + end + kokonimi = [pathname filename]; +% save(kokonimi,'c'); + save(kokonimi,'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + end; + + linkage_model = questdlg('Specify the linkage model',... + 'Specify the linkage model?',... + 'Linear','Codon', 'Independent', 'Linear'); + if isequal(linkage_model,'Linear') + linkage_model = 'linear'; + display('Linear model was selected.'); + elseif isequal(linkage_model,'Codon') + linkage_model = 'codon'; + display('Codon model was selected.'); + elseif isequal(linkage_model,'Independent') + display('Independent model was selected.'); + c.data = double(c.data); + greedyMix(c); + return; + else + dispCancel; + return; + end; + + % Data transformation + % display('---------------------------------------------------'); + fprintf(1,'Transforming the data ...'); + index = data(:,end); + [data_clique, data_separator, noalle_clique, noalle_separator] = ... + transform4(data, component_mat, linkage_model); + data_clique = [data_clique index]; + data_separator = [data_separator index]; + + [counts_cq, nalleles_cq, prior_cq, adjprior_cq, genotypes_cq]... + = allfreqsnew2(data_clique, double(noalle_clique)); + clear data_clique; + [counts_sp, nalleles_sp, prior_sp, adjprior_sp, genotypes_sp]... + = allfreqsnew2(data_separator, double(noalle_separator)); + clear data_separator; + counts_cq = uint8(counts_cq); + counts_sp = uint8(counts_sp); + fprintf(1,'Finished.\n'); + + save_preproc = questdlg('Do you wish to save the fully pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + %waitALittle; + [filename, pathname] = uiputfile('*.mat','Save fully pre-processed data as'); + if (sum(filename)==0) || (sum(pathname)==0) + return; + end + kokonimi = [pathname filename]; + c.counts_cq = counts_cq; c.adjprior_cq = adjprior_cq; + c.counts_sp = counts_sp; c.adjprior_sp = adjprior_sp; + c.linkage_model = linkage_model; +% save(kokonimi,'c'); + save(kokonimi,'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + end; + clear c; + + case 'Pre-processed data' + % This is basically the same format as the "Pre-processed data" in + % the basic clustering. The only difference is that the file + % includes also the component_mat + % %waitALittle; + setWindowOnTop(base,'false') + [filename, pathname] = uigetfile('*.mat', 'Load pre-processed data'); + if filename==0 + return; + end + display('---------------------------------------------------'); + display(['Reading preprocessed data from: ',[pathname filename],'...']); + h0 = findobj('Tag','filename1_text'); + set(h0,'String',filename); clear h0; + + struct_array = load([pathname filename]); + if isfield(struct_array,'c') %Matlab versio + c = struct_array.c; + if ~isfield(c,'dist') + display('*** ERROR: Incorrect file format'); + return + end + clear struct_array; + elseif isfield(struct_array,'dist') %Mideva versio + c = struct_array; + clear struct_array; + else + display('*** ERROR: Incorrect file format'); + return; + end + + % The following are the same as in the basic clustering + data = c.data; popnames = c.popnames; Z = c.Z; + noalle = c.noalle; adjprior = c.adjprior; + rowsFromInd = c.rowsFromInd; alleleCodes = c.alleleCodes; + dist = c.dist; priorTerm = c.priorTerm; + + if ~isfield(c,'component_mat') + display('*** ERROR: Incorrect file format'); + return + end + + % This is new + component_mat = c.component_mat; + data = uint16(data); + + display(['# of haplotypes: ', num2str(size(data,1))]); + display(['# of loci: ', num2str(size(data,2)-1)]); + display(['# of linkage groups: ', num2str(size(component_mat,1))]); + + if ~isfield(c, 'linkage_model') + %%%waitALittle; + % Independent is not an option, since it can be computed with the + % basic clustering which is much faster + linkage_model = questdlg('Specify the linkage model',... + 'Specify the linkage model?',... + 'Linear','Codon','Linear'); + if isequal(linkage_model,'Linear') + linkage_model = 'linear'; + display('Linear model was selected.'); + elseif isequal(linkage_model,'Codon') + linkage_model = 'codon'; + display('Codon model was selected.'); + else + dispCancel; + return; + end; + + clear c; % save the memory usage + + % Data transformation + fprintf(1,'Transforming the data ...'); + index = data(:,end); +% [data_clique, data_separator, noalle_clique, noalle_separator] = ... +% transform2(data, component_mat, linkage_model); + [data_clique, data_separator, noalle_clique, noalle_separator] = ... + transform4(data, component_mat, linkage_model); + data_clique = [data_clique index]; + data_separator = [data_separator index]; + + [counts_cq, nalleles_cq, prior_cq, adjprior_cq, genotypes_cq]... + = allfreqsnew2(data_clique, double(noalle_clique)); + clear data_clique; + [counts_sp, nalleles_sp, prior_sp, adjprior_sp, genotypes_sp]... + = allfreqsnew2(data_separator, double(noalle_separator)); + clear data_separator; + counts_cq = uint8(counts_cq); + counts_sp = uint8(counts_sp); + fprintf(1,'Finished.\n'); + + save_preproc = questdlg('Do you wish to save the fully pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + %%%waitALittle; + [filename, pathname] = uiputfile('*.mat','Save fully pre-processed data as'); + if (sum(filename)==0) || (sum(pathname)==0) + return; + end + + kokonimi = [pathname filename]; c.data = data; + c.counts_cq = counts_cq; c.adjprior_cq = adjprior_cq; + c.counts_sp = counts_sp; c.adjprior_sp = adjprior_sp; + c.linkage_model = linkage_model; + c.rowsFromInd = rowsFromInd; + c.alleleCodes = alleleCodes; + c.noalle = noalle; + c.adjprior = adjprior; + c.priorTerm = priorTerm; + c.popnames = popnames; + c.component_mat = component_mat; + c.dist = dist; c.Z = Z; +% save(kokonimi,'c'); + save(kokonimi,'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + end; + clear c; + else + %Linkage model is specified in the preprocessed file. + counts_cq = c.counts_cq; adjprior_cq = c.adjprior_cq; + counts_sp = c.counts_sp; adjprior_sp = c.adjprior_sp; + linkage_model = c.linkage_model; + counts_cq = uint8(counts_cq); + counts_sp = uint8(counts_sp); + clear c; + display(['linkage model: ', linkage_model]); + end + + otherwise + return; +end + + +global POP_LOGML; global PARTITION; +global CQ_COUNTS; global SP_COUNTS; %These counts are for populations +global CQ_SUMCOUNTS; global SP_SUMCOUNTS; %not for individuals +clearGlobalVars; + +c.noalle = noalle; +c.adjprior = adjprior; %priorTerm = c.priorTerm; +c.rowsFromInd = rowsFromInd; +c.counts_cq = counts_cq; c.adjprior_cq = adjprior_cq; +c.counts_sp = counts_sp; c.adjprior_sp = adjprior_sp; +c.dist = dist; c.Z = Z; + + +if fixedK + [logml, npops, partitionSummary] = linkageMix_fixK(c); +else + [logml, npops, partitionSummary] = linkageMix(c); +end + +if logml==1 + return; +end + +h0 = findobj('Tag','filename1_text'); inp = get(h0,'String'); +h0 = findobj('Tag','filename2_text'); +outp = get(h0,'String'); + +%This is basically the same as in BAPS 3. +changesInLogml = writeMixtureInfo(logml, counts_cq, counts_sp, adjprior_cq, ... + adjprior_sp, outp, inp, partitionSummary, popnames, linkage_model, ... + fixedK); + +viewMixPartition(PARTITION, popnames); + +% --------------------------------------------------------------------- +% Save the result. +% Jing - 26.12.2005 +talle = questdlg(['Do you want to save the mixture populations ' ... + 'so that you can use them later in admixture analysis?'], ... + 'Save results?','Yes','No','Yes'); +if isequal(talle,'Yes') + %%%waitALittle; + [filename, pathname] = uiputfile('*.mat','Save results as'); + + if (sum(filename)==0) || (sum(pathname)==0) + % Cancel was pressed + return + else % copy 'baps4_output.baps' into the text file with the same name. + if exist('baps4_output.baps','file') + copyfile('baps4_output.baps',[pathname filename '.txt']) + delete('baps4_output.baps') + end + end + + [sumcounts, counts] = indLociCounts(PARTITION, data, npops, noalle); + % NB! Index column is removed in data matrix. + c.PARTITION = PARTITION; c.CQ_COUNTS = CQ_COUNTS; c.CQ_SUMCOUNTS = CQ_SUMCOUNTS; + c.SP_COUNTS = SP_COUNTS; c.SP_SUMCOUNTS = SP_SUMCOUNTS; + c.alleleCodes = alleleCodes; c.adjprior_cq = adjprior_cq; c.adjprior_sp = adjprior_sp; c.popnames = popnames; + c.rowsFromInd = rowsFromInd; c.data = uint16(data); c.npops = npops; + % c.nalleles_cq = nalleles_cq; c.nalleles_sp = nalleles_sp; + if strcmp(linkage_model,'linear') % Added on 03.11.06 + c.mixtureType = 'linear_mix'; + elseif strcmp(linkage_model,'codon') + c.mixtureType = 'codon_mix'; + end + c.changesInLogml = changesInLogml; % this variable stores the change of likelihoods. + % [ncluster ninds] + % -Added on 02.11.2006 + + % The next ones are for the admixture input + c.COUNTS = counts; c.SUMCOUNTS = sumcounts; + c.adjprior = adjprior; c.rowsFromInd = rowsFromInd; c.noalle = noalle; c.alleleCodes = alleleCodes; + + % The two variables below are for the new linkage admixture model + c.gene_lengths = calcGeneLengths(component_mat); + + % The logml is saved for parallel computing + c.logml = logml; + + fprintf(1,'Saving the result...') + try +% save([pathname filename], 'c'); + save([pathname filename], 'c', '-v7.3'); % added by Lu Cheng, 08.06.2012 + fprintf(1,'Finished.\n'); + catch + display('*** ERROR in saving the result.'); + end +else + if exist('baps4_output.baps','file') + delete('baps4_output.baps') + end +end +% ----------------------------------------------------------------------- + + +%-------------------------------------------------------------------------- +% The next three functions are for computing the initial partition +% according to the distance between the individuals + +function initial_partition=admixture_initialization(nclusters,Z) +T=cluster_own(Z,nclusters); +initial_partition=T; + +%-------------------------------------------------------------------------- +function T = cluster_own(Z,nclust) +% true=logical(1); +% false=logical(0); + +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); +% maximum number of clusters based on inconsistency +if m <= maxclust + T = (1:m)'; +elseif maxclust==1 + T = ones(m,1); +else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end +end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + +%-------------------------------------------------------------------------- + +function dist2 = laskeOsaDist(inds2, dist, ninds) +% Muodostaa dist vektorista osavektorin, joka sisältää yksilöiden inds2 +% väliset etäisyydet. ninds=kaikkien yksilöiden lukumäär? + +ninds2 = length(inds2); +apu = zeros(nchoosek(ninds2,2),2); +rivi = 1; +for i=1:ninds2-1 + for j=i+1:ninds2 + apu(rivi, 1) = inds2(i); + apu(rivi, 2) = inds2(j); + rivi = rivi+1; + end +end +apu = (apu(:,1)-1).*ninds - apu(:,1) ./ 2 .* (apu(:,1)-1) + (apu(:,2)-apu(:,1)); +dist2 = dist(apu); + +%-------------------------------------------------------------------------- + +function Z = computeLinkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 || m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +% monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end + +%-------------------------------------------------------------------------- + +function changes = computeChanges(ind, adjprior_cq, adjprior_sp, ... + indCqCounts, indSpCounts) +% Computes changes in log-marginal likelihood if individual ind is +% moved to another population +% +% Input: +% ind - the individual to be moved +% adjprior_cq & _sp - adjpriors for cliques and separators +% indCqCounts, indSpCounts - counts for individual ind +% +% Output: +% changes - table of size 1*npops. changes(i) = difference in logml if +% ind is move to population i. + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; +npops = size(CQ_COUNTS,3); +changes = zeros(npops,1); + +i1 = PARTITION(ind); +i1_logml = POP_LOGML(i1); +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + +new_i1_logml = computePopulationLogml(i1, adjprior_cq, adjprior_sp); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)+indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)+sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)+indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)+sumSp; + + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = POP_LOGML(i2); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+repmat(indCqCounts, [1 1 npops-1]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+repmat(sumCq,[npops-1 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+repmat(indSpCounts, [1 1 npops-1]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:) + repmat(sumSp,[npops-1 1]); + +new_i2_logml = computePopulationLogml(i2, adjprior_cq, adjprior_sp); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)-repmat(indCqCounts, [1 1 npops-1]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)-repmat(sumCq,[npops-1 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)-repmat(indSpCounts, [1 1 npops-1]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:) - repmat(sumSp,[npops-1 1]); +% a = repmat(sumSp,[npops-1 1]); + +changes(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + +%------------------------------------------------------------------------------------ + +function changes = computeChanges2(i1, adjprior_cq, adjprior_sp) +% Computes changes in log marginal likelihood if population i1 is combined +% with another population +% +% Input: +% i1 - the population to be combined +% adjprior_cq & _sp - adjpriors for cliques and separators +% +% Output: +% changes - table of size 1*npops. changes(i) = difference in logml if +% i1 is combined with population i. + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global POP_LOGML; +npops = size(CQ_COUNTS,3); +changes = zeros(npops,1); + +i1_logml = POP_LOGML(i1); +indCqCounts = CQ_COUNTS(:,:,i1); +indSpCounts = SP_COUNTS(:,:,i1); +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +new_i1_logml = 0; + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = POP_LOGML(i2); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+repmat(indCqCounts, [1 1 npops-1]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+repmat(sumCq,[npops-1 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+repmat(indSpCounts, [1 1 npops-1]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+ repmat(sumSp,[npops-1 1]); +% a = repmat(sumSp,[npops-1 1]); +% if ~any(sumSp) +% a(:,[1:size(a,2)])=[]; +% end +% SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+ a ; + + +new_i2_logml = computePopulationLogml(i2, adjprior_cq, adjprior_sp); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)-repmat(indCqCounts, [1 1 npops-1]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)-repmat(sumCq,[npops-1 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)-repmat(indSpCounts, [1 1 npops-1]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)- repmat(sumSp,[npops-1 1]); + +changes(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + + + + +%------------------------------------------------------------------------------------ + + +function changes = computeChanges3(T2, inds2, i1, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp) +% Computes changes in log marginal likelihood if subpopulation of i2 is +% moved to another population +% +% Input: +% T2 - partition of inds2 to subpopulations +% inds2 - individuals in population i1 +% i2 +% counts_cq, counts_sp - counts for individuals +% +% Output: +% changes - table of size length(unique(T2))*npops. +% changes(i,j) = difference in logml if subpopulation inds2(find(T2==i)) of +% i2 is moved to population j + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global POP_LOGML; +npops = size(CQ_COUNTS,3); +npops2 = length(unique(T2)); +changes = zeros(npops2,npops); + +%cq_counts = CQ_COUNTS; +%sp_counts = SP_COUNTS; +%cq_sumcounts = CQ_SUMCOUNTS; +%sp_sumcounts = SP_SUMCOUNTS; + + +i1_logml = POP_LOGML(i1); + +for pop2 = 1:npops2 + % inds = inds2(find(T2==pop2)); + inds = inds2(logical(T2==pop2)); + ninds = length(inds); + if ninds>0 + indCqCounts = uint16(sum(counts_cq(:,:,inds),3)); + indSpCounts = uint16(sum(counts_sp(:,:,inds),3)); + sumCq = uint16(sum(indCqCounts,1)); + sumSp = uint16(sum(indSpCounts,1)); + + CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; + CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; + SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; + SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + + new_i1_logml = computePopulationLogml(i1, adjprior_cq, adjprior_sp); + + CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)+indCqCounts; + CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)+sumCq; + SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)+indSpCounts; + SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)+sumSp; + + i2 = [1:i1-1 , i1+1:npops]; + i2_logml = POP_LOGML(i2)'; + + CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+repmat(indCqCounts, [1 1 npops-1]); + CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+repmat(sumCq,[npops-1 1]); + SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+repmat(indSpCounts, [1 1 npops-1]); + SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+ repmat(sumSp,[npops-1 1]); + + new_i2_logml = computePopulationLogml(i2, adjprior_cq, adjprior_sp)'; + + CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)-repmat(indCqCounts, [1 1 npops-1]); + CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)-repmat(sumCq,[npops-1 1]); + SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)-repmat(indSpCounts, [1 1 npops-1]); + SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)- repmat(sumSp,[npops-1 1]); + + changes(pop2,i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + end +end + +%-------------------------------------------------------------------------- + +function changes = computeChanges5(inds, i1, i2, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp) +% Computes change in logml if individual of inds is moved between +% populations i1 and i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global POP_LOGML; global PARTITION; + +ninds = length(inds); +changes = zeros(ninds,1); + +i1_logml = POP_LOGML(i1); +i2_logml = POP_LOGML(i2); + +for i = 1:ninds + ind = inds(i); + if PARTITION(ind)==i1 + pop1 = i1; %from + pop2 = i2; %to + else + pop1 = i2; + pop2 = i1; + end + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + sumCq = uint16(sum(indCqCounts,1)); + sumSp = uint16(sum(indSpCounts,1)); + + CQ_COUNTS(:,:,pop1) = CQ_COUNTS(:,:,pop1)-indCqCounts; + CQ_SUMCOUNTS(pop1,:) = CQ_SUMCOUNTS(pop1,:)-sumCq; + SP_COUNTS(:,:,pop1) = SP_COUNTS(:,:,pop1)-indSpCounts; + SP_SUMCOUNTS(pop1,:) = SP_SUMCOUNTS(pop1,:) - sumSp; + + CQ_COUNTS(:,:,pop2) = CQ_COUNTS(:,:,pop2)+indCqCounts; + CQ_SUMCOUNTS(pop2,:) = CQ_SUMCOUNTS(pop2,:)+sumCq; + SP_COUNTS(:,:,pop2) = SP_COUNTS(:,:,pop2)+indSpCounts; + SP_SUMCOUNTS(pop2,:) = SP_SUMCOUNTS(pop2,:) + sumSp; + + new_logmls = computePopulationLogml([i1 i2], adjprior_cq, adjprior_sp); + changes(i) = sum(new_logmls); + + CQ_COUNTS(:,:,pop1) = CQ_COUNTS(:,:,pop1)+indCqCounts; + CQ_SUMCOUNTS(pop1,:) = CQ_SUMCOUNTS(pop1,:)+sumCq; + SP_COUNTS(:,:,pop1) = SP_COUNTS(:,:,pop1)+indSpCounts; + SP_SUMCOUNTS(pop1,:) = SP_SUMCOUNTS(pop1,:)+sumSp; + CQ_COUNTS(:,:,pop2) = CQ_COUNTS(:,:,pop2)-indCqCounts; + CQ_SUMCOUNTS(pop2,:) = CQ_SUMCOUNTS(pop2,:)-sumCq; + SP_COUNTS(:,:,pop2) = SP_COUNTS(:,:,pop2)-indSpCounts; + SP_SUMCOUNTS(pop2,:) = SP_SUMCOUNTS(pop2,:)-sumSp; +end + +changes = changes - i1_logml - i2_logml; + + +%------------------------------------------------------------------------------------- + + +function updateGlobalVariables(ind, i2, indCqCounts, indSpCounts, ... + adjprior_cq, adjprior_sp) +% Updates global variables when individual ind is moved to population i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; + +i1 = PARTITION(ind); +PARTITION(ind)=i2; + +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+indCqCounts; +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+sumCq; +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+indSpCounts; +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+sumSp; + + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior_cq, adjprior_sp); + + +%--------------------------------------------------------------------------------- + + +function updateGlobalVariables2(i1, i2, adjprior_cq, adjprior_sp) +% Updates global variables when all individuals from population i1 are moved +% to population i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; + +% inds = find(PARTITION==i1); +% PARTITION(inds) = i2; +PARTITION(logical(PARTITION==i1)) = i2; + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+CQ_COUNTS(:,:,i1); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+CQ_SUMCOUNTS(i1,:); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+SP_COUNTS(:,:,i1); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+SP_SUMCOUNTS(i1,:); + +CQ_COUNTS(:,:,i1) = 0; +CQ_SUMCOUNTS(i1,:) = 0; +SP_COUNTS(:,:,i1) = 0; +SP_SUMCOUNTS(i1,:) = 0; + +POP_LOGML(i1) = 0; +POP_LOGML(i2) = computePopulationLogml(i2, adjprior_cq, adjprior_sp); + + +%------------------------------------------------------------------------------------ + + +function updateGlobalVariables3(muuttuvat, i2, indCqCounts, indSpCounts, ... + adjprior_cq, adjprior_sp) +% Updates global variables when individuals muuttuvat are moved to +% population i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; + +i1 = PARTITION(muuttuvat(1)); +PARTITION(muuttuvat) = i2; + +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+indCqCounts; +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+sumCq; +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+indSpCounts; +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+sumSp; + + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior_cq, adjprior_sp); + +%---------------------------------------------------------------------- + + +function inds = returnInOrder(inds, pop, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp) +% Returns individuals inds in order according to the change in the logml if +% they are moved out of the population pop + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; + +ninds = length(inds); +apuTaulu = [inds, zeros(ninds,1)]; + +for i=1:ninds + ind = inds(i); + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + sumCq = uint16(sum(indCqCounts,1)); + sumSp = uint16(sum(indSpCounts,1)); + + CQ_COUNTS(:,:,pop) = CQ_COUNTS(:,:,pop)-indCqCounts; + CQ_SUMCOUNTS(pop,:) = CQ_SUMCOUNTS(pop,:)-sumCq; + SP_COUNTS(:,:,pop) = SP_COUNTS(:,:,pop)-indSpCounts; + SP_SUMCOUNTS(pop,:) = SP_SUMCOUNTS(pop,:)-sumSp; + + apuTaulu(i, 2) = computePopulationLogml(pop, adjprior_cq, adjprior_sp); + + CQ_COUNTS(:,:,pop) = CQ_COUNTS(:,:,pop)+indCqCounts; + CQ_SUMCOUNTS(pop,:) = CQ_SUMCOUNTS(pop,:)+sumCq; + SP_COUNTS(:,:,pop) = SP_COUNTS(:,:,pop)+indSpCounts; + SP_SUMCOUNTS(pop,:) = SP_SUMCOUNTS(pop,:)+sumSp; +end +apuTaulu = sortrows(apuTaulu,2); +inds = apuTaulu(ninds:-1:1,1); + + +%------------------------------------------------------------------------- + +function [Z, dist] = newGetDistances(data, rowsFromInd) + +ninds = max(data(:,end)); +nloci = size(data,2)-1; +riviLkm = nchoosek(double(ninds),2); + +% empties = find(data<0); +% data(empties)=0; +data(logical(data<0)) = 0; +data = uint16(data); + +pariTaulu = zeros(riviLkm,2); +aPointer=1; + +for a=1:ninds-1 + pariTaulu(aPointer:aPointer+double(ninds-1-a),1) = ones(ninds-a,1,'uint16')*a; + pariTaulu(aPointer:aPointer+double(ninds-1-a),2) = uint16((a+1:ninds)'); + aPointer = aPointer+double(ninds-a); +end + +eka = pariTaulu(:,ones(1,rowsFromInd)); +eka = eka * rowsFromInd; +miinus = repmat(rowsFromInd-1 : -1 : 0, [riviLkm 1]); +eka = eka - miinus; + +toka = pariTaulu(:,ones(1,rowsFromInd)*2); +toka = toka * rowsFromInd; +toka = toka - miinus; + +eka = uint16(eka); +toka = uint16(toka); + +clear pariTaulu; clear miinus; + +summa = uint16(zeros(riviLkm,1)); +vertailuja = uint16(zeros(riviLkm,1)); + +x = zeros(size(eka)); x = uint16(x); +y = zeros(size(toka)); y = uint16(y); +% fprintf(1,'%%10'); +for j=1:nloci; + + for k=1:rowsFromInd + x(:,k) = data(eka(:,k),j); + y(:,k) = data(toka(:,k),j); + end + + for a=1:rowsFromInd + for b=1:rowsFromInd + vertailutNyt = uint16(x(:,a)>0 & y(:,b)>0); + vertailuja = vertailuja + vertailutNyt; + lisays = (x(:,a)~=y(:,b) & vertailutNyt); + summa = summa + uint16(lisays); + end + end + % fprintf(1,'\b\b'); + % fprintf(1,'%d',floor(10+80*j/nloci)); +end + +clear x; clear y; clear vertailutNyt; +clear eka; clear toka; clear data; clear lisays; +dist = zeros(length(vertailuja),1); +% nollat = find(vertailuja==0); +% dist(nollat) = 1; +dist(logical(vertailuja==0)) = 1; +muut = find(vertailuja>0); +dist(muut) = double(summa(muut))./double(vertailuja(muut)); +clear summa; clear vertailuja; clear muut; + +Z = computeLinkage(dist'); +% fprintf(1,'\b\b'); +% fprintf(1,'%d\n',100); +%-------------------------------------------------------------------------- + +function clearGlobalVars + +global CQ_COUNTS; CQ_COUNTS = []; +global CQ_SUMCOUNTS; CQ_SUMCOUNTS = []; +global SP_COUNTS; SP_COUNTS = []; +global SP_SUMCOUNTS; SP_SUMCOUNTS = []; +global PARTITION; PARTITION = []; +global POP_LOGML; POP_LOGML = []; + +%-------------------------------------------------------------------------- + +function npops = removeEmptyPops +% Removes empty pops from all global COUNTS variables. +% Updates PARTITION and npops + +global CQ_COUNTS; +global CQ_SUMCOUNTS; +global SP_COUNTS; +global SP_SUMCOUNTS; +global PARTITION; + +notEmpty = find(any(CQ_SUMCOUNTS,2)); +CQ_COUNTS = CQ_COUNTS(:,:,notEmpty); +CQ_SUMCOUNTS = CQ_SUMCOUNTS(notEmpty,:); +SP_COUNTS = SP_COUNTS(:,:,notEmpty); +SP_SUMCOUNTS = SP_SUMCOUNTS(notEmpty,:); + +for n=1:length(notEmpty) +% apu = find(PARTITION==notEmpty(n)); +% PARTITION(apu)=n; +PARTITION(logical(PARTITION==notEmpty(n))) = n; +end +npops = length(notEmpty); + +%-------------------------------------------------------------------------- + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex) +% Tiedetään, ett?annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ss?ei viel?ole +% annettua logml arvoa, niin lisätään worstIndex:in kohtaan uusi logml ja +% nykyist?partitiota vastaava nclusters:in arvo. Muutoin ei tehd?mitään. +global PARTITION; +apu = isempty(find(abs(partitionSummary(:,2)-logml)<1e-5,1)); +if apu + % Nyt löydetty partitio ei ole viel?kirjattuna summaryyn. + npops = length(unique(PARTITION)); + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end + +%-------------------------------------------------------------------------- + +function [counts, sumcounts] = initialCounts(ind_counts) + +global PARTITION; + +pops = unique(PARTITION); +npops = max(pops); + +counts = zeros(size(ind_counts,1), size(ind_counts,2), npops,'uint16'); +sumcounts = zeros(npops, size(ind_counts,2),'uint16'); + +for i = 1:npops + inds = find(PARTITION == i); + counts(:,:,i) = sum(ind_counts(:,:,inds), 3); + sumcounts(i,:) = sum(counts(:,:,i),1); +end + +%-------------------------------------------------------------------------- + +function logml = computeLogml(adjprior_cq, adjprior_sp) + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; + +cq_counts = double(CQ_COUNTS); +cq_sumcounts = double(CQ_SUMCOUNTS); +sp_counts = double(SP_COUNTS); +sp_sumcounts = double(SP_SUMCOUNTS); + +npops = size(CQ_COUNTS, 3); + +cq_logml = sum(sum(sum(gammaln(cq_counts+repmat(adjprior_cq,[1 1 npops]))))) ... + - npops*sum(sum(gammaln(adjprior_cq))) - ... + sum(sum(gammaln(1+cq_sumcounts))); + +sp_logml = sum(sum(sum(gammaln(sp_counts+repmat(adjprior_sp,[1 1 npops]))))) ... + - npops*sum(sum(gammaln(adjprior_sp))) - ... + sum(sum(gammaln(1+sp_sumcounts))); + +logml = cq_logml - sp_logml; +clear cq_counts cq_sumcounts sp_counts sp_sumcounts; + +%-------------------------------------------------------------------------- + +function popLogml = computePopulationLogml(pops, adjprior_cq, adjprior_sp) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; + +cq_counts = double(CQ_COUNTS); +cq_sumcounts = double(CQ_SUMCOUNTS); +sp_counts = double(SP_COUNTS); +sp_sumcounts = double(SP_SUMCOUNTS); + +nall_cq = size(CQ_COUNTS,1); +nall_sp = size(SP_COUNTS, 1); +ncliq = size(CQ_COUNTS,2); +nsep = size(SP_COUNTS, 2); + +z = length(pops); + +popLogml_cq = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior_cq,[1 1 z]) + cq_counts(:,:,pops)) ... + ,[nall_cq ncliq z]),1),2)) - sum(gammaln(1+cq_sumcounts(pops,:)),2) - ... + sum(sum(gammaln(adjprior_cq))); + +popLogml_sp = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior_sp,[1 1 z]) + sp_counts(:,:,pops)) ... + ,[nall_sp nsep z]),1),2)) - sum(gammaln(1+sp_sumcounts(pops,:)),2) - ... + sum(sum(gammaln(adjprior_sp))); + +popLogml = popLogml_cq - popLogml_sp; +clear cq_counts cq_sumcounts sp_counts sp_sumcounts; + +%------------------------------------------------------------------- + + +function changesInLogml = writeMixtureInfo(logml, counts_cq, counts_sp, adjprior_cq, ... + adjprior_sp, outPutFile, inputFile, partitionSummary, popnames, linkage_model,... + fixedK) + +global PARTITION; +global CQ_COUNTS; +global LOGDIFF; + +%global CQ_SUMCOUNTS; +%global SP_COUNTS; global SP_SUMCOUNTS; +ninds = length(PARTITION); +npops = size(CQ_COUNTS,3); +names = (size(popnames,1) == ninds); %Tarkistetaan ett?nimet viittaavat yksilöihin + +if length(outPutFile)>0 + fid = fopen(outPutFile,'a'); +else + fid = -1; + diary('baps4_output.baps'); % save in text anyway. +end + +dispLine; +disp('RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:'); +disp(['Data file/ Linkage map: ' inputFile]); +disp(['Model: ' linkage_model]); +disp(['Number of clustered individuals: ' ownNum2Str(ninds)]); +disp(['Number of groups in optimal partition: ' ownNum2Str(npops)]); +disp(['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); +disp(' '); +if (fid ~= -1) + fprintf(fid,'%s \n', ['RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:']); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Data file: ' inputFile]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of clustered individuals: ' ownNum2Str(ninds)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of groups in optimal partition: ' ownNum2Str(npops)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); fprintf(fid,'\n'); +end + +cluster_count = length(unique(PARTITION)); +disp('Best Partition: '); +if (fid ~= -1) + fprintf(fid,'%s \n','Best Partition: '); fprintf(fid,'\n'); +end +for m=1:cluster_count + indsInM = find(PARTITION==m); + length_of_beginning = 11 + floor(log10(m)); + cluster_size = length(indsInM); + + if names + text = ['Cluster ' num2str(m) ': {' char(popnames{indsInM(1)})]; + for k = 2:cluster_size + text = [text ', ' char(popnames{indsInM(k)})]; + end; + else + text = ['Cluster ' num2str(m) ': {' num2str(indsInM(1))]; + for k = 2:cluster_size + text = [text ', ' num2str(indsInM(k))]; + end; + end + text = [text '}']; + while length(text)>58 + %Take one line and display it. + new_line = takeLine(text,58); + text = text(length(new_line)+1:end); + disp(new_line); + if (fid ~= -1) + fprintf(fid,'%s \n',new_line); + fprintf(fid,'\n'); + end + if length(text)>0 + text = [blanks(length_of_beginning) text]; + else + text = []; + end; + end; + if ~isempty(text) + disp(text); + if (fid ~= -1) + fprintf(fid,'%s \n',text); + fprintf(fid,'\n'); + end + end; +end + +if npops == 1 + changesInLogml = []; +else + disp(' '); + disp(' '); + disp('Changes in log(marginal likelihood) if indvidual i is moved to group j:'); + if (fid ~= -1) + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', 'Changes in log(marginal likelihood) if indvidual i is moved to group j:'); fprintf(fid, '\n'); + end + + if names + nameSizes = zeros(ninds,1); + for i = 1:ninds + nimi = char(popnames{i}); + nameSizes(i) = length(nimi); + end + maxSize = max(nameSizes); + maxSize = max(maxSize, 5); + erotus = maxSize - 5; + alku = blanks(erotus); + ekarivi = [alku ' ind' blanks(6+erotus)]; + else + ekarivi = ' ind '; + end + + for i = 1:cluster_count + ekarivi = [ekarivi ownNum2Str(i) blanks(8-floor(log10(i)))]; + end + disp(ekarivi); + if (fid ~= -1) + fprintf(fid, '%s \n', ekarivi); fprintf(fid, '\n'); + end + + %ninds = size(data,1)/rowsFromInd; + changesInLogml = LOGDIFF'; + for ind = 1:ninds + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + %changesInLogml(:,ind) = computeChanges(ind, adjprior_cq, ... + % adjprior_sp, indCqCounts, indSpCounts); + if names + nimi = char(popnames{ind}); + rivi = [blanks(maxSize - length(nimi)) nimi ':']; + else + rivi = [blanks(4-floor(log10(ind))) ownNum2Str(ind) ':']; + end + for j = 1:npops + rivi = [rivi ' ' logml2String(omaRound(changesInLogml(j,ind)))]; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', rivi); fprintf(fid, '\n'); + end + end + + + % % KL-divergence has to be calculated otherwise... + % % { + % disp(' '); disp(' '); + % disp('KL-divergence matrix:'); + % + % if (fid ~= -1) + % fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + % fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + % fprintf(fid, '%s \n', ['KL-divergence matrix:']); fprintf(fid, '\n'); + % end + % + % maxnoalle = size(COUNTS,1); + % nloci = size(COUNTS,2); + % d = zeros(maxnoalle, nloci, npops); + % prior = adjprior; + % prior(find(prior==1))=0; + % nollia = find(all(prior==0)); %Lokukset, joissa oli havaittu vain yht?alleelia. + % prior(1,nollia)=1; + % for pop1 = 1:npops + % d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))+prior) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))+prior),maxnoalle,1); + % dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1); + % end + % ekarivi = blanks(7); + % for pop = 1:npops + % ekarivi = [ekarivi num2str(pop) blanks(7-floor(log10(pop)))]; + % end + % disp(ekarivi); + % if (fid ~= -1) + % fprintf(fid, '%s \n', [ekarivi]); fprintf(fid, '\n'); + % end + % + % for pop1 = 1:npops + % rivi = [blanks(2-floor(log10(pop1))) num2str(pop1) ' ']; + % for pop2 = 1:pop1-1 + % dist1 = d(:,:,pop1); dist2 = d(:,:,pop2); + % div12 = sum(sum(dist1.*log2((dist1+10^-10) ./ (dist2+10^-10))))/nloci; + % div21 = sum(sum(dist2.*log2((dist2+10^-10) ./ (dist1+10^-10))))/nloci; + % div = (div12+div21)/2; + % rivi = [rivi kldiv2str(div) ' ']; + % end + % disp(rivi); + % if (fid ~= -1) + % fprintf(fid, '%s \n', [rivi]); fprintf(fid, '\n'); + % end + % end + % % } +end + +disp(' '); +disp(' '); +disp('List of sizes of 10 best visited partitions and corresponding log(ml) values'); + +if (fid ~= -1) + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', 'List of sizes of 10 best visited partitions and corresponding log(ml) values'); fprintf(fid, '\n'); +end + +partitionSummary = sortrows(partitionSummary,2); +partitionSummary = partitionSummary(size(partitionSummary,1):-1:1 , :); +% partitionSummary = partitionSummary(find(partitionSummary(:,2)>-1e49),:); +partitionSummary = partitionSummary(logical(partitionSummary(:,2)>-1e49),:); +if size(partitionSummary,1)>10 + vikaPartitio = 10; +else + vikaPartitio = size(partitionSummary,1); +end +for part = 1:vikaPartitio + line = [num2str(partitionSummary(part,1)) ' ' num2str(partitionSummary(part,2))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', line); fprintf(fid, '\n'); + end +end + +if ~fixedK + + disp(' '); + disp(' '); + disp('Probabilities for number of clusters'); + + if (fid ~= -1) + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', 'Probabilities for number of clusters'); fprintf(fid, '\n'); + end + + npopsTaulu = unique(partitionSummary(:,1)); + len = length(npopsTaulu); + probs = zeros(len,1); + partitionSummary(:,2) = partitionSummary(:,2)-max(partitionSummary(:,2)); + sumtn = sum(exp(partitionSummary(:,2))); + for i=1:len + % npopstn = sum(exp(partitionSummary(find(partitionSummary(:,1)==npopsTaulu(i)),2))); + npopstn = sum(exp(partitionSummary(logical(partitionSummary(:,1)==npopsTaulu(i)),2))); + probs(i) = npopstn / sumtn; + end + for i=1:len + if probs(i)>1e-5 + line = [num2str(npopsTaulu(i)) ' ' num2str(probs(i))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', line); fprintf(fid, '\n'); + end + end + end + +end + +if (fid ~= -1) + fclose(fid); +else + diary off +end + + +%-------------------------------------------------------------- +function newline = takeLine(description,width) +%Returns one line from the description: line ends to the first +%space after width:th mark. +% newLine = description(1:width); +n = width+1; +while ~isspace(description(n)) && n= 1 + suurinYks = suurinYks+1; + end + if suurinYks<10 + mjono(7) = num2str(suurinYks); + mjono(6) = 'e'; + mjono(5) = palautaYks(abs(logml),suurinYks-1); + mjono(4) = '.'; + mjono(3) = palautaYks(abs(logml),suurinYks); + if logml<0 + mjono(2) = '-'; + end + elseif suurinYks>=10 + mjono(6:7) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(logml),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(logml),suurinYks); + if logml<0 + mjono(1) = '-'; + end + end +end + +function digit = palautaYks(num,yks) +% palauttaa luvun num 10^yks termin kertoimen +% string:in? +% yks täytyy olla kokonaisluku, joka on +% vähintään -1:n suuruinen. Pienemmill? +% luvuilla tapahtuu jokin pyöristysvirhe. + +if yks>=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); + +%------------------------------------------------------------------------- + +function [newData, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = ... + handleData(raw_data) +% Alkuperäisen datan viimeinen sarake kertoo, milt?yksilölt? +% kyseinen rivi on peräisin. Funktio tutkii ensin, ett?montako +% rivi?maksimissaan on peräisin yhdelt?yksilölt? jolloin saadaan +% tietää onko kyseess?haploidi, diploidi jne... Tämän jälkeen funktio +% lisää tyhji?rivej?niille yksilöille, joilta on peräisin vähemmän +% rivej?kuin maksimimäär? +% Mikäli jonkin alleelin koodi on =0, funktio muuttaa tämän alleelin +% koodi pienimmäksi koodiksi, joka isompi kuin mikään käytöss?oleva koodi. +% Tämän jälkeen funktio muuttaa alleelikoodit siten, ett?yhden lokuksen j +% koodit saavat arvoja välill?1,...,noalle(j). + +data = raw_data; +nloci=size(raw_data,2)-1; + +dataApu = data(:,1:nloci); +nollat = find(dataApu==0); +if ~isempty(nollat) + isoinAlleeli = max(max(dataApu)); + dataApu(nollat) = isoinAlleeli+1; + data(:,1:nloci) = dataApu; +end +% dataApu = []; +% nollat = []; +% isoinAlleeli = []; + +noalle=zeros(1,nloci); +alleelitLokuksessa = cell(nloci,1); +for i=1:nloci + alleelitLokuksessaI = unique(data(:,i)); + %alleelitLokuksessa{i,1} = alleelitLokuksessaI(find(alleelitLokuksessaI>=0)); + alleelitLokuksessa{i,1} = alleelitLokuksessaI(logical(alleelitLokuksessaI>=0)); + noalle(i) = length(alleelitLokuksessa{i,1}); +end +alleleCodes = zeros(max(noalle),nloci); +for i=1:nloci + alleelitLokuksessaI = alleelitLokuksessa{i,1}; + puuttuvia = max(noalle)-length(alleelitLokuksessaI); + alleleCodes(:,i) = [alleelitLokuksessaI; zeros(puuttuvia,1)]; +end + +for loc = 1:nloci + for all = 1:noalle(loc) + % data(find(data(:,loc)==alleleCodes(all,loc)), loc)=all; + data(logical(data(:,loc)==alleleCodes(all,loc)), loc)=all; + end; +end; + +nind = max(data(:,end)); +nrows = size(data,1); +ncols = size(data,2); +rowsFromInd = zeros(nind,1); +for i=1:nind + rowsFromInd(i) = length(find(data(:,end)==i)); +end +maxRowsFromInd = max(rowsFromInd); +a = -999; +emptyRow = repmat(a, 1, ncols); +lessThanMax = find(rowsFromInd < maxRowsFromInd); +missingRows = maxRowsFromInd*nind - nrows; +data = [data; zeros(missingRows, ncols)]; +pointer = 1; +for ind=lessThanMax' %Käy läpi ne yksilöt, joilta puuttuu rivej? + miss = maxRowsFromInd-rowsFromInd(ind); % Tält?yksilölt?puuttuvien lkm. + for j=1:miss + rowToBeAdded = emptyRow; + rowToBeAdded(end) = ind; + data(nrows+pointer, :) = rowToBeAdded; + pointer = pointer+1; + end +end +data = sortrows(data, ncols); % Sorttaa yksilöiden mukaisesti +newData = data; +rowsFromInd = maxRowsFromInd; + +adjprior = zeros(max(noalle),nloci); +priorTerm = 0; +for j=1:nloci + adjprior(:,j) = [repmat(1/noalle(j), [noalle(j),1]) ; ones(max(noalle)-noalle(j),1)]; + priorTerm = priorTerm + noalle(j)*gammaln(1/noalle(j)); +end + +%-------------------------------------------------------------------------- + +function [emptyPop, pops] = findEmptyPop(npops) +% Palauttaa ensimmäisen tyhjän populaation indeksin. Jos tyhji? +% populaatioita ei ole, palauttaa -1:n. + +global PARTITION; +pops = unique(PARTITION)'; +if (length(pops) ==npops) + emptyPop = -1; +else + popDiff = diff([0 pops npops+1]); + emptyPop = min(find(popDiff > 1)); +end + +%-------------------------------------------------------------------------- + +function popnames = fixPopnames(popnames, ninds) + +if length(popnames) == ninds + for i=1:ninds + if isnumeric(popnames{i}) + popnames{i} = num2str(popnames{i}); + % popnames(i) = num2str(popnames{i}); + end + popnames{i} = cellstr(popnames{i}); + % popnames(i) = cellstr(popnames{i}); + end +end + +%-------------------------------------------------------------------------- +function isRational = isTheLoadedNewDataRational(data) +% The last column of the data must include numbers 1-npops +% If so, isRational = 1, otherwise isRational = 0. +% The row numbers must be larger than 1. +if size(data,1) == 1 + isRational = 0; + display('*** ERROR: Sample size must be larger than one'); + return; +end +last_column = data(:,end); +last_column = sort(last_column); +current = 1; +if last_column(1) ~= current + isRational = 0; + display('*** ERROR: Wrong Indexes in the data'); + return; +end; +lengthcol = length(last_column); +for n = 2:lengthcol + if ~(last_column(n) == current || last_column(n) == current + 1) + %Some population is missing from the last column + isRational = 0; + display('*** ERROR: Missing indexes in the data'); + return; + end; + current = last_column(n); +end; +isRational = 1; + + +% %------------------------------------------------------------------------- +% function isRational = isTheLoadedNewLinkageRational(linkage_data) +% % Each positive element must be unique. +% % If so, isRational = 1, otherwise isRational = 0; +% nonzero = find(linkage_data~=0); +% dif = diff(linkage_data(nonzero)); +% if ~all(dif) +% isRational = 0; return; +% end; +% isRational = 1; + +%-------------------------------------------------------------------------- + +function [sumcounts, counts] = ... + indLociCounts(partition, data, npops, noalle) + +nloci=size(data,2)-1; +% ninds = size(data,1); + +counts = zeros(max(noalle),nloci,npops); +sumcounts = zeros(npops,nloci); +for i=1:npops + for j=1:nloci + % havainnotLokuksessa = find(partition==i & data(:,j)>=0); + havainnotLokuksessa = find(ismember(data(:,end),find(partition==i))); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(havainnotLokuksessa,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + +%----------------------------------------------------------------------------------- + + +function popnames = initPopNames(nameFile, indexFile) +%Palauttaa tyhjän, mikäli nimitiedosto ja indeksitiedosto +% eivät olleet yht?pitki? + +popnames = []; +try +indices = load(indexFile); +catch + msgbox('Loading of the index file was unsuccessful', ... + 'Error', 'error'); + return +end +fid = fopen(nameFile); +if fid == -1 + %File didn't exist + msgbox('Loading of the name file was unsuccessful', ... + 'Error', 'error'); + return; +end + +line = fgetl(fid); +counter = 1; +while sum(line~=-1) && ~isempty(line) + names{counter} = line; + line = fgetl(fid); + counter = counter + 1; +end; +fclose(fid); + +if length(names) ~= length(indices) + disp('The number of population names must be equal to the number of '); + disp('entries in the file specifying indices of the first individuals of '); + disp('each population.'); + return; +end + +popnames = cell(length(names), 2); +for i = 1:length(names) + popnames{i,1} = names(i); + popnames{i,2} = indices(i); +end diff --git a/matlab/parallel/linkage_admix_parallel.m b/matlab/parallel/linkage_admix_parallel.m new file mode 100644 index 0000000..1d7a061 --- /dev/null +++ b/matlab/parallel/linkage_admix_parallel.m @@ -0,0 +1,1030 @@ +function linkage_admix_parallel(tietue, options) + +global COUNTS; global PARTITION; global SUMCOUNTS; +clearGlobalVars; + +PARTITION = tietue.PARTITION; +COUNTS = tietue.COUNTS; +SUMCOUNTS = tietue.SUMCOUNTS; +rowsFromInd = tietue.rowsFromInd; +data = double(tietue.data); +npops = tietue.npops; +noalle = tietue.noalle; +switch tietue.mixtureType + case 'linear_mix' + linkage_model = 'linear'; + case 'codon_mix' + linkage_model = 'codon'; +end +if isfield(tietue,'gene_lengths') + gene_lengths = tietue.gene_lengths; +else + [filename, pathname] = uigetfile('*.txt', 'Load file with lengths of the genes (same order as in data).'); + gene_lengths = load([pathname filename]); +end + +if length(unique(PARTITION(find(PARTITION>0))))==1 + disp('Only one population in the input file'); + disp('No admixture detected'); + return +end + +%answers = inputdlg({['Input the minimum size of a population that will'... +% ' be taken into account when admixture is estimated.']},... +% 'Input minimum population size',1,{'5'}); +%if isempty(answers), return; end +%alaRaja = str2double(answers{1,1}); +alaRaja = options.minSize; +[npops] = poistaLiianPienet(npops, rowsFromInd, alaRaja); + +nloci = size(COUNTS,2); +ninds = size(data,1)/rowsFromInd; + +iterationCount = options.iters; +nrefIndsInPop = options.refInds; +iterationCountRef = options.refIters; + +[cliq_data, sep_data, cliq_counts, component_mat] = createCliqData(data, gene_lengths, noalle, ... + linkage_model, rowsFromInd); + +% Repeat: simulate clique frequencies and estimate proportions. Save the +% average proportions in "proportionsIt". + +proportionsIt = zeros(ninds,npops); +for iterationNum = 1:iterationCount + disp(['Iter: ' num2str(iterationNum)]); + %allfreqs = simulateAllFreqs(noalle); + [cliq_freqs, sep_freqs] = simulateCliqFreqs(cliq_counts, noalle, component_mat, gene_lengths, linkage_model); + for ind=1:ninds + + %omaFreqs = computePersonalAllFreqs(ind, data, allfreqs, rowsFromInd); + [ownCliqFreqs, ownSepFreqs] = computePersonalCliqueFreqs(ind, ... + cliq_data, cliq_freqs, sep_data, sep_freqs, rowsFromInd, gene_lengths, linkage_model); + osuusTaulu = zeros(1,npops); + if PARTITION(ind)==0 + % Outlier individual + elseif PARTITION(ind)~=0 + if PARTITION(ind)>0 + osuusTaulu(PARTITION(ind)) = 1; + else + % Individuals who are not assigned to any cluster. + arvot = zeros(1,npops); + for q=1:npops + osuusTaulu = zeros(1,npops); + osuusTaulu(q) = 1; + arvot(q) = computeIndLikelihood(ownCliqFreqs, ownSepFreqs, osuusTaulu); + end + [iso_arvo, isoimman_indeksi] = max(arvot); + osuusTaulu = zeros(1,npops); + osuusTaulu(isoimman_indeksi) = 1; + PARTITION(ind)=isoimman_indeksi; + end + logml = computeIndLikelihood(ownCliqFreqs, ownSepFreqs, osuusTaulu); + + for osuus = [0.5 0.25 0.05 0.01] + [osuusTaulu, logml] = searchBest(osuus, osuusTaulu, ownCliqFreqs, ownSepFreqs, logml); + end + end + proportionsIt(ind,:) = proportionsIt(ind,:).*(iterationNum-1) + osuusTaulu; + proportionsIt(ind,:) = proportionsIt(ind,:)./iterationNum; + end +end + +disp(['Creating ' num2str(nrefIndsInPop) ' reference individuals from ']); +disp('each population.'); + +%allfreqs = simulateAllFreqs(noalle); % Simuloidaan alleelifrekvenssisetti +%allfreqs = computeAllFreqs2(noalle); % Koitetaan tällaista. +%refData = simulateIndividuals(nrefIndsInPop,rowsFromInd,allfreqs); + +exp_cliq_freqs = ... + computeExpectedFreqs(cliq_counts, noalle, component_mat, gene_lengths, linkage_model); +[ref_cliq_data, ref_sep_data] = ... + simulateLinkageIndividuals(nrefIndsInPop, rowsFromInd, exp_cliq_freqs, ... + gene_lengths, noalle, component_mat, linkage_model); +nrefInds = npops*nrefIndsInPop; + +disp(['Analysing the reference individuals in ' num2str(iterationCountRef) ' iterations.']); +refProportions = zeros(nrefInds,npops); +for iter = 1:iterationCountRef + disp(['Iter: ' num2str(iter)]); + %allfreqs = simulateAllFreqs(noalle); + [cliq_freqs, sep_freqs] = simulateCliqFreqs(cliq_counts, noalle, component_mat, gene_lengths, linkage_model); + for ind = 1:nrefInds + %omaFreqs = computePersonalAllFreqs(ind, refData, allfreqs, rowsFromInd); + [ownCliqFreqs, ownSepFreqs] = computePersonalCliqueFreqs(ind, ... + ref_cliq_data, cliq_freqs, ref_sep_data, sep_freqs, rowsFromInd, gene_lengths, linkage_model); + osuusTaulu = zeros(1,npops); + pop = ceil(ind/nrefIndsInPop); + osuusTaulu(pop)=1; + %logml = computeIndLogml(omaFreqs, osuusTaulu); + logml = computeIndLikelihood(ownCliqFreqs, ownSepFreqs, osuusTaulu); + for osuus = [0.5 0.25 0.05 0.01] + %[osuusTaulu, logml] = searchBest(osuus, osuusTaulu, omaFreqs, logml); + [osuusTaulu, logml] = searchBest(osuus, osuusTaulu, ownCliqFreqs, ownSepFreqs, logml); + end + refProportions(ind,:) = refProportions(ind,:).*(iter-1) + osuusTaulu; + refProportions(ind,:) = refProportions(ind,:)./iter; + end +end +refTaulu = zeros(npops,100); +for ind = 1:nrefInds + pop = ceil(ind/nrefIndsInPop); + omanOsuus = refProportions(ind,pop); + if round(omanOsuus*100)==0 + omanOsuus = 0.01; + end + if abs(omanOsuus)<1e-5 + omanOsuus = 0.01; + end + refTaulu(pop, round(omanOsuus*100)) = refTaulu(pop, round(omanOsuus*100))+1; +end + +% Rounding: +proportionsIt = proportionsIt.*100; proportionsIt = round(proportionsIt); +proportionsIt = proportionsIt./100; +for ind = 1:ninds + % if sum not equal to one, fix the largest part. + if (PARTITION(ind)>0) && (sum(proportionsIt(ind,:)) ~= 1) + [isoin,indeksi] = max(proportionsIt(ind,:)); + erotus = sum(proportionsIt(ind,:))-1; + proportionsIt(ind,indeksi) = isoin-erotus; + end +end + +% "p-value" for admixture +uskottavuus = zeros(ninds,1); +for ind = 1:ninds + pop = PARTITION(ind); + if pop==0 % an outlier + uskottavuus(ind)=1; + else + omanOsuus = proportionsIt(ind,pop); + if abs(omanOsuus)<1e-5 + omanOsuus = 0.01; + end + if round(omanOsuus*100)==0 + omanOsuus = 0.01; + end + refPienempia = sum(refTaulu(pop, 1:round(100*omanOsuus))); + uskottavuus(ind) = refPienempia / nrefIndsInPop; + end +end + +tulostaAdmixtureTiedot(proportionsIt, uskottavuus, alaRaja, iterationCount); + +%viewPartition(proportionsIt, popnames); + +fprintf(1,'Saving the result...') +try +% save(options.outputMat, 'c'); + save(options.outputMat, 'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + fprintf(1,'Finished.\n'); +catch + display('*** ERROR in saving the result.'); +end + + +% copy 'baps4_output.baps' into the text file with the same name. +if exist('baps4_output.baps','file') + copyfile('baps4_output.baps',[options.outputMat '.txt']) + delete('baps4_output.baps') +end + +tietue.proportionsIt = proportionsIt; +tietue.pvalue = uskottavuus; % Added by Jing +tietue.admixnpops = npops; +tietue.mixtureType = 'admix'; % added by jing on 09.09.2008 +% save(options.outputMat, 'tietue'); +save(options.outputMat, 'tietue','-v7.3'); % added by Lu Cheng, 08.06.2012 + + +%---------------------------------------------------------------------------- + + +function [npops] = poistaLiianPienet(npops, rowsFromInd, alaraja) +% Muokkaa tulokset muotoon, jossa outlier yksilät on +% poistettu. Tarkalleen ottaen poistaa ne populaatiot, +% joissa on vähemmän kuin 'alaraja':n verran yksiläit? + +global PARTITION; +global COUNTS; +global SUMCOUNTS; + +popSize=zeros(1,npops); +for i=1:npops + popSize(i)=length(find(PARTITION==i)); +end +miniPops = find(popSize0))); +for n=1:length(korit) + kori = korit(n); + yksilot = find(PARTITION==kori); + PARTITION(yksilot) = n; +end +COUNTS(:,:,miniPops) = []; +SUMCOUNTS(miniPops,:) = []; + +npops = npops-length(miniPops); + +%------------------------------------------------------------------------ + +function clearGlobalVars + +global COUNTS; COUNTS = []; +global SUMCOUNTS; SUMCOUNTS = []; +global PARTITION; PARTITION = []; +global POP_LOGML; POP_LOGML = []; + +%-------------------------------------------------------- + + +function allFreqs = computeAllFreqs2(noalle) +% Lisää a priori jokaista alleelia +% joka populaation joka lokukseen j 1/noalle(j) verran. + +global COUNTS; +global SUMCOUNTS; + +max_noalle = size(COUNTS,1); +nloci = size(COUNTS,2); +npops = size(COUNTS,3); + +sumCounts = SUMCOUNTS+ones(size(SUMCOUNTS)); +sumCounts = reshape(sumCounts', [1, nloci, npops]); +sumCounts = repmat(sumCounts, [max_noalle, 1 1]); + +prioriAlleelit = zeros(max_noalle,nloci); +for j=1:nloci + prioriAlleelit(1:noalle(j),j) = 1/noalle(j); +end +prioriAlleelit = repmat(prioriAlleelit, [1,1,npops]); +counts = COUNTS + prioriAlleelit; +allFreqs = counts./sumCounts; + +%-------------------------------------------------------------------------- + + +function refData = simulateIndividuals(n,rowsFromInd,allfreqs) +% simuloidaan n yksilää jokaisesta populaatiosta. ( + +npops = size(allfreqs,3); +nloci = size(allfreqs,2); +ninds = n*npops; + +refData = zeros(ninds*rowsFromInd,nloci); +counter = 1; % Pitää kirjaa mille riville seuraavaksi simuloidaan. + +for ind = 1:ninds + pop = ceil(ind/n); + for loc = 1:nloci + for k=0:rowsFromInd-1 + refData(counter+k,loc) = simuloiAlleeli(allfreqs,pop,loc); + end + end + counter = counter+rowsFromInd; +end + +function all = simuloiAlleeli(allfreqs,pop,loc) +% Simuloi populaation pop lokukseen loc alleelin. +freqs = allfreqs(:,loc,pop); +cumsumma = cumsum(freqs); +arvo = rand; +isommat = find(cumsumma>arvo); +all = min(isommat); + +%--------------------------------------------------------------------------- + + +function loggis = computeIndLikelihood(ownCliqFreqs, ownSepFreqs, proportions) +% Calculates the likelihood when the origins are defined by "proportions". + +aux = proportions * ownCliqFreqs; +aux = log(aux); +loggis = sum(aux); + +clear aux; + +aux2 = proportions * ownSepFreqs; +aux2 = log(aux2); +loggis = loggis - sum(aux2); + + +%-------------------------------------------------------------------------- + + +function osuusTaulu = suoritaMuutos(osuusTaulu, osuus, indeksi) +% Päivittää osuusTaulun muutoksen jälkeen. + +global COUNTS; +npops = size(COUNTS,3); + +i1 = rem(indeksi,npops); +if i1==0, i1 = npops; end; +i2 = ceil(indeksi / npops); + +osuusTaulu(i1) = osuusTaulu(i1)-osuus; +osuusTaulu(i2) = osuusTaulu(i2)+osuus; + + +%------------------------------------------------------------------------- + + +function [osuusTaulu, logml] = searchBest(osuus, osuusTaulu, ownCliqFreqs, ownSepFreqs, logml) + +ready = 0; +while ready ~= 1 + muutokset = calcChanges(osuus, osuusTaulu, ownCliqFreqs, ownSepFreqs, logml); + [maxMuutos, indeksi] = max(muutokset(1:end)); + if maxMuutos>0 + osuusTaulu = suoritaMuutos(osuusTaulu, osuus, indeksi); + logml = logml + maxMuutos; + else + ready = 1; + end +end + + + +%--------------------------------------------------------------------------- + + +function muutokset = calcChanges(osuus, osuusTaulu, ownCliqFreqs, ownSepFreqs, logml) +% Palauttaa npops*npops taulun, jonka alkio (i,j) kertoo, mik?on +% muutos logml:ss? mikäli populaatiosta i siirretään osuuden verran +% todennäkäisyysmassaa populaatioon j. Mikäli populaatiossa i ei ole +% mitään siirrettävää, on vastaavassa kohdassa rivi nollia. + +global COUNTS; +npops = size(COUNTS,3); + +notEmpty = find(osuusTaulu>0.005); +muutokset = zeros(npops); +empties = ~notEmpty; + +for i1=notEmpty + + osuusTaulu(i1) = osuusTaulu(i1)-osuus; + + for i2 = [1:i1-1 i1+1:npops] + osuusTaulu(i2) = osuusTaulu(i2)+osuus; + loggis = computeIndLikelihood(ownCliqFreqs, ownSepFreqs, osuusTaulu); + muutokset(i1,i2) = loggis-logml; + osuusTaulu(i2) = osuusTaulu(i2)-osuus; + end + + osuusTaulu(i1) = osuusTaulu(i1)+osuus; +end + + +%--------------------------------------------------------------- + + +function dispLine +disp('---------------------------------------------------'); + + +%-------------------------------------------------------------------------- + + +function tulostaAdmixtureTiedot(proportions, uskottavuus, alaRaja, niter) +h0 = findobj('Tag','filename1_text'); +inputf = get(h0,'String'); +h0 = findobj('Tag','filename2_text'); +outf = get(h0,'String'); clear h0; + +if length(outf)>0 + fid = fopen(outf,'a'); +else + fid = -1; + diary('baps4_output.baps'); % save in text anyway. +end + +ninds = length(uskottavuus); +npops = size(proportions,2); +disp(' '); +dispLine; +disp('RESULTS OF ADMIXTURE ANALYSIS BASED'); +disp('ON MIXTURE CLUSTERING OF INDIVIDUALS'); +disp(['Data file: ' inputf]); +disp(['Number of individuals: ' num2str(ninds)]); +disp(['Results based on ' num2str(niter) ' simulations from posterior allele frequencies.']); +disp(' '); +if fid ~= -1 + fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['--------------------------------------------']); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['RESULTS OF ADMIXTURE ANALYSIS BASED']); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['ON MIXTURE CLUSTERING OF INDIVIDUALS']); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['Data file: ' inputf]); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['Number of individuals: ' num2str(ninds)]); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['Results based on ' num2str(niter) ' simulations from posterior allele frequencies.']); fprintf(fid, '\n'); + fprintf(fid, '\n'); +end + +ekaRivi = blanks(6); +for pop = 1:npops + ekaRivi = [ekaRivi blanks(3-floor(log10(pop))) num2str(pop) blanks(2)]; +end +ekaRivi = [ekaRivi blanks(1) 'p']; % Added on 29.08.06 +disp(ekaRivi); +for ind = 1:ninds + rivi = [num2str(ind) ':' blanks(4-floor(log10(ind)))]; + if any(proportions(ind,:)>0) + for pop = 1:npops-1 + rivi = [rivi proportion2str(proportions(ind,pop)) blanks(2)]; + end + rivi = [rivi proportion2str(proportions(ind,npops)) ': ']; + rivi = [rivi ownNum2Str(uskottavuus(ind))]; + end + disp(rivi); + if fid ~= -1 + fprintf(fid,'%s \n',[rivi]); fprintf(fid,'\n'); + end +end +if fid ~= -1 + fclose(fid); +else + diary off +end + +%------------------------------------------------------ + +function str = proportion2str(prob) +%prob belongs to [0.00, 0.01, ... ,1]. +%str is a 4-mark presentation of proportion. + +if abs(prob)<1e-3 + str = '0.00'; +elseif abs(prob-1) < 1e-3; + str = '1.00'; +else + prob = round(100*prob); + if prob<10 + str = ['0.0' num2str(prob)]; + else + str = ['0.' num2str(prob)]; + end; +end; + +%------------------------------------------------------- + +function g=randga(a,b) +flag = 0; +if a>1 +c1 = a-1; c2 = (a-(1/(6*a)))/c1; c3 = 2/c1; c4 = c3+2; c5 = 1/sqrt(a); +U1=-1; +while flag == 0, +if a<=2.5, +U1=rand;U2=rand; +else +while ~(U1>0 & U1<1), +U1=rand;U2=rand; +U1 = U2 + c5*(1-1.86*U1); +end %while +end %if +W = c2*U2/U1; +if c3*U1+W+(1/W)<=c4, +flag = 1; +g = c1*W/b; +elseif c3*log(U1)-log(W)+W<1, +flag = 1; +g = c1*W/b; +else +U1=-1; +end %if +end %while flag +elseif a==1 +g=sum(-(1/b)*log(rand(a,1))); +else +while flag == 0, +U = rand(2,1); +if U(1)>exp(1)/(a+exp(1)), +g = -log(((a+exp(1))*(1-U(1)))/(a*exp(1))); +if U(2)<=g^(a-1), +flag = 1; +end %if +else +g = ((a+exp(1))*U(1)/((exp(1))^(1/a))); +if U(2)<=exp(-g), +flag = 1; +end %if +end %if +end %while flag +g=g/b; +end %if; + + +%------------------------------------------------- + +function svar=randdir(counts,nc) +% Käyttäesim randdir([10;30;60],3) + +svar=zeros(nc,1); +for i=1:nc + svar(i,1)=randga(counts(i,1),1); +end +svar=svar/sum(svar); + +%-------------------------------------------------- + +function waitALittle +A = rand(500); +gammaln(A); + + +%-------------------------------------------------- + +function [cliq_data, sep_data, cliq_counts, component_mat] = ... + createCliqData(data, gene_lengths, noalle, linkage_model, ... + rowsFromInd) +% cliq_data: cell array, each cell corresponds to one gene. Element (i,j) +% in cell k is the code of the allele combination in the j:th clique in +% gene k, for individual i. + +% sep_data: like cliq_data. i:th separator separates cliques i and i+1. + +% cliq_counts: cell array, each cell corresponds to one gene. Each cell is +% a 3-dimensional array, where the element (i,j,k) is the observed count of +% allele combination i, in clique j, in population k. + +% The coding of the allele combinations: If a clique of 3 sites has +% noalle values 3,2,4, then the allele combinations are given numbers in +% lexicographic order: 111, 112, 113, 114, 121, 122, ..., 324. + +%------------------------------------------------------------------------ + +% cliq_data on cell-array, jossa kukin solu vastaa yht?geeni? Alkio (i,j) +% solussa k merkitsee sen alleelikombinaation koodia, joka yksiläll?i +% havaitaan geenin k klikiss?numero j. + +% cliq_counts on cell-array, jossa myäs kukin solu vastaa yht?geeni? +% Kukin solu on kolmiulotteinen taulukko, jonka alkio (i,j,k) on +% populaatiossa k, ko geenin j:nness?klikiss?havaitun alleelikombinaation +% i lukumäär? + +% Alleelikombinaatioiden koodaus: Jos kolmen position klikiss?on (koko +% datassa) noalle:t 3,2,4, (eli ekassa positiossa alleelit 1-3, tokassa 1-2 +% ja kolmannessa alleelit 1-4), niin alleelikombinaatiot numeroidaan +% leksikograafisessa järjestyksess? 111, 112, 113, 114, 121, 122, ..., +% 324. + +%----------------------------------------------------------------------- + +global PARTITION; + +if sum(gene_lengths) ~= size(data,2) + disp('Error 155'); +end +if ~isa(data,'double') + data = double(data); % Required in matlab 6 +end + +ninds = size(data,1); +n_genes = length(gene_lengths); +if strcmp(linkage_model,'codon') + n_cliques = gene_lengths-2; % Number of cliques in each gene +else + n_cliques = gene_lengths-1; % Use linear model +end +max_noalle = zeros(n_genes,1); % Maximum "clique noalle" in each gene. + +component_mat = zeros(n_genes, max(gene_lengths)); +cum_length = cumsum(gene_lengths); +component_mat(1,1:gene_lengths(1))=1:gene_lengths(1); +for i = 2:n_genes + component_mat(i,1:gene_lengths(i)) = cum_length(i-1)+1:cum_length(i); +end + +for i = 1:n_genes + % What is the largest number of different values that are observed for + % some clique in this gene: + number = 0; + if n_cliques(i)<1 + % the gene is shorter than a normal clique. + if gene_lengths(i)==2 + % linkage_model must be 'codon' to end up here.. + positions = component_mat(i, [1 2]); + number = prod(noalle(positions)); + else + % gene_lengths(i) == 1 + positions = component_mat(i,1); + number = noalle(positions); + end + else + for j = 1:n_cliques(i) + if strcmp(linkage_model,'codon'), positions = component_mat(i , j:j+2); + else positions = component_mat(i, j:j+1); + end + + cand = prod(noalle(positions)); + if cand>number + number=cand; + end + end + end + max_noalle(i) = number; +end + +cliq_data = cell(n_genes,1); % An array for each gene. +% (i,j) is the combination which individual i has in clique j (in haploid case..). + +for i = 1:n_genes + if n_cliques(i)<1 + cliq_data{i} = zeros(ninds, 1); + if gene_lengths(i)==2 + % linkage_model must be 'codon' to end up here.. + positions = component_mat(i, [1 2]); + rows = data(:,positions); + observations = (rows(:,1)-1) * noalle(positions(2)) + rows(:,2); + else + positions = component_mat(i,1); + rows = data(:,positions); + observations = rows; + end + cliq_data{i}(:,1) = observations; + else + cliq_data{i} = zeros(ninds, n_cliques(i)); + for j = 1:n_cliques(i) + if strcmp(linkage_model,'codon') + positions = component_mat(i,j:j+2); + rows = data(:, positions); + observations = (rows(:,1)-1) * prod(noalle(positions(2:3))) + ... + (rows(:,2)-1) * noalle(positions(3)) + rows(:,3); + else + positions = component_mat(i,j:j+1); + rows = data(:,positions); + observations = (rows(:,1)-1) * noalle(positions(2)) + rows(:,2); + end + cliq_data{i}(:,j) = observations; + end + end +end + +cliq_counts = cell(n_genes,1); +% (i,j,k) is the count of combination i, in clique j, in population k. + +npops = length(unique(PARTITION)); +for i = 1:n_genes + cliq_counts{i} = zeros(max_noalle(i), max(1,n_cliques(i)), npops); + for j = 1:npops + partition = repmat(PARTITION', [rowsFromInd 1]); + partition = partition(:); % Partition for rows in the data (instead of individuals). + inds_now = find(partition==j); + ninds_now = length(inds_now); + data_now = cliq_data{i}(inds_now,:); + + for k = 1:max(n_cliques(i),1) + apu = zeros(ninds_now, max_noalle(i)); + apu(sub2ind([ninds_now max_noalle(i)],... + (1:ninds_now)', data_now(:,k)))=1; + cliq_counts{i}(:, k, j) = (sum(apu,1))'; + end + end +end + +sep_data = cell(n_genes,1); +n_separators = n_cliques-1; +for i = 1:n_genes + sep_data{i} = zeros(ninds, n_separators(i)); + for j = 1:n_separators(i) + if strcmp(linkage_model, 'codon') + positions = component_mat(i,j+1:j+2); + rows = data(:, positions); + observations = (rows(:,1)-1) * noalle(positions(2)) + rows(:,2); + else + positions = component_mat(i,j+1); + rows = data(:,positions); + observations = rows; + end + sep_data{i}(:,j) = observations; + end +end + + +%------------------------------------------------------ + + +function [cliq_freqs, sep_freqs] = simulateCliqFreqs(cliq_counts, noalle, component_mat, ... + gene_lengths, linkage_model) + +% cliq_freqs: cell-array. Element (i,j,k) in cell m is the frequence of +% combination i, in clique j of the m:th gene, in population k. + +% sep_freqs: like cliq_freqs, but for the separators. + +%------------------------------------------------------------------------ + +% cliq_freqs: cell-array, jossa on vastaavat dimensiot kuin cliq_counts:issa. +% solun m alkio (i,j,k) on geenin m, klikin j, kombinaation i, frekvenssi +% populaatiossa k. + +% sep_freqs: cell-array, kuten cl_freqs, mutta separaattoreille. + +%------------------------------------------------------------------------- + +global PARTITION; + +n_genes = length(cliq_counts); +if strcmp(linkage_model,'codon') + n_cliques = gene_lengths-2; % Number of cliques in each gene +else + n_cliques = gene_lengths-1; % Use linear model +end +npops = length(unique(PARTITION)); + +cliq_freqs = cell(n_genes,1); +sep_freqs = cell(n_genes,1); + +for i=1:n_genes + + cliq_freqs{i} = zeros(size(cliq_counts{i})); + + positions = component_mat(i,1:gene_lengths(i)); + + if n_cliques(i)<1 + % the gene is shorter than a normal clique. + if gene_lengths(i)==2 + % linkage_model must be 'codon' to end up here.. + cliq_noalle = noalle(positions(1)) .* noalle(positions(2)); + sep_noalle = []; + else + % gene_lengths(i) == 1 + cliq_noalle = noalle(positions(1)); + sep_noalle = []; + end + sep_freqs{i} = []; + else + if strcmp(linkage_model, 'codon') + cliq_noalle = noalle(positions(1:end-2)) .* noalle(positions(2:end-1)) .* ... + noalle(positions(3:end)); + sep_noalle = noalle(positions(2:end-2)) .* noalle(positions(3:end-1)); + else + cliq_noalle = noalle(positions(1:end-1)) .* noalle(positions(2:end)); + sep_noalle = noalle(positions(2:end-1)); + end + sep_freqs{i} = zeros(max(sep_noalle), n_cliques(i)-1, npops); + end + + % First clique: + prior = (1 / cliq_noalle(1)) * ones(cliq_noalle(1),1); + counts_now = repmat(prior, [1 1 npops]) + cliq_counts{i}(1:cliq_noalle(1),1,:); + for k = 1:npops + simul = randdir(counts_now(:,1,k), cliq_noalle(1)); + cliq_freqs{i}(1:cliq_noalle(1),1,k) = simul; + end + + for j=2:n_cliques(i) + % Obtain freqs for j-1:th separator by marginalization from j-1:th + % clique, and draw values for the frequencies of the j:th clique: + + aux = cliq_freqs{i}(1:cliq_noalle(j-1), j-1, :); % Freqs of the previous clique + aux = reshape(aux, [sep_noalle(j-1), noalle(positions(j-1)), npops]); + + % Freqs for separator by marginalization from the previous clique: + sep_freqs{i}(1:sep_noalle(j-1),j-1,:) = sum(aux,2); + + prior = (1 / cliq_noalle(j)) * ones(cliq_noalle(j),1); + counts_now = repmat(prior, [1 1 npops]) + cliq_counts{i}(1:cliq_noalle(j),j,:); + for k = 1:npops + % Simulate conditional frequencies: + for m = 1:sep_noalle(j-1) + if strcmp(linkage_model, 'codon') + values = (m-1)*noalle(positions(j+2))+1:m*noalle(positions(j+2)); + else + values = (m-1)*noalle(positions(j+1))+1:m*noalle(positions(j+1)); + end + simul = randdir(counts_now(values,1,k), length(values)); + cliq_freqs{i}(values,j,k) = simul * sep_freqs{i}(m,j-1,k); % MIETI TARKKAAN! + end + end + end +end + + +%-------------------------------------------------------------------- + + +function [ownCliqFreqs, ownSepFreqs] = computePersonalCliqueFreqs(... + ind, cl_data, cl_freqs, sep_data, sep_freqs, rowsFromInd, ... + gene_lengths, linkage_model) + +% ownCliqFreqs is (npops * (n_cliques*rowsFromInd)) table, where each column +% contains the frequencies of the corresponding clique_combination, in +% different populations. + +% ownSepFreqs is (npops * (n_seps*rowsFromInd)) table, like ownCliqFreqs. + +n_genes = length(gene_lengths); +if strcmp(linkage_model,'codon') + n_cliques = gene_lengths-2; + n_cliques = max([n_cliques ones(n_genes,1)], [], 2); % for genes shorter than clique +else + n_cliques = gene_lengths-1; % Use linear model. + n_cliques = max([n_cliques ones(n_genes,1)], [], 2); +end + +total_n_cliques = sum(n_cliques); +npops = size(cl_freqs{1},3); + +ownCliqFreqsXX = zeros(1, total_n_cliques*rowsFromInd, npops); + +pointer = 1; +for i = 1:n_genes + ind_data = cl_data{i}((ind-1)*rowsFromInd+1:ind*rowsFromInd , :); + for j = 1:n_cliques(i) % MUUTA! + for k = 1:rowsFromInd + code = ind_data(k,j); + ownCliqFreqsXX(1,pointer,:) = cl_freqs{i}(code,j,:); + pointer = pointer+1; + end + end +end + +ownCliqFreqs = (squeeze(ownCliqFreqsXX))'; + +n_separators = n_cliques-1; +total_n_separators = sum(n_separators); + +ownSepFreqsXX = zeros(1, total_n_separators*rowsFromInd, npops); + +pointer = 1; +for i = 1:n_genes + ind_data = sep_data{i}((ind-1)*rowsFromInd+1:ind*rowsFromInd , :); + for j = 1:n_separators(i) + for k = 1:rowsFromInd + code = ind_data(k,j); + ownSepFreqsXX(1,pointer,:) = sep_freqs{i}(code,j,:); + pointer = pointer+1; + end + end +end + +if (total_n_separators*rowsFromInd)==1 + ownSepFreqs = (squeeze(ownSepFreqsXX)); +else + ownSepFreqs = (squeeze(ownSepFreqsXX))'; +end + + +%------------------------------------------------------------------------- + + +function exp_cliq_freqs = computeExpectedFreqs(cliq_counts, ... + noalle, component_mat, gene_lengths, linkage_model) + +% Returns the expected values for the clique and separator frequencies in +% different populations. + +% exp_cliq_freqs: cell-array. Element (i,j,k) in cell m is the expected +% frequence of combination i, in clique j of the m:th gene, in +% population k. + +n_genes = length(gene_lengths); + +if strcmp(linkage_model, 'codon') + n_cliques = gene_lengths-2; +else + n_cliques = gene_lengths-1; % Linear model +end + +npops = size(cliq_counts{1},3); +exp_cliq_freqs = cell(n_genes,1); + +for i = 1:n_genes + + exp_cliq_freqs{i} = zeros(size(cliq_counts{i})); + positions = component_mat(i,1:gene_lengths(i)); + + if n_cliques(i)<1 + % the gene is shorter than a normal clique. + if gene_lengths(i)==2 + % linkage_model must be 'codon' to end up here.. + cliq_noalle = noalle(positions(1)) .* noalle(positions(2)); + else + % gene_lengths(i) == 1 + cliq_noalle = noalle(positions(1)); + end + else + if strcmp(linkage_model, 'codon') + cliq_noalle = noalle(positions(1:end-2)) .* noalle(positions(2:end-1)) .* ... + noalle(positions(3:end)); + else + cliq_noalle = noalle(positions(1:end-1)) .* noalle(positions(2:end)); + end + end + + for j = 1:max(1, n_cliques(i)) + prior = (1 / cliq_noalle(j)) * ones(cliq_noalle(j),1); + counts_now = repmat(prior, [1 1 npops]) + cliq_counts{i}(1:cliq_noalle(j),j,:); + exp_cliq_freqs{i}(1:cliq_noalle(j),j,:) = ... + counts_now ./ repmat(sum(counts_now,1), [cliq_noalle(j) 1 1]); + end +end + + +%---------------------------------------------------------- + + +function [ref_cliq_data, ref_sep_data] = ... + simulateLinkageIndividuals(n, rowsFromInd, exp_cliq_freqs, gene_lengths, ... + noalle, component_mat, linkage_model) + +% Simulates n individuals from each population using expected frequencies +% for cliques and separators. + +% ref_cliq_data: cell array, each cell corresponds to one gene. Elements +% ((i-1)*rowsFromInd+1:i*rowsFromInd, j) in cell k are the codes of the allele +% combinations in the j:th clique in gene k, for individual i. + +% ref_sep_data: like ref_cliq_data. i:th separator separates cliques i and i+1. + +n_genes = length(gene_lengths); +if strcmp(linkage_model,'codon') + n_cliques = gene_lengths-2; +else + n_cliques = gene_lengths-1; % Linear model +end +npops = size(exp_cliq_freqs{1},3); +ninds = n*npops; + +ref_cliq_data = cell(n_genes,1); +ref_sep_data = cell(n_genes,1); + +for i = 1:n_genes + ref_cliq_data{i} = zeros(ninds*rowsFromInd, max(n_cliques(i),1)); % Added: rowsFromInd + + positions = component_mat(i,1:gene_lengths(i)); + + if strcmp(linkage_model,'codon') + sep_noalle = noalle(positions(2:end-2)) .* noalle(positions(3:end-1)); + else + sep_noalle = noalle(positions(2:end-1)); + end + ref_sep_data{i} = zeros(ninds*rowsFromInd, n_cliques(i)-1); % Added: rowsFromInd + + for ind = 1:ninds + pop = ceil(ind/n); + + % First clique: + freqs = exp_cliq_freqs{i}(:,1,pop); + freqs = repmat(freqs, [1 rowsFromInd]); + codes = simulateCodes(freqs); + ref_cliq_data{i}((ind-1)*rowsFromInd+1:ind*rowsFromInd, 1) = codes; + + for j = 2:n_cliques(i) + previous_cliq = ref_cliq_data{i}((ind-1)*rowsFromInd+1:ind*rowsFromInd, j-1); + + % Value for j-1:th separator: + sep_codes = rem(previous_cliq, sep_noalle(j-1)); + sep_codes(find(sep_codes==0)) = sep_noalle(j-1); + ref_sep_data{i}((ind-1)*rowsFromInd+1:ind*rowsFromInd, j-1) = sep_codes; + + % Value for j:th clique: + if strcmp(linkage_model,'codon') + freqs = zeros(noalle(positions(j+2)),rowsFromInd); + values = zeros(noalle(positions(j+2)),rowsFromInd); + for k = 1:rowsFromInd + values(:,k) = ((sep_codes(k)-1)*noalle(positions(j+2))+1 : sep_codes(k)*noalle(positions(j+2)))'; + freqs(:,k) = exp_cliq_freqs{i}(values(:,k), j, pop); + end + freqs = freqs ./ repmat(sum(freqs,1), [noalle(positions(j+2)) 1]); + else + freqs = zeros(noalle(positions(j+1)),rowsFromInd); + values = zeros(noalle(positions(j+1)),rowsFromInd); + for k = 1:rowsFromInd + values(:,k) = ((sep_codes(k)-1)*noalle(positions(j+1))+1 : sep_codes(k)*noalle(positions(j+1)))'; + freqs(:,k) = exp_cliq_freqs{i}(values(:,k), j, pop); + end + freqs = freqs ./ repmat(sum(freqs,1), [noalle(positions(j+1)) 1]); + end + codes = simulateCodes(freqs); + codes = values(sub2ind(size(values),codes,(1:rowsFromInd)')); + ref_cliq_data{i}((ind-1)*rowsFromInd+1:ind*rowsFromInd, j) = codes; + end + end +end + +function codes = simulateCodes(freqs) +% Freqs is a table where each column is a distribution. The +% number of columns in freqs must be equal to rowsFromInd. +% A value is drawn from each distribution in different columns. The values +% are saved in codes, which is (rowsFromInd*1) table. + +[nrows, rowsFromInd] = size(freqs); +codes = nrows+1-sum(cumsum(freqs)>repmat(rand(1,rowsFromInd),[nrows 1]),1); +codes = codes'; \ No newline at end of file diff --git a/matlab/parallel/linkage_parallel.m b/matlab/parallel/linkage_parallel.m new file mode 100644 index 0000000..f8d196d --- /dev/null +++ b/matlab/parallel/linkage_parallel.m @@ -0,0 +1,1633 @@ +function linkage_parallel(options) +% LINKAGE_PARALLEL is the command line version of the baps partition with +% linkage models. +% Input: options is a struct generated by parallel.m + +%-------------------------------------------------------------------------- +%- Syntax check out +%-------------------------------------------------------------------------- +outp = [options.outputMat '.txt']; +inp = options.dataFile; + +if strcmp(options.fixedK, 'yes') + fixedK = 1; +else + fixedK = 0; +end + +switch options.mixtureType + case 'linear_mix' + linkage_model = 'linear'; + case 'codon_mix' + linkage_model = 'codon'; + case '' + error('*** ERROR: ''modeltype'' not specified.'); + otherwise + error('*** ERROR: unknown modeltype.'); +end + +%-------------------------------------------------------------------------- +%- Get data file location +%-------------------------------------------------------------------------- +switch options.dataType + case {'numeric', 'sequence'} + if isempty(options.linkageMap) + error('*** ERROR: ''linkagemap'' not specified.'); + end + if strcmp(options.dataType, 'numeric') + try + data = load(options.dataFile); + catch + disp('*** ERROR: Incorrect BAPS numerical data.'); + return + end + else + %[data, filename] = readbaps(inp); + [data, filename] = silentReadBaps(inp); % modified by Lu Cheng, 29.06.2010 + if isempty(data) + return + end + end + + %------------------------------------------------------------------ + %- Get name and index file location + %------------------------------------------------------------------ + if ~isempty(options.nameFile) & ~isempty(options.indexFile) + popnames = initPopNames(options.nameFile{1}, options.indexFile{1}); + else + popnames(:,1) = num2cell(unique(data(:,end))); + popnames(:,2) = popnames(:,1); + ninds = max(data(:,end)); + popnames = fixPopnames(popnames, ninds); + end + + if isempty(data) + error('*** ERROR: Failed in loading the data'); + end + data = uint16(data); + % Check that the data is rational: + isRational = isTheLoadedNewDataRational(data); + if isRational == 0 + error('*** ERROR: failed in loading the data.'); + else + display(['# of haplotypes: ', num2str(size(data,1))]); + display(['# of loci: ', num2str(size(data,2)-1)]); + % display('Finished.'); + end; + + % Check if the data is discrete or continuous + if any(any(fix(data)~=data)) + disp('Found decimal numbers. Continuous model will be used.'); + error('** ERROR: continuous model is under construction.'); + end + + display('---------------------------------------------------'); + % Load the linkage map + display(['Reading linkage map from: ',options.linkageMap,'...']); + try + component_mat = load(options.linkageMap); + catch + disp('*** ERROR: Incorrect linkage map.'); + return; + end + % Check if the linkage map matches the data + if (size(data,2)-1) ~= max(component_mat(:)) + disp('*** ERROR: Incosistent linkage map.'); + return; + else + display(['# of linkage groups: ', num2str(size(component_mat,1))]); + end; + + display('---------------------------------------------------'); + fprintf(1,'Preprocessing the data ...\n'); + % Make the missing data complete + data = makecomplete(data); + if isempty(data) + display('*** ERROR: Failed in completing the missing data'); + return; + end + + [data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = ... + handleData(data); + + % Distance between individuals is computed as if the loci are + % independent. + fprintf(1, 'Caculating the distances ...'); + [Z,dist] = newGetDistances(data,rowsFromInd); + fprintf(1,'Finished.\n'); + + c.data = uint16(data); c.rowsFromInd = rowsFromInd; c.alleleCodes = alleleCodes; + c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; + c.dist = dist; c.popnames = popnames; + c.Z = Z; + + + c.component_mat = component_mat; + + % Data transformation + % display('---------------------------------------------------'); + fprintf(1,'Transforming the data ...'); + index = data(:,end); + [data_clique, data_separator, noalle_clique, noalle_separator] = ... + transform4(data, component_mat, linkage_model); + data_clique = [data_clique index]; + data_separator = [data_separator index]; + + [counts_cq, nalleles_cq, prior_cq, adjprior_cq, genotypes_cq]... + = allfreqsnew2(data_clique, double(noalle_clique)); + clear data_clique; + [counts_sp, nalleles_sp, prior_sp, adjprior_sp, genotypes_sp]... + = allfreqsnew2(data_separator, double(noalle_separator)); + clear data_separator; + counts_cq = uint8(counts_cq); + counts_sp = uint8(counts_sp); + fprintf(1,'Finished.\n'); + clear c; + case 'matlab' + struct_array = load(options.dataFile); + if isfield(struct_array,'c') %Matlab versio + c = struct_array.c; + if ~isfield(c,'dist') + display('*** ERROR: Incorrect file format'); + return + end + clear struct_array; + elseif isfield(struct_array,'dist') %Mideva versio + c = struct_array; + clear struct_array; + else + display('*** ERROR: Incorrect file format'); + return; + end + % The following are the same as in the basic clustering + data = c.data; popnames = c.popnames; Z = c.Z; + noalle = c.noalle; adjprior = c.adjprior; + rowsFromInd = c.rowsFromInd; alleleCodes = c.alleleCodes; + dist = c.dist; priorTerm = c.priorTerm; + + if ~isfield(c,'component_mat') + display('*** ERROR: Incorrect linkage data'); + return + end + + % This is new + component_mat = c.component_mat; + data = uint16(data); + + display(['# of haplotypes: ', num2str(size(data,1))]); + display(['# of loci: ', num2str(size(data,2)-1)]); + display(['# of linkage groups: ', num2str(size(component_mat,1))]); + + if strcmpi(options.mixtureType, 'codon_mix') + linkage_model = 'codon'; + else + linkage_model = 'linear'; + end + if ~isfield(c, 'mixtureType') + clear c; % save the memory usage + % Data transformation + fprintf(1,'Transforming the data ...'); + index = data(:,end); + % [data_clique, data_separator, noalle_clique, noalle_separator] = ... + % transform2(data, component_mat, linkage_model); + [data_clique, data_separator, noalle_clique, noalle_separator] = ... + transform4(data, component_mat, linkage_model); + data_clique = [data_clique index]; + data_separator = [data_separator index]; + + [counts_cq, nalleles_cq, prior_cq, adjprior_cq, genotypes_cq]... + = allfreqsnew2(data_clique, double(noalle_clique)); + clear data_clique; + [counts_sp, nalleles_sp, prior_sp, adjprior_sp, genotypes_sp]... + = allfreqsnew2(data_separator, double(noalle_separator)); + clear data_separator; + counts_cq = uint8(counts_cq); + counts_sp = uint8(counts_sp); + fprintf(1,'Finished.\n'); + else + if ~strcmpi(c.mixtureType, options.mixtureType) + error('*** ERROR: incorrect mixture type'); + end + %Linkage model is specified in the preprocessed file. + counts_cq = c.counts_cq; adjprior_cq = c.adjprior_cq; + counts_sp = c.counts_sp; adjprior_sp = c.adjprior_sp; + linkage_model = c.linkage_model; + counts_cq = uint8(counts_cq); + counts_sp = uint8(counts_sp); + clear c; + display(['linkage model: ', linkage_model]); + end + + + case 'excel' + display('---------------------------------------------------'); + display(['Reading sequence profile from: ', options.dataFile,'...']); + [data, component_mat, popnames] = processxls(options.dataFile); + if isempty(data) + display('*** ERROR: Failed in loading the data'); + return; + end + + display('---------------------------------------------------'); + fprintf(1,'Preprocessing the data ...\n'); + + % Make the missing data complete + % missing values are denoted as -999 + data = uint16(data); + % data = uint8(data); + data = makecomplete(data); + if isempty(data) + display('*** ERROR: Failed in completing the missing data'); + return; + end + + isRational = isTheLoadedNewDataRational(data); + if isRational == 0 + return; + else + display(['# of haplotypes: ', num2str(size(data,1))]); + display(['# of loci: ', num2str(size(data,2)-1)]); + end + + [data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = ... + handleData(data); + + + % Distance between individuals is computed as if the loci are + % independent. + [Z,dist] = newGetDistances(data,rowsFromInd); + fprintf(1,'Finished.\n'); + ninds = max(data(:,end)); + popnames = fixPopnames(popnames, ninds); + + c.data = uint16(data); c.rowsFromInd = rowsFromInd; c.alleleCodes = alleleCodes; + c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; + c.popnames = popnames; c.component_mat = component_mat; + c.dist = dist; c.Z = Z; + + % Data transformation + display('---------------------------------------------------'); + fprintf(1,'Transforming the data ...'); + index = data(:,end); + [data_clique, data_separator, noalle_clique, noalle_separator] = ... + transform4(data, component_mat, linkage_model); + data_clique = [data_clique index]; + data_separator = [data_separator index]; + + [counts_cq, nalleles_cq, prior_cq, adjprior_cq, genotypes_cq]... + = allfreqsnew2(data_clique, double(noalle_clique)); + clear data_clique; + [counts_sp, nalleles_sp, prior_sp, adjprior_sp, genotypes_sp]... + = allfreqsnew2(data_separator, double(noalle_separator)); + clear data_separator; + counts_cq = uint8(counts_cq); + counts_sp = uint8(counts_sp); + fprintf(1,'Finished.\n'); + clear c; + otherwise + error('*** ERROR: data type is not specified or unknown.'); +end + + +global POP_LOGML; global PARTITION; +global CQ_COUNTS; global SP_COUNTS; %These counts are for populations +global CQ_SUMCOUNTS; global SP_SUMCOUNTS; %not for individuals +clearGlobalVars; +npopstext = []; +npopstextExtra = options.initialK; +if length(npopstextExtra)>=255 + npopstextExtra = npopstextExtra(1:255); + npopstext = [npopstext ' ' npopstextExtra]; + teksti = 'The input field length limit (255 characters) was reached. Input more values: '; +else + % ----------------------------------------------------- + % Set the limit of the input value. + % Modified by Jing Tang, 30.12.2005 + if max(npopstextExtra) > size(data,1) + error('Values larger than the sample size are not accepted. '); + else + npopstext = [npopstext ' ' num2str(npopstextExtra)]; + end +end + +clear ready; clear teksti; +if isempty(npopstext) || length(npopstext)==1 + return +else + npopsTable = str2num(npopstext); + % ykkoset = find(npopsTable==1); + npopsTable(logical(npopsTable==1)) = []; + if isempty(npopsTable) + return + end + % clear ykkoset; +end + +c.noalle = noalle; +c.adjprior = adjprior; %priorTerm = c.priorTerm; +c.rowsFromInd = rowsFromInd; +c.counts_cq = counts_cq; c.adjprior_cq = adjprior_cq; +c.counts_sp = counts_sp; c.adjprior_sp = adjprior_sp; +c.dist = dist; c.Z = Z; + +if fixedK + % Only the first value of npopsTaulu is used + npops = npopsTable(1); + nruns = length(npopsTable); + + % [logml, npops, partitionSummary]=linkageMix_fixK(c,npops,nruns,1); + display('*** ERROR: fixed K for linkage module is not available.'); + return +else + [logml, npops, partitionSummary]=linkageMix(c,npopsTable); +end + +if logml==1 + return +end + +data = noIndex(data,noalle); + +%This is basically the same as in BAPS 3. +changesInLogml = writeMixtureInfo(logml, counts_cq, counts_sp, adjprior_cq, ... + adjprior_sp, outp, inp, partitionSummary, popnames, linkage_model, ... + fixedK); + +if exist('baps4_output.baps','file') + copyfile('baps4_output.baps',outp) + delete('baps4_output.baps') +end + +[sumcounts, counts] = indLociCounts(PARTITION, data, npops, noalle); +% NB! Index column is removed in data matrix. +c.PARTITION = PARTITION; c.CQ_COUNTS = CQ_COUNTS; c.CQ_SUMCOUNTS = CQ_SUMCOUNTS; +c.SP_COUNTS = SP_COUNTS; c.SP_SUMCOUNTS = SP_SUMCOUNTS; +c.alleleCodes = alleleCodes; c.adjprior_cq = adjprior_cq; c.adjprior_sp = adjprior_sp; c.popnames = popnames; +c.rowsFromInd = rowsFromInd; c.data = uint16(data); c.npops = npops; +% c.nalleles_cq = nalleles_cq; c.nalleles_sp = nalleles_sp; +if strcmp(linkage_model,'linear') % Added on 03.11.06 + c.mixtureType = 'linear_mix'; +elseif strcmp(linkage_model,'codon') + c.mixtureType = 'codon_mix'; +end +c.changesInLogml = changesInLogml; % this variable stores the change of likelihoods. +% [ncluster ninds] +% -Added on 02.11.2006 + +% The next ones are for the admixture input +c.COUNTS = counts; c.SUMCOUNTS = sumcounts; +c.adjprior = adjprior; c.rowsFromInd = rowsFromInd; c.noalle = noalle; c.alleleCodes = alleleCodes; + +% The two variables below are for the new linkage admixture model +c.linkage_model = linkage_model; +c.gene_lengths = calcGeneLengths(component_mat); + +% The logml is saved for parallel computing +c.logml = logml; + +fprintf(1,'Saving the result...') +try +% save(options.outputMat, 'c'); + save(options.outputMat, 'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + fprintf(1,'Finished.\n'); +catch + display('*** ERROR in saving the result.'); +end + + + + +%-------------------------------------------------------------------------- +% The next three functions are for computing the initial partition +% according to the distance between the individuals + +function initial_partition=admixture_initialization(nclusters,Z) +T=cluster_own(Z,nclusters); +initial_partition=T; + +%-------------------------------------------------------------------------- +function T = cluster_own(Z,nclust) +% true=logical(1); +% false=logical(0); + +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); +% maximum number of clusters based on inconsistency +if m <= maxclust + T = (1:m)'; +elseif maxclust==1 + T = ones(m,1); +else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end +end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + +%-------------------------------------------------------------------------- + +function dist2 = laskeOsaDist(inds2, dist, ninds) +% Muodostaa dist vektorista osavektorin, joka sisältää yksilöiden inds2 +% väliset etäisyydet. ninds=kaikkien yksilöiden lukumäär? + +ninds2 = length(inds2); +apu = zeros(nchoosek(ninds2,2),2); +rivi = 1; +for i=1:ninds2-1 + for j=i+1:ninds2 + apu(rivi, 1) = inds2(i); + apu(rivi, 2) = inds2(j); + rivi = rivi+1; + end +end +apu = (apu(:,1)-1).*ninds - apu(:,1) ./ 2 .* (apu(:,1)-1) + (apu(:,2)-apu(:,1)); +dist2 = dist(apu); + +%-------------------------------------------------------------------------- + +function Z = computeLinkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 || m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +% monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end + +%-------------------------------------------------------------------------- + +function changes = computeChanges(ind, adjprior_cq, adjprior_sp, ... + indCqCounts, indSpCounts) +% Computes changes in log-marginal likelihood if individual ind is +% moved to another population +% +% Input: +% ind - the individual to be moved +% adjprior_cq & _sp - adjpriors for cliques and separators +% indCqCounts, indSpCounts - counts for individual ind +% +% Output: +% changes - table of size 1*npops. changes(i) = difference in logml if +% ind is move to population i. + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; +npops = size(CQ_COUNTS,3); +changes = zeros(npops,1); + +i1 = PARTITION(ind); +i1_logml = POP_LOGML(i1); +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + +new_i1_logml = computePopulationLogml(i1, adjprior_cq, adjprior_sp); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)+indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)+sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)+indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)+sumSp; + + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = POP_LOGML(i2); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+repmat(indCqCounts, [1 1 npops-1]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+repmat(sumCq,[npops-1 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+repmat(indSpCounts, [1 1 npops-1]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:) + repmat(sumSp,[npops-1 1]); + +new_i2_logml = computePopulationLogml(i2, adjprior_cq, adjprior_sp); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)-repmat(indCqCounts, [1 1 npops-1]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)-repmat(sumCq,[npops-1 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)-repmat(indSpCounts, [1 1 npops-1]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:) - repmat(sumSp,[npops-1 1]); +% a = repmat(sumSp,[npops-1 1]); + +changes(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + +%------------------------------------------------------------------------------------ + +function changes = computeChanges2(i1, adjprior_cq, adjprior_sp) +% Computes changes in log marginal likelihood if population i1 is combined +% with another population +% +% Input: +% i1 - the population to be combined +% adjprior_cq & _sp - adjpriors for cliques and separators +% +% Output: +% changes - table of size 1*npops. changes(i) = difference in logml if +% i1 is combined with population i. + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global POP_LOGML; +npops = size(CQ_COUNTS,3); +changes = zeros(npops,1); + +i1_logml = POP_LOGML(i1); +indCqCounts = CQ_COUNTS(:,:,i1); +indSpCounts = SP_COUNTS(:,:,i1); +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +new_i1_logml = 0; + +i2 = [1:i1-1 , i1+1:npops]; +i2_logml = POP_LOGML(i2); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+repmat(indCqCounts, [1 1 npops-1]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+repmat(sumCq,[npops-1 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+repmat(indSpCounts, [1 1 npops-1]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+ repmat(sumSp,[npops-1 1]); +% a = repmat(sumSp,[npops-1 1]); +% if ~any(sumSp) +% a(:,[1:size(a,2)])=[]; +% end +% SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+ a ; + + +new_i2_logml = computePopulationLogml(i2, adjprior_cq, adjprior_sp); + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)-repmat(indCqCounts, [1 1 npops-1]); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)-repmat(sumCq,[npops-1 1]); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)-repmat(indSpCounts, [1 1 npops-1]); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)- repmat(sumSp,[npops-1 1]); + +changes(i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + + + + +%------------------------------------------------------------------------------------ + + +function changes = computeChanges3(T2, inds2, i1, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp) +% Computes changes in log marginal likelihood if subpopulation of i2 is +% moved to another population +% +% Input: +% T2 - partition of inds2 to subpopulations +% inds2 - individuals in population i1 +% i2 +% counts_cq, counts_sp - counts for individuals +% +% Output: +% changes - table of size length(unique(T2))*npops. +% changes(i,j) = difference in logml if subpopulation inds2(find(T2==i)) of +% i2 is moved to population j + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global POP_LOGML; +npops = size(CQ_COUNTS,3); +npops2 = length(unique(T2)); +changes = zeros(npops2,npops); + +%cq_counts = CQ_COUNTS; +%sp_counts = SP_COUNTS; +%cq_sumcounts = CQ_SUMCOUNTS; +%sp_sumcounts = SP_SUMCOUNTS; + + +i1_logml = POP_LOGML(i1); + +for pop2 = 1:npops2 + % inds = inds2(find(T2==pop2)); + inds = inds2(logical(T2==pop2)); + ninds = length(inds); + if ninds>0 + indCqCounts = uint16(sum(counts_cq(:,:,inds),3)); + indSpCounts = uint16(sum(counts_sp(:,:,inds),3)); + sumCq = uint16(sum(indCqCounts,1)); + sumSp = uint16(sum(indSpCounts,1)); + + CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; + CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; + SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; + SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + + new_i1_logml = computePopulationLogml(i1, adjprior_cq, adjprior_sp); + + CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)+indCqCounts; + CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)+sumCq; + SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)+indSpCounts; + SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)+sumSp; + + i2 = [1:i1-1 , i1+1:npops]; + i2_logml = POP_LOGML(i2)'; + + CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+repmat(indCqCounts, [1 1 npops-1]); + CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+repmat(sumCq,[npops-1 1]); + SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+repmat(indSpCounts, [1 1 npops-1]); + SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+ repmat(sumSp,[npops-1 1]); + + new_i2_logml = computePopulationLogml(i2, adjprior_cq, adjprior_sp)'; + + CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)-repmat(indCqCounts, [1 1 npops-1]); + CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)-repmat(sumCq,[npops-1 1]); + SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)-repmat(indSpCounts, [1 1 npops-1]); + SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)- repmat(sumSp,[npops-1 1]); + + changes(pop2,i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + end +end + +%-------------------------------------------------------------------------- + +function changes = computeChanges5(inds, i1, i2, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp) +% Computes change in logml if individual of inds is moved between +% populations i1 and i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global POP_LOGML; global PARTITION; + +ninds = length(inds); +changes = zeros(ninds,1); + +i1_logml = POP_LOGML(i1); +i2_logml = POP_LOGML(i2); + +for i = 1:ninds + ind = inds(i); + if PARTITION(ind)==i1 + pop1 = i1; %from + pop2 = i2; %to + else + pop1 = i2; + pop2 = i1; + end + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + sumCq = uint16(sum(indCqCounts,1)); + sumSp = uint16(sum(indSpCounts,1)); + + CQ_COUNTS(:,:,pop1) = CQ_COUNTS(:,:,pop1)-indCqCounts; + CQ_SUMCOUNTS(pop1,:) = CQ_SUMCOUNTS(pop1,:)-sumCq; + SP_COUNTS(:,:,pop1) = SP_COUNTS(:,:,pop1)-indSpCounts; + SP_SUMCOUNTS(pop1,:) = SP_SUMCOUNTS(pop1,:) - sumSp; + + CQ_COUNTS(:,:,pop2) = CQ_COUNTS(:,:,pop2)+indCqCounts; + CQ_SUMCOUNTS(pop2,:) = CQ_SUMCOUNTS(pop2,:)+sumCq; + SP_COUNTS(:,:,pop2) = SP_COUNTS(:,:,pop2)+indSpCounts; + SP_SUMCOUNTS(pop2,:) = SP_SUMCOUNTS(pop2,:) + sumSp; + + new_logmls = computePopulationLogml([i1 i2], adjprior_cq, adjprior_sp); + changes(i) = sum(new_logmls); + + CQ_COUNTS(:,:,pop1) = CQ_COUNTS(:,:,pop1)+indCqCounts; + CQ_SUMCOUNTS(pop1,:) = CQ_SUMCOUNTS(pop1,:)+sumCq; + SP_COUNTS(:,:,pop1) = SP_COUNTS(:,:,pop1)+indSpCounts; + SP_SUMCOUNTS(pop1,:) = SP_SUMCOUNTS(pop1,:)+sumSp; + CQ_COUNTS(:,:,pop2) = CQ_COUNTS(:,:,pop2)-indCqCounts; + CQ_SUMCOUNTS(pop2,:) = CQ_SUMCOUNTS(pop2,:)-sumCq; + SP_COUNTS(:,:,pop2) = SP_COUNTS(:,:,pop2)-indSpCounts; + SP_SUMCOUNTS(pop2,:) = SP_SUMCOUNTS(pop2,:)-sumSp; +end + +changes = changes - i1_logml - i2_logml; + + +%------------------------------------------------------------------------------------- + + +function updateGlobalVariables(ind, i2, indCqCounts, indSpCounts, ... + adjprior_cq, adjprior_sp) +% Updates global variables when individual ind is moved to population i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; + +i1 = PARTITION(ind); +PARTITION(ind)=i2; + +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+indCqCounts; +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+sumCq; +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+indSpCounts; +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+sumSp; + + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior_cq, adjprior_sp); + + +%--------------------------------------------------------------------------------- + + +function updateGlobalVariables2(i1, i2, adjprior_cq, adjprior_sp) +% Updates global variables when all individuals from population i1 are moved +% to population i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; + +% inds = find(PARTITION==i1); +% PARTITION(inds) = i2; +PARTITION(logical(PARTITION==i1)) = i2; + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+CQ_COUNTS(:,:,i1); +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+CQ_SUMCOUNTS(i1,:); +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+SP_COUNTS(:,:,i1); +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+SP_SUMCOUNTS(i1,:); + +CQ_COUNTS(:,:,i1) = 0; +CQ_SUMCOUNTS(i1,:) = 0; +SP_COUNTS(:,:,i1) = 0; +SP_SUMCOUNTS(i1,:) = 0; + +POP_LOGML(i1) = 0; +POP_LOGML(i2) = computePopulationLogml(i2, adjprior_cq, adjprior_sp); + + +%------------------------------------------------------------------------------------ + + +function updateGlobalVariables3(muuttuvat, i2, indCqCounts, indSpCounts, ... + adjprior_cq, adjprior_sp) +% Updates global variables when individuals muuttuvat are moved to +% population i2 + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; +global PARTITION; global POP_LOGML; + +i1 = PARTITION(muuttuvat(1)); +PARTITION(muuttuvat) = i2; + +sumCq = uint16(sum(indCqCounts,1)); +sumSp = uint16(sum(indSpCounts,1)); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1)-indCqCounts; +CQ_SUMCOUNTS(i1,:) = CQ_SUMCOUNTS(i1,:)-sumCq; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1)-indSpCounts; +SP_SUMCOUNTS(i1,:) = SP_SUMCOUNTS(i1,:)-sumSp; + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2)+indCqCounts; +CQ_SUMCOUNTS(i2,:) = CQ_SUMCOUNTS(i2,:)+sumCq; +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2)+indSpCounts; +SP_SUMCOUNTS(i2,:) = SP_SUMCOUNTS(i2,:)+sumSp; + + +POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior_cq, adjprior_sp); + +%---------------------------------------------------------------------- + + +function inds = returnInOrder(inds, pop, counts_cq, counts_sp, ... + adjprior_cq, adjprior_sp) +% Returns individuals inds in order according to the change in the logml if +% they are moved out of the population pop + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; + +ninds = length(inds); +apuTaulu = [inds, zeros(ninds,1)]; + +for i=1:ninds + ind = inds(i); + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + sumCq = uint16(sum(indCqCounts,1)); + sumSp = uint16(sum(indSpCounts,1)); + + CQ_COUNTS(:,:,pop) = CQ_COUNTS(:,:,pop)-indCqCounts; + CQ_SUMCOUNTS(pop,:) = CQ_SUMCOUNTS(pop,:)-sumCq; + SP_COUNTS(:,:,pop) = SP_COUNTS(:,:,pop)-indSpCounts; + SP_SUMCOUNTS(pop,:) = SP_SUMCOUNTS(pop,:)-sumSp; + + apuTaulu(i, 2) = computePopulationLogml(pop, adjprior_cq, adjprior_sp); + + CQ_COUNTS(:,:,pop) = CQ_COUNTS(:,:,pop)+indCqCounts; + CQ_SUMCOUNTS(pop,:) = CQ_SUMCOUNTS(pop,:)+sumCq; + SP_COUNTS(:,:,pop) = SP_COUNTS(:,:,pop)+indSpCounts; + SP_SUMCOUNTS(pop,:) = SP_SUMCOUNTS(pop,:)+sumSp; +end +apuTaulu = sortrows(apuTaulu,2); +inds = apuTaulu(ninds:-1:1,1); + + +%------------------------------------------------------------------------- + +function [Z, dist] = newGetDistances(data, rowsFromInd) + +ninds = max(data(:,end)); +nloci = size(data,2)-1; +riviLkm = nchoosek(double(ninds),2); + +% empties = find(data<0); +% data(empties)=0; +data(logical(data<0)) = 0; +data = uint16(data); + +pariTaulu = zeros(riviLkm,2); +aPointer=1; + +for a=1:ninds-1 + pariTaulu(aPointer:aPointer+double(ninds-1-a),1) = ones(ninds-a,1,'uint16')*a; + pariTaulu(aPointer:aPointer+double(ninds-1-a),2) = uint16((a+1:ninds)'); + aPointer = aPointer+double(ninds-a); +end + +eka = pariTaulu(:,ones(1,rowsFromInd)); +eka = eka * rowsFromInd; +miinus = repmat(rowsFromInd-1 : -1 : 0, [riviLkm 1]); +eka = eka - miinus; + +toka = pariTaulu(:,ones(1,rowsFromInd)*2); +toka = toka * rowsFromInd; +toka = toka - miinus; + +eka = uint16(eka); +toka = uint16(toka); + +clear pariTaulu; clear miinus; + +summa = uint16(zeros(riviLkm,1)); +vertailuja = uint16(zeros(riviLkm,1)); + +x = zeros(size(eka)); x = uint16(x); +y = zeros(size(toka)); y = uint16(y); +% fprintf(1,'%%10'); +for j=1:nloci; + + for k=1:rowsFromInd + x(:,k) = data(eka(:,k),j); + y(:,k) = data(toka(:,k),j); + end + + for a=1:rowsFromInd + for b=1:rowsFromInd + vertailutNyt = uint16(x(:,a)>0 & y(:,b)>0); + vertailuja = vertailuja + vertailutNyt; + lisays = (x(:,a)~=y(:,b) & vertailutNyt); + summa = summa + uint16(lisays); + end + end + % fprintf(1,'\b\b'); + % fprintf(1,'%d',floor(10+80*j/nloci)); +end + +clear x; clear y; clear vertailutNyt; +dist = zeros(length(vertailuja),1); +% nollat = find(vertailuja==0); +% dist(nollat) = 1; +dist(logical(vertailuja==0)) = 1; +muut = find(vertailuja>0); +dist(muut) = double(summa(muut))./double(vertailuja(muut)); +clear summa; clear vertailuja; + +Z = computeLinkage(dist'); +% fprintf(1,'\b\b'); +% fprintf(1,'%d\n',100); +%-------------------------------------------------------------------------- + +function clearGlobalVars + +global CQ_COUNTS; CQ_COUNTS = []; +global CQ_SUMCOUNTS; CQ_SUMCOUNTS = []; +global SP_COUNTS; SP_COUNTS = []; +global SP_SUMCOUNTS; SP_SUMCOUNTS = []; +global PARTITION; PARTITION = []; +global POP_LOGML; POP_LOGML = []; + +%-------------------------------------------------------------------------- + +function npops = removeEmptyPops +% Removes empty pops from all global COUNTS variables. +% Updates PARTITION and npops + +global CQ_COUNTS; +global CQ_SUMCOUNTS; +global SP_COUNTS; +global SP_SUMCOUNTS; +global PARTITION; + +notEmpty = find(any(CQ_SUMCOUNTS,2)); +CQ_COUNTS = CQ_COUNTS(:,:,notEmpty); +CQ_SUMCOUNTS = CQ_SUMCOUNTS(notEmpty,:); +SP_COUNTS = SP_COUNTS(:,:,notEmpty); +SP_SUMCOUNTS = SP_SUMCOUNTS(notEmpty,:); + +for n=1:length(notEmpty) +% apu = find(PARTITION==notEmpty(n)); +% PARTITION(apu)=n; +PARTITION(logical(PARTITION==notEmpty(n))) = n; +end +npops = length(notEmpty); + +%-------------------------------------------------------------------------- + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex) +% Tiedetään, ett?annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ss?ei viel?ole +% annettua logml arvoa, niin lisätään worstIndex:in kohtaan uusi logml ja +% nykyist?partitiota vastaava nclusters:in arvo. Muutoin ei tehd?mitään. +global PARTITION; +apu = isempty(find(abs(partitionSummary(:,2)-logml)<1e-5,1)); +if apu + % Nyt löydetty partitio ei ole viel?kirjattuna summaryyn. + npops = length(unique(PARTITION)); + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end + +%-------------------------------------------------------------------------- + +function [counts, sumcounts] = initialCounts(ind_counts) + +global PARTITION; + +pops = unique(PARTITION); +npops = max(pops); + +counts = zeros(size(ind_counts,1), size(ind_counts,2), npops,'uint16'); +sumcounts = zeros(npops, size(ind_counts,2),'uint16'); + +for i = 1:npops + inds = find(PARTITION == i); + counts(:,:,i) = sum(ind_counts(:,:,inds), 3); + sumcounts(i,:) = sum(counts(:,:,i),1); +end + +%-------------------------------------------------------------------------- + +function logml = computeLogml(adjprior_cq, adjprior_sp) + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; + +cq_counts = double(CQ_COUNTS); +cq_sumcounts = double(CQ_SUMCOUNTS); +sp_counts = double(SP_COUNTS); +sp_sumcounts = double(SP_SUMCOUNTS); + +npops = size(CQ_COUNTS, 3); + +cq_logml = sum(sum(sum(gammaln(cq_counts+repmat(adjprior_cq,[1 1 npops]))))) ... + - npops*sum(sum(gammaln(adjprior_cq))) - ... + sum(sum(gammaln(1+cq_sumcounts))); + +sp_logml = sum(sum(sum(gammaln(sp_counts+repmat(adjprior_sp,[1 1 npops]))))) ... + - npops*sum(sum(gammaln(adjprior_sp))) - ... + sum(sum(gammaln(1+sp_sumcounts))); + +logml = cq_logml - sp_logml; +clear cq_counts cq_sumcounts sp_counts sp_sumcounts; + +%-------------------------------------------------------------------------- + +function popLogml = computePopulationLogml(pops, adjprior_cq, adjprior_sp) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + +global CQ_COUNTS; global CQ_SUMCOUNTS; +global SP_COUNTS; global SP_SUMCOUNTS; + +cq_counts = double(CQ_COUNTS); +cq_sumcounts = double(CQ_SUMCOUNTS); +sp_counts = double(SP_COUNTS); +sp_sumcounts = double(SP_SUMCOUNTS); + +nall_cq = size(CQ_COUNTS,1); +nall_sp = size(SP_COUNTS, 1); +ncliq = size(CQ_COUNTS,2); +nsep = size(SP_COUNTS, 2); + +z = length(pops); + +popLogml_cq = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior_cq,[1 1 z]) + cq_counts(:,:,pops)) ... + ,[nall_cq ncliq z]),1),2)) - sum(gammaln(1+cq_sumcounts(pops,:)),2) - ... + sum(sum(gammaln(adjprior_cq))); + +popLogml_sp = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior_sp,[1 1 z]) + sp_counts(:,:,pops)) ... + ,[nall_sp nsep z]),1),2)) - sum(gammaln(1+sp_sumcounts(pops,:)),2) - ... + sum(sum(gammaln(adjprior_sp))); + +popLogml = popLogml_cq - popLogml_sp; +clear cq_counts cq_sumcounts sp_counts sp_sumcounts; + +%------------------------------------------------------------------- + + +function changesInLogml = writeMixtureInfo(logml, counts_cq, counts_sp, adjprior_cq, ... + adjprior_sp, outPutFile, inputFile, partitionSummary, popnames, linkage_model, ... + fixedK) + +global PARTITION; +global CQ_COUNTS; +global LOGDIFF; + +%global CQ_SUMCOUNTS; +%global SP_COUNTS; global SP_SUMCOUNTS; +ninds = length(PARTITION); +npops = size(CQ_COUNTS,3); +names = (size(popnames,1) == ninds); %Tarkistetaan ett?nimet viittaavat yksilöihin + +if length(outPutFile)>0 + fid = fopen(outPutFile,'w'); +else + fid = -1; + diary('baps4_output.baps'); % save in text anyway. +end + +dispLine; +disp('RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:'); +disp(['Data file/ Linkage map: ' inputFile]); +disp(['Model: ' linkage_model]); +disp(['Number of clustered individuals: ' ownNum2Str(ninds)]); +disp(['Number of groups in optimal partition: ' ownNum2Str(npops)]); +disp(['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); +disp(' '); +if (fid ~= -1) + fprintf(fid,'%s \n', ['RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:']); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Data file: ' inputFile]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of clustered individuals: ' ownNum2Str(ninds)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of groups in optimal partition: ' ownNum2Str(npops)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); fprintf(fid,'\n'); +end + +cluster_count = length(unique(PARTITION)); +disp('Best Partition: '); +if (fid ~= -1) + fprintf(fid,'%s \n','Best Partition: '); fprintf(fid,'\n'); +end +for m=1:cluster_count + indsInM = find(PARTITION==m); + length_of_beginning = 11 + floor(log10(m)); + cluster_size = length(indsInM); + + if names + text = ['Cluster ' num2str(m) ': {' char(popnames{indsInM(1)})]; + for k = 2:cluster_size + text = [text ', ' char(popnames{indsInM(k)})]; + end; + else + text = ['Cluster ' num2str(m) ': {' num2str(indsInM(1))]; + for k = 2:cluster_size + text = [text ', ' num2str(indsInM(k))]; + end; + end + text = [text '}']; + while length(text)>58 + %Take one line and display it. + new_line = takeLine(text,58); + text = text(length(new_line)+1:end); + disp(new_line); + if (fid ~= -1) + fprintf(fid,'%s \n',new_line); + fprintf(fid,'\n'); + end + if length(text)>0 + text = [blanks(length_of_beginning) text]; + else + text = []; + end; + end; + if ~isempty(text) + disp(text); + if (fid ~= -1) + fprintf(fid,'%s \n',text); + fprintf(fid,'\n'); + end + end; +end + +if npops == 1 + changesInLogml = []; +else + disp(' '); + disp(' '); + disp('Changes in log(marginal likelihood) if indvidual i is moved to group j:'); + if (fid ~= -1) + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', 'Changes in log(marginal likelihood) if indvidual i is moved to group j:'); fprintf(fid, '\n'); + end + + if names + nameSizes = zeros(ninds,1); + for i = 1:ninds + nimi = char(popnames{i}); + nameSizes(i) = length(nimi); + end + maxSize = max(nameSizes); + maxSize = max(maxSize, 5); + erotus = maxSize - 5; + alku = blanks(erotus); + ekarivi = [alku ' ind' blanks(6+erotus)]; + else + ekarivi = ' ind '; + end + + for i = 1:cluster_count + ekarivi = [ekarivi ownNum2Str(i) blanks(8-floor(log10(i)))]; + end + disp(ekarivi); + if (fid ~= -1) + fprintf(fid, '%s \n', ekarivi); fprintf(fid, '\n'); + end + + %ninds = size(data,1)/rowsFromInd; + changesInLogml = LOGDIFF'; + for ind = 1:ninds + indCqCounts = uint16(counts_cq(:,:,ind)); + indSpCounts = uint16(counts_sp(:,:,ind)); + %changesInLogml(:,ind) = computeChanges(ind, adjprior_cq, ... + % adjprior_sp, indCqCounts, indSpCounts); + if names + nimi = char(popnames{ind}); + rivi = [blanks(maxSize - length(nimi)) nimi ':']; + else + rivi = [blanks(4-floor(log10(ind))) ownNum2Str(ind) ':']; + end + for j = 1:npops + rivi = [rivi ' ' logml2String(omaRound(changesInLogml(j,ind)))]; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', rivi); fprintf(fid, '\n'); + end + end + + + % % KL-divergence has to be calculated otherwise... + % % { + % disp(' '); disp(' '); + % disp('KL-divergence matrix:'); + % + % if (fid ~= -1) + % fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + % fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + % fprintf(fid, '%s \n', ['KL-divergence matrix:']); fprintf(fid, '\n'); + % end + % + % maxnoalle = size(COUNTS,1); + % nloci = size(COUNTS,2); + % d = zeros(maxnoalle, nloci, npops); + % prior = adjprior; + % prior(find(prior==1))=0; + % nollia = find(all(prior==0)); %Lokukset, joissa oli havaittu vain yht?alleelia. + % prior(1,nollia)=1; + % for pop1 = 1:npops + % d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))+prior) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))+prior),maxnoalle,1); + % dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1); + % end + % ekarivi = blanks(7); + % for pop = 1:npops + % ekarivi = [ekarivi num2str(pop) blanks(7-floor(log10(pop)))]; + % end + % disp(ekarivi); + % if (fid ~= -1) + % fprintf(fid, '%s \n', [ekarivi]); fprintf(fid, '\n'); + % end + % + % for pop1 = 1:npops + % rivi = [blanks(2-floor(log10(pop1))) num2str(pop1) ' ']; + % for pop2 = 1:pop1-1 + % dist1 = d(:,:,pop1); dist2 = d(:,:,pop2); + % div12 = sum(sum(dist1.*log2((dist1+10^-10) ./ (dist2+10^-10))))/nloci; + % div21 = sum(sum(dist2.*log2((dist2+10^-10) ./ (dist1+10^-10))))/nloci; + % div = (div12+div21)/2; + % rivi = [rivi kldiv2str(div) ' ']; + % end + % disp(rivi); + % if (fid ~= -1) + % fprintf(fid, '%s \n', [rivi]); fprintf(fid, '\n'); + % end + % end + % % } +end + +disp(' '); +disp(' '); +disp('List of sizes of 10 best visited partitions and corresponding log(ml) values'); + +if (fid ~= -1) + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', 'List of sizes of 10 best visited partitions and corresponding log(ml) values'); fprintf(fid, '\n'); +end + +partitionSummary = sortrows(partitionSummary,2); +partitionSummary = partitionSummary(size(partitionSummary,1):-1:1 , :); +% partitionSummary = partitionSummary(find(partitionSummary(:,2)>-1e49),:); +partitionSummary = partitionSummary(logical(partitionSummary(:,2)>-1e49),:); +if size(partitionSummary,1)>10 + vikaPartitio = 10; +else + vikaPartitio = size(partitionSummary,1); +end +for part = 1:vikaPartitio + line = [num2str(partitionSummary(part,1)) ' ' num2str(partitionSummary(part,2))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', line); fprintf(fid, '\n'); + end +end + +if ~fixedK + + disp(' '); + disp(' '); + disp('Probabilities for number of clusters'); + + if (fid ~= -1) + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', 'Probabilities for number of clusters'); fprintf(fid, '\n'); + end + + npopsTaulu = unique(partitionSummary(:,1)); + len = length(npopsTaulu); + probs = zeros(len,1); + partitionSummary(:,2) = partitionSummary(:,2)-max(partitionSummary(:,2)); + sumtn = sum(exp(partitionSummary(:,2))); + for i=1:len + % npopstn = sum(exp(partitionSummary(find(partitionSummary(:,1)==npopsTaulu(i)),2))); + npopstn = sum(exp(partitionSummary(logical(partitionSummary(:,1)==npopsTaulu(i)),2))); + probs(i) = npopstn / sumtn; + end + for i=1:len + if probs(i)>1e-5 + line = [num2str(npopsTaulu(i)) ' ' num2str(probs(i))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', line); fprintf(fid, '\n'); + end + end + end +end + + +if (fid ~= -1) + fclose(fid); +else + diary off +end + +%-------------------------------------------------------------- +function newline = takeLine(description,width) +%Returns one line from the description: line ends to the first +%space after width:th mark. +% newLine = description(1:width); +n = width+1; +while ~isspace(description(n)) && n=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); + +%------------------------------------------------------------------------- + +function [newData, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = ... + handleData(raw_data) +% Alkuperäisen datan viimeinen sarake kertoo, milt?yksilölt? +% kyseinen rivi on peräisin. Funktio tutkii ensin, ett?montako +% rivi?maksimissaan on peräisin yhdelt?yksilölt? jolloin saadaan +% tietää onko kyseess?haploidi, diploidi jne... Tämän jälkeen funktio +% lisää tyhji?rivej?niille yksilöille, joilta on peräisin vähemmän +% rivej?kuin maksimimäär? +% Mikäli jonkin alleelin koodi on =0, funktio muuttaa tämän alleelin +% koodi pienimmäksi koodiksi, joka isompi kuin mikään käytöss?oleva koodi. +% Tämän jälkeen funktio muuttaa alleelikoodit siten, ett?yhden lokuksen j +% koodit saavat arvoja välill?1,...,noalle(j). + +data = raw_data; +nloci=size(raw_data,2)-1; + +dataApu = data(:,1:nloci); +nollat = find(dataApu==0); +if ~isempty(nollat) + isoinAlleeli = max(max(dataApu)); + dataApu(nollat) = isoinAlleeli+1; + data(:,1:nloci) = dataApu; +end +% dataApu = []; +% nollat = []; +% isoinAlleeli = []; + +noalle=zeros(1,nloci); +alleelitLokuksessa = cell(nloci,1); +for i=1:nloci + alleelitLokuksessaI = unique(data(:,i)); + %alleelitLokuksessa{i,1} = alleelitLokuksessaI(find(alleelitLokuksessaI>=0)); + alleelitLokuksessa{i,1} = alleelitLokuksessaI(logical(alleelitLokuksessaI>=0)); + noalle(i) = length(alleelitLokuksessa{i,1}); +end +alleleCodes = zeros(max(noalle),nloci); +for i=1:nloci + alleelitLokuksessaI = alleelitLokuksessa{i,1}; + puuttuvia = max(noalle)-length(alleelitLokuksessaI); + alleleCodes(:,i) = [alleelitLokuksessaI; zeros(puuttuvia,1)]; +end + +for loc = 1:nloci + for all = 1:noalle(loc) + % data(find(data(:,loc)==alleleCodes(all,loc)), loc)=all; + data(logical(data(:,loc)==alleleCodes(all,loc)), loc)=all; + end; +end; + +nind = max(data(:,end)); +nrows = size(data,1); +ncols = size(data,2); +rowsFromInd = zeros(nind,1); +for i=1:nind + rowsFromInd(i) = length(find(data(:,end)==i)); +end +maxRowsFromInd = max(rowsFromInd); +a = -999; +emptyRow = repmat(a, 1, ncols); +lessThanMax = find(rowsFromInd < maxRowsFromInd); +missingRows = maxRowsFromInd*nind - nrows; +data = [data; zeros(missingRows, ncols)]; +pointer = 1; +for ind=lessThanMax' %Käy läpi ne yksilöt, joilta puuttuu rivej? + miss = maxRowsFromInd-rowsFromInd(ind); % Tält?yksilölt?puuttuvien lkm. + for j=1:miss + rowToBeAdded = emptyRow; + rowToBeAdded(end) = ind; + data(nrows+pointer, :) = rowToBeAdded; + pointer = pointer+1; + end +end +data = sortrows(data, ncols); % Sorttaa yksilöiden mukaisesti +newData = data; +rowsFromInd = maxRowsFromInd; + +adjprior = zeros(max(noalle),nloci); +priorTerm = 0; +for j=1:nloci + adjprior(:,j) = [repmat(1/noalle(j), [noalle(j),1]) ; ones(max(noalle)-noalle(j),1)]; + priorTerm = priorTerm + noalle(j)*gammaln(1/noalle(j)); +end + +%-------------------------------------------------------------------------- + +function [emptyPop, pops] = findEmptyPop(npops) +% Palauttaa ensimmäisen tyhjän populaation indeksin. Jos tyhji? +% populaatioita ei ole, palauttaa -1:n. + +global PARTITION; +pops = unique(PARTITION)'; +if (length(pops) ==npops) + emptyPop = -1; +else + popDiff = diff([0 pops npops+1]); + emptyPop = min(find(popDiff > 1)); +end + +%-------------------------------------------------------------------------- + +function popnames = fixPopnames(popnames, ninds) + +if length(popnames) == ninds + for i=1:ninds + if isnumeric(popnames{i}) + popnames{i} = num2str(popnames{i}); + % popnames(i) = num2str(popnames{i}); + end + popnames{i} = cellstr(popnames{i}); + % popnames(i) = cellstr(popnames{i}); + end +end + +%-------------------------------------------------------------------------- +function isRational = isTheLoadedNewDataRational(data) +% The last column of the data must include numbers 1-npops +% If so, isRational = 1, otherwise isRational = 0. +% The row numbers must be larger than 1. +if size(data,1) == 1 + isRational = 0; + display('*** ERROR: Sample size must be larger than one'); + return; +end +last_column = data(:,end); +last_column = sort(last_column); +current = 1; +if last_column(1) ~= current + isRational = 0; + display('*** ERROR: Wrong Indexes in the data'); + return; +end; +lengthcol = length(last_column); +for n = 2:lengthcol + if ~(last_column(n) == current || last_column(n) == current + 1) + %Some population is missing from the last column + isRational = 0; + display('*** ERROR: Missing indexes in the data'); + return; + end; + current = last_column(n); +end; +isRational = 1; + + +% %------------------------------------------------------------------------- +% function isRational = isTheLoadedNewLinkageRational(linkage_data) +% % Each positive element must be unique. +% % If so, isRational = 1, otherwise isRational = 0; +% nonzero = find(linkage_data~=0); +% dif = diff(linkage_data(nonzero)); +% if ~all(dif) +% isRational = 0; return; +% end; +% isRational = 1; + +%-------------------------------------------------------------------------- + +function [sumcounts, counts] = ... + indLociCounts(partition, data, npops, noalle) + +nloci=size(data,2)-1; +% ninds = size(data,1); + +counts = zeros(max(noalle),nloci,npops); +sumcounts = zeros(npops,nloci); +for i=1:npops + for j=1:nloci + % havainnotLokuksessa = find(partition==i & data(:,j)>=0); + havainnotLokuksessa = find(ismember(data(:,end),find(partition==i))); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(havainnotLokuksessa,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + +%----------------------------------------------------------------------------------- + + +function popnames = initPopNames(nameFile, indexFile) +%Palauttaa tyhjän, mikäli nimitiedosto ja indeksitiedosto +% eivät olleet yht?pitki? + +popnames = []; +indices = load(indexFile); + +fid = fopen(nameFile); +if fid == -1 + %File didn't exist + msgbox('Loading of the population names was unsuccessful', ... + 'Error', 'error'); + return; +end; +line = fgetl(fid); +counter = 1; +while (line ~= -1) && ~isempty(line) + names{counter} = line; + line = fgetl(fid); + counter = counter + 1; +end; +fclose(fid); + +if length(names) ~= length(indices) + disp('The number of population names must be equal to the number of '); + disp('entries in the file specifying indices of the first individuals of '); + disp('each population.'); + return; +end + +popnames = cell(length(names), 2); +for i = 1:length(names) + popnames{i,1} = names(i); + popnames{i,2} = indices(i); +end + + diff --git a/matlab/parallel/parallel.m b/matlab/parallel/parallel.m new file mode 100644 index 0000000..a3379db --- /dev/null +++ b/matlab/parallel/parallel.m @@ -0,0 +1,297 @@ +function parallel(varargin) +% PARALLEL the main function of doing parallel classification/admixture. +% input: order of input is arbitrary. The first option is the default. +% 'datafile' - the full path of the data. +% 'mixturetype', ['mix';'codon_mix';'linear_mix';'spatial';'ad_mix']; +% - the classification/admixture model. +% 'initialk', a row vector of positive integers; +% - the initial number of clusters. +% 'fixedk',['no';'yes']; +% - whether the number of clusters is fixed during the computation. +% 'outputmat' - the full path of the output .mat file. +% 'datatype', ['numeric';'sequence';'matlab';'excel';'genepop'] +% - the data format; +% 'namefile' - the full path of the population name file. +% 'indexfile' - the full path of the index file. +% 'linkagemap' - the full path of the linkage map, needed only for the +% unpreprocessed data under the linkage model. +% 'coordinatefile' - needed with the spatial model. +% 'groups', ['no';'yes'] - clustering of groups instead of individuals. +% 'groupname' - the full path of the group name file. + +% Examples: +% - Linkage model: +% parallel('datafile','e:\baps4\baps_source\data\bpseudomallei.xls',... +% 'mixturetype','codon_mix','initialk','[10 15]','fixedk','no',... +% 'outputmat','E:\test_link.mat','datatype','excel') +% - Independent model: +% parallel('datafile','e:\baps4\baps_source\data\baps_data.txt',... +% 'mixturetype','mix','initialk','[10:15]','fixedk','no',... +% 'outputmat','e:\test_ind.mat','datatype','numeric'); +% - Spatial model: +% parallel('datafile','e:\baps4\baps_source\data\wolverines_spatial_preprocessed.mat',... +% 'mixturetype','spatial','initialk', '[10 11]', 'fixedk','no',... +% 'outputmat','e:\test_spatial.mat','datatype','matlab'); +% +% +% - Admixture model: +% parallel('datafile','e:\baps5\data\data1_mixture.mat', ... +% 'mixturetype','ad_mix', ... +% 'clusters','[1 3 5]',... +% 'iters','2',... +% 'refinds','3',... +% 'refiters','4',... +% 'outputmat','e:\baps5\data\data1_admixture_parallel.mat'); + +% A group of result files can be later compared by using compare.m +% function. +%------------------------------------------------------------------------------- +%- Set up options and default parameters +%------------------------------------------------------------------------------- +msgInvalidPair = '***ERROR: Bad value for argument: ''%s'''; + +% default options +options = struct('dataFile', '',... + 'dataType', 'numeric',... + 'mixtureType', 'mix',... + 'initialK', 1, ... + 'fixedK', 'no', ... + 'nameFile', '', ... + 'indexFile', '', ... + 'outputMat', '', ... + 'linkageMap','', ... + 'coordinateFile', '', ... + 'groups','no', ... + 'groupname', '', ... + 'clusters', '', ... + 'minSize', '', ... + 'iters', '', ... + 'refInds', '', ... + 'refIters', '' ... + ); + +if nargin == 1 && isstruct(varargin{1}) + paramlist = [ fieldnames(varargin{1}) ... + struct2cell(varargin{1}) ]'; + paramlist = { paramlist{:} }; +else + if mod(nargin,2) + error('Invalid parameter/value pair arguments.'); + end + paramlist = varargin; +end + +optionsnames = lower(fieldnames(options)); +for i=1:2:length(paramlist) + pname = paramlist{i}; + pvalue = paramlist{i+1}; + ind = strmatch(lower(pname),optionsnames); + if isempty(ind) + error(['Invalid parameter: ''' pname '''.']); + elseif length(ind) > 1 + error(['Ambiguous parameter: ''' pname '''.']); + end + switch(optionsnames{ind}) + case 'datafile' + if ischar(pvalue) + options.dataFile = pvalue; + else + error(sprintf(msgInvalidPair,pname)); + end +% if ~isempty(findstr(pvalue , '.txt')) +% options.dataType = 'text'; +% elseif ~isempty(findstr(pvalue, '.mat')) +% options.dataType = 'matlab'; +% elseif ~isempty(findstr(pvalue, '.xls')) +% options.dataType = 'excel'; +% else +% error('*** ERROR: unrecognized data format'); +% end + case 'mixturetype' + if ischar(pvalue) + if ~strmatch(pvalue, strvcat('mix','ad_mix','linear_mix','codon_mix','spatical'),'exact') + error('*** ERROR: unrecoganized model type'); + end + if isempty(pvalue), + options.mixtureType = '.'; + else + options.mixtureType = pvalue; + end + else + error(sprintf(msgInvalidPair,pname)); + end + case 'initialk' + pvalue = str2num(pvalue); + if isnumeric(pvalue) + if isempty(pvalue), + options.initialK = 0; + else + options.initialK = pvalue; + end + else + error(sprintf(msgInvalidPair,pname)); + end + case 'fixedk' + if ischar(pvalue) + if isempty(pvalue), + options.fixedK = 'no'; + else + options.fixedK = pvalue; + end + else + error(sprintf(msgInvalidPair,pname)); + end + case 'namefile' + if ischar(pvalue) + options.nameFile = pvalue; + else + error(sprintf(msgInvalidPair,pname)); + end + case 'indexfile' + if ischar(pvalue) + options.indexFile = pvalue; + else + error(sprintf(msgInvalidPair,pname)); + end + case 'outputmat' + if ischar(pvalue) + options.outputMat = pvalue; + directoryName = fileparts(pvalue); + if ~exist(directoryName) + fprintf(1,'*** ERROR: Output directory ''%s'' does not exist.\n', directoryName); + return + end + else + error(sprintf(msgInvalidPair,pname)); + end + case 'datatype' + if ischar(pvalue) + options.dataType = pvalue; + else + error(sprintf(msgInvalidPair,pname)); + end + case 'linkagemap' + if ischar(pvalue) + options.linkageMap = pvalue; + else + error(sprintf(msgInvalidPair,pname)); + end + case 'coordinatefile' + if ischar(pvalue) + options.coordinateFile = pvalue; + else + error(sprintf(msgInvalidPair,pname)); + end + case 'groups' + if ischar(pvalue) + options.groups = pvalue; + else + error(sprintf(msgInvalidPair,pname)); + end + case 'groupname' + if ischar(pvalue) + options.groupname = pvalue; + else + error(sprintf(msgInvalidPair,pname)); + end + + % the options below are for admixture analysis + case 'clusters' + if ischar(pvalue) + options.clusters = str2num(pvalue); + else + error(sprintf(msgInvalidPair,pname)); + end + case 'minsize' + if ischar(pvalue) + options.minSize = str2num(pvalue); + else + error(sprintf(msgInvalidPair,pname)); + end + case 'iters' + if ischar(pvalue) + options.iters = str2num(pvalue); + else + error(sprintf(msgInvalidPair,pname)); + end + case 'refinds' + if ischar(pvalue) + options.refInds = str2num(pvalue); + else + error(sprintf(msgInvalidPair,pname)); + end + case 'refiters' + if ischar(pvalue) + options.refIters = str2num(pvalue); + else + error(sprintf(msgInvalidPair,pname)); + end + otherwise + error(['Invalid parameter: ''' pname '''.']); + end +end + +% The subfunction to check syntax +if ~checkSyntax(options) + return +end + +switch options.mixtureType + case 'mix' + if isequal(options.groups,'yes') + greedyPopMix_parallel(options); + else + independent_parallel(options); + end + case 'linear_mix' + linkage_parallel(options); + case 'codon_mix' + linkage_parallel(options); + case 'spatial' + if isequal(options.groups, 'yes') + spatialPopMixture_parallel(options); + else + spatial_parallel(options); + end + case 'ad_mix' + admix_parallel(options); +end + +% ------------------------------------------------------------------------- +% Subfunctions +% ------------------------------------------------------------------------- +function isOK = checkSyntax(options) +isOK = 1; +if strcmp(options.fixedK, 'yes') && length(options.initialK)>1 + display('*** ERROR: conflicting in options fixedk and initialk.'); + isOK = 0; +end + +if strcmp(options.mixtureType, 'mix') + if strcmp(options.dataType, 'excel') || strcmp(options.dataType,'sequence') + display('*** ERROR: unknown datatype for the independence module.'); + isOK = 0; + end +end + +% check the admixture parameters +admix_str = {options.clusters, options.minSize, options.iters, ... + options.refInds, options.refIters}; +pt = cellfun('isempty', admix_str); + +if all(pt) + if ~strcmp(options.mixtureType, 'ad_mix') + isOK = 1; + else + display('*** ERROR: problematic mixture type.'); + isOK = 0; + end +end + +if any(pt) && strcmp(options.mixtureType, 'ad_mix') + display('*** ERROR: incomplete admixture parameters.'); + isOK = 0; +else + isOK = 1; +end + diff --git a/matlab/parallel/proportion2str.m b/matlab/parallel/proportion2str.m new file mode 100644 index 0000000..9ff5fbf --- /dev/null +++ b/matlab/parallel/proportion2str.m @@ -0,0 +1,18 @@ +function str = proportion2str(prob) +%prob belongs to [0.00, 0.01, ... ,1]. +%str is a 4-mark presentation of proportion. + +if abs(prob)<1e-3 + str = '0.00'; +elseif abs(prob-1) < 1e-3; + str = '1.00'; +else + prob = round(100*prob); + if prob<10 + str = ['0.0' num2str(prob)]; + else + str = ['0.' num2str(prob)]; + end; +end; + +%------------------------------------------------------- \ No newline at end of file diff --git a/matlab/parallel/readScript.m b/matlab/parallel/readScript.m new file mode 100644 index 0000000..ab56333 --- /dev/null +++ b/matlab/parallel/readScript.m @@ -0,0 +1,66 @@ +function readScript(filename) +% READSCRIPT read the script file and output the parameters +% this function does not perform syntax checking. +% Example: +% readScript('script.txt') + +% read the script +ind = readfile(filename); +if isempty(ind) + return +end +nLines = size(ind,1); + +% extract command information +optionStr = []; +for k = 1:nLines + [cmdName, paraStr] = extract(ind(k,:)); + optionStr = [optionStr cmdName ',' paraStr ',']; +end +optionStr = optionStr(1:end-1); % remove the last coma + +% call function parallel +eval(['parallel(' optionStr ')']) + +% ------------------------------------------------------------------------- +% Subfunctions +% ------------------------------------------------------------------------- +function [cmdName, paraStr] = extract(commandline) +% function to extract the command name and the parameter string + +[cmdName, remainStr] = strtok(commandline,'('); +boundary = regexp(remainStr,''''); + +if isempty(boundary) % if paraStr does not contain quotation marks + % use parenthesis as boundaries + startPt = regexp(remainStr,'(') + 1; + endPt = regexp(remainStr,')') - 1; +else + startPt = boundary(1) + 1; + endPt = boundary(2) - 1; +end +paraStr = remainStr(startPt: endPt); + +cmdName = strcat('''',cmdName,''''); +paraStr = strcat('''',paraStr,''''); + +% ------------------------------------------------------------------------- +function T = readfile(filename); +f = fopen(filename,'r'); +if f == -1 + % error(filename); + display('*** ERROR: invalid script name.'); + T = []; + return +end +i = 1; +while 1 + clear line; + line = fgetl(f); + if ~isstr(line), break, end + n = length(line); + T(i,1:n) = line(1:n); + i = i+1; +end +fclose(f); + diff --git a/matlab/parallel/sc.txt b/matlab/parallel/sc.txt new file mode 100644 index 0000000..02886a1 --- /dev/null +++ b/matlab/parallel/sc.txt @@ -0,0 +1,6 @@ +datafile('r:\baps5\data\simple_data.txt'); +mixturetype('mix') +outputmat('r:\baps5\data\simple_mix.mat') +initialk(2 3) +fixedk('no') +datatype('numeric') diff --git a/matlab/parallel/sc2.txt b/matlab/parallel/sc2.txt new file mode 100644 index 0000000..af3cf7e --- /dev/null +++ b/matlab/parallel/sc2.txt @@ -0,0 +1,8 @@ +datafile('e:\baps5\data\ssuis_mixture_results_codon.mat') +mixturetype('ad_mix') +outputmat('e:\baps5\data\ssuis_small_script_2.mat') +clusters(3:4) +minsize(5) +iters(2) +refinds(3) +refiters(4) \ No newline at end of file diff --git a/matlab/parallel/sort_partition.m b/matlab/parallel/sort_partition.m new file mode 100644 index 0000000..03d7019 --- /dev/null +++ b/matlab/parallel/sort_partition.m @@ -0,0 +1,17 @@ +function srt_partition = sort_partition(partition) +% SORT_PARTITION sorts a given partition (row vector) into the canonical order, where every +% new class has the smallest possible index. + +% input: +% partition is a column vector. +% output: +% srt_partition is a row vector. + +n_classes=max(partition); +srt_partition=zeros(1,n_classes); +for i=1:n_classes + nonz=find(partition); + here=find(partition==partition(nonz(1))); + srt_partition(here)=i; + partition(here)=0; +end \ No newline at end of file diff --git a/matlab/parallel/spatialPopMixture_parallel.m b/matlab/parallel/spatialPopMixture_parallel.m new file mode 100644 index 0000000..1876e21 --- /dev/null +++ b/matlab/parallel/spatialPopMixture_parallel.m @@ -0,0 +1,2275 @@ +function spatialPopMixture_parallel(options) +% SPATIALPOPMIXTURE_PARALLEL is the command line version of the group partition with +% spaticial models. +% Input: options is a struct generated by parallel.m + + +%-------------------------------------------------------------------------- +%- Syntax check out +%-------------------------------------------------------------------------- +outp = [options.outputMat '.txt']; +inp = [options.dataFile ' & ' options.coordinateFile]; + +if strcmp(options.fixedK, 'yes') + fixedK = 1; +else + fixedK = 0; +end + +switch options.dataType + +case 'numeric' + data = load(options.dataFile); + ninds = testaaOnkoKunnollinenBapsData(data); %TESTAUS + if (ninds==0) + disp('*** ERROR: Incorrect Data-file.'); + return; + end + + coordinates = load(options.coordinateFile); + viallinen = testaaKoordinaatit(ninds, coordinates); + if viallinen + disp('*** ERROR: Incorrect coordinates.'); + return + end + + if ~isempty(options.groupname) + popnames = initPopNames(options.groupname); + if (size(popnames,1)~=ninds) + disp('*** ERROR: Incorrect name-file.'); + popnames = []; + end + else + popnames = []; + end + + disp('Pre-processing the data. This may take several minutes.'); + + [cliques, separators, vorPoints, vorCells, pointers] = ... + handleCoords(coordinates); + [data, rows, alleleCodes, noalle, adjprior, priorTerm] = handleData(data); + [Z,dist] = newGetDistances(data,rows); + + rowsFromInd = 0; % Ei tiedet? + +% save_preproc = questdlg('Do you wish to save pre-processed data?',... +% 'Save pre-processed data?',... +% 'Yes','No','Yes'); +% if isequal(save_preproc,'Yes'); +% waitALittle; +% [filename, pathname] = uiputfile('*.mat','Save pre-processed data as'); +% kokonimi = [pathname filename]; +% c.data = data; c.rows = rows; c.alleleCodes = alleleCodes; +% c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; +% c.dist = dist; c.popnames = popnames; c.Z = Z; +% c.cliques = cliques; c.separators = separators; +% c.vorPoints = vorPoints; c.rowsFromInd = rowsFromInd; +% c.vorCells = vorCells; c.pointers = pointers; +% c.coordinates = coordinates; +% save(kokonimi,'c'); +% clear c; +% end; + +case 'genepop' + kunnossa = testaaGenePopData(options.dataFile); + if kunnossa==0 + return + end + [data,popnames]=lueGenePopData(options.dataFile); + + ninds = max(data(:,end)); + coordinates = load(options.coordinateFile); + viallinen = testaaKoordinaatit(ninds, coordinates); + + if viallinen + disp('*** ERROR: Incorrect coordinates'); + return + end + + disp('Pre-processing the data. This may take several minutes.'); + + [cliques, separators, vorPoints, vorCells, pointers] = ... + handleCoords(coordinates); + [data, rows, alleleCodes, noalle, adjprior, priorTerm] = handleData(data); + [Z,dist] = newGetDistances(data,rows); + + rowsFromInd = 2; %Tiedetään + +% save_preproc = questdlg('Do you wish to save pre-processed data?',... +% 'Save pre-processed data?',... +% 'Yes','No','Yes'); +% if isequal(save_preproc,'Yes'); +% waitALittle; +% [filename, pathname] = uiputfile('*.mat','Save pre-processed data as'); +% kokonimi = [pathname filename]; +% c.data = data; c.rows = rows; c.alleleCodes = alleleCodes; +% c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; +% c.dist = dist; c.popnames = popnames; c.Z = Z; +% c.cliques = cliques; c.separators = separators; +% c.vorPoints = vorPoints; c.rowsFromInd = rowsFromInd; +% c.vorCells = vorCells; c.pointers = pointers; +% c.coordinates = coordinates; +% save(kokonimi,'c'); +% clear c; +% end; + +case 'matlab' + struct_array = load(options.dataFile); + if isfield(struct_array,'c') %Matlab versio + c = struct_array.c; + if ~isfield(c,'dist') + disp('*** ERROR: Incorrect file format'); + return + end + elseif isfield(struct_array,'dist') %Mideva versio + c = struct_array; + else + disp('*** ERROR: Incorrect file format'); + return; + end + data = double(c.data); rows = c.rows; alleleCodes = c.alleleCodes; + noalle = c.noalle; adjprior = c.adjprior; priorTerm = c.priorTerm; + dist = c.dist; popnames = c.popnames; Z = c.Z; rowsFromInd = c.rowsFromInd; + + if isfield(c, 'cliques') + cliques = c.cliques; separators = c.separators; + vorPoints = c.vorPoints; vorCells = c.vorCells; + pointers = c.pointers; coordinates = c.coordinates; + clear c; + else + ninds = max(data(:,end)); + coordinates = load(options.coordinateFile); + viallinen = testaaKoordinaatit(ninds, coordinates); + + if viallinen + disp('*** ERROR: Incorrect coordinates'); + return + end + + disp('Pre-processing the data. This may take several minutes.'); + [cliques, separators, vorPoints, vorCells, pointers] = ... + handleCoords(coordinates); +% save_preproc = questdlg('Do you wish to save pre-processed data?',... +% 'Save pre-processed data?',... +% 'Yes','No','Yes'); +% if isequal(save_preproc,'Yes'); +% waitALittle; +% [filename, pathname] = uiputfile('*.mat','Save pre-processed data as'); +% kokonimi = [pathname filename]; +% c.cliques = cliques; c.separators = separators; +% c.vorPoints = vorPoints; c.vorCells = vorCells; +% c.pointers = pointers; c.coordinates = coordinates; +% save(kokonimi,'c'); +% clear c; +% end; + end +end + +global PARTITION; global COUNTS; +global SUMCOUNTS; %global POP_LOGML; +global SEPCOUNTS; global CLIQCOUNTS; +clearGlobalVars; +npopstext = []; +npopstextExtra = options.initialK; +if length(npopstextExtra)>=255 + npopstextExtra = npopstextExtra(1:255); + npopstext = [npopstext ' ' npopstextExtra]; + teksti = 'The input field length limit (255 characters) was reached. Input more values: '; +else + if max(npopstextExtra) > size(data,1) + error('Initial K larger than the sample size are not accepted. '); + else + npopstext = [npopstext ' ' num2str(npopstextExtra)]; + end +end +clear teksti; + +if isempty(npopstext) || length(npopstext)==1 + return +else + npopsTaulu = str2num(npopstext); + ykkoset = find(npopsTaulu==1); + npopsTaulu(ykkoset) = []; % Mikäli ykkösi?annettu ylärajaksi, ne poistetaan. + if isempty(npopsTaulu) + return + end + clear ykkoset; +end + +if fixedK + % Only the first value of npopsTaulu is used + npops = npopsTaulu(1); + nruns = length(npopsTaulu); + [logml, npops, partitionSummary]=spatialMix_fixK(c,npops,nruns); +else + [logml, npops, partitionSummary]=spatialMix(c,npopsTaulu); +end + +if logml==1 + return +end + +data = noIndex(data,noalle); + +h0 = findobj('Tag','filename1_text'); inp = get(h0,'String'); +h0 = findobj('Tag','filename2_text'); +outp = get(h0,'String'); +[varmuus,changesInLogml] = writeMixtureInfo(logml, rows, data, adjprior, priorTerm, ... + outp,inp,partitionSummary, popnames, cliques, separators, fixedK); + +%checkLogml(priorTerm, adjprior, cliques, separators); + + +viewPopMixPartition(PARTITION, rows, popnames); + +if isequal(popnames, []) + names = pointers; +else + %Etsitään voronoi-soluja vastaavat nimet. + names = cell(size(pointers)); + indices = 1:length(popnames); + for i = 1:length(pointers) + inds = pointers{i}; + namesInCell = []; + for j = 1:length(inds) + ind = inds(j); + I = find(indices > ind); + if isempty(I) + nameIndex = indices(end); + else + nameIndex = min(I) -1; + end + name = popnames{nameIndex}; + namesInCell = [namesInCell name]; + end + names{i} = namesInCell; + end +end +vorPlot(vorPoints, vorCells, PARTITION, pointers, coordinates, names); + + +if exist('baps4_output.baps','file') + copyfile('baps4_output.baps',[pathname filename '.txt']) + delete('baps4_output.baps') +end + + +if rowsFromInd==0 + %Käytettiin BAPS-formaattia, eik?rowsFromInd ole tunnettu. + [popnames, rowsFromInd] = findOutRowsFromInd(popnames, rows); +end + +groupPartition = PARTITION; + +fiksaaPartitioYksiloTasolle(rows, rowsFromInd); + +c.PARTITION = PARTITION; c.COUNTS = COUNTS; c.SUMCOUNTS = SUMCOUNTS; +c.alleleCodes = alleleCodes; c.adjprior = adjprior; c.popnames = popnames; +c.rowsFromInd = rowsFromInd; c.data = data; c.npops = npops; +c.noalle = noalle; c.groupPartition = groupPartition; +c.pointers = pointers; c.vorPoints = vorPoints; c.vorCells = vorCells; +c.coordinates = coordinates; c.names = names; c.varmuus = varmuus; +c.rows = rows; c.mixtureType = 'spatialPop'; c.changesInLogml = changesInLogml; +fprintf(1,'Saving the result...') +try +% save(options.outputMat, 'c'); + save(options.outputMat, 'c', '-v7.3'); % added by Lu Cheng, 08.06.2012 + fprintf(1,'Finished.\n'); +catch + display('*** ERROR in saving the result.'); +end + + +% ------------------------------------------------------------------------- +% - Subfunctions +% ------------------------------------------------------------------------- + + +%-------------------------------------------------------------------------- +%-------------------------------------------------------------------------- + +function clearGlobalVars + +global COUNTS; COUNTS = []; +global SUMCOUNTS; SUMCOUNTS = []; +global PARTITION; PARTITION = []; +%global POP_LOGML; POP_LOGML = []; +global SEPCOUNTS; SEPCOUNTS = []; +global CLIQCOUNTS; CLIQCOUNTS = []; + +%------------------------------------------------------------------------------------- + + +function rows = computeRows(rowsFromInd, inds, ninds) +% On annettu yksilöt inds. Funktio palauttaa vektorin, joka +% sisältää niiden rivien numerot, jotka sisältävät yksilöiden +% dataa. + +rows = inds(:, ones(1,rowsFromInd)); +rows = rows*rowsFromInd; +miinus = repmat(rowsFromInd-1 : -1 : 0, [ninds 1]); +rows = rows - miinus; +rows = reshape(rows', [1,rowsFromInd*ninds]); + + +%-------------------------------------------------------------------------- + + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex) +% Tiedetään, ett?annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ss?ei viel?ole +% annettua logml arvoa, niin lisätään worstIndex:in kohtaan uusi logml ja +% nykyist?partitiota vastaava nclusters:in arvo. Muutoin ei tehd?mitään. + +apu = find(abs(partitionSummary(:,2)-logml)<1e-5); +if isempty(apu) + % Nyt löydetty partitio ei ole viel?kirjattuna summaryyn. + global PARTITION; + npops = length(unique(PARTITION)); + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end + + +%-------------------------------------------------------------------------- + + +function [suurin, i2] = arvoSeuraavaTila(muutokset, logml) +% Suorittaa yksilön seuraavan tilan arvonnan + +y = logml + muutokset; % siirron jälkeiset logml:t +y = y - max(y); +y = exp(y); +summa = sum(y); +y = y/summa; +y = cumsum(y); + +i2 = rand_disc(y); % uusi kori +suurin = muutokset(i2); + + +%-------------------------------------------------------------------------------------- + + +function svar=rand_disc(CDF) +%returns an index of a value from a discrete distribution using inversion method +slump=rand; +har=find(CDF>slump); +svar=har(1); + + +%------------------------------------------------------------------------------------- + + +function updateGlobalVariables(ind, i2, diffInCounts, ... + cliques, separators, adjprior, priorTerm) +% Suorittaa globaalien muuttujien muutokset, kun yksil?ind +% siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global CLIQCOUNTS; +global SEPCOUNTS; + +i1 = PARTITION(ind); +PARTITION(ind)=i2; + +diffInCliqCounts = computeDiffInCliqCounts(cliques, ind); +diffInSepCounts = computeDiffInCliqCounts(separators, ind); + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; +CLIQCOUNTS(:,i2) = CLIQCOUNTS(:,i2) + diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; +SEPCOUNTS(:,i2) = SEPCOUNTS(:,i2) + diffInSepCounts; + +%POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + + +%--------------------------------------------------------------------------------- + + +function updateGlobalVariables2(i1, i2, diffInCounts, ... + cliques, separators, adjprior, priorTerm); +% Suorittaa globaalien muuttujien muutokset, kun kaikki +% korissa i1 olevat yksilöt siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +%global POP_LOGML; +global CLIQCOUNTS; +global SEPCOUNTS; + +inds = find(PARTITION==i1); +PARTITION(inds) = i2; + +diffInCliqCounts = CLIQCOUNTS(:,i1); +diffInSepCounts = SEPCOUNTS(:,i1); + + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +CLIQCOUNTS(:,i1) = 0; +CLIQCOUNTS(:,i2) = CLIQCOUNTS(:,i2) + diffInCliqCounts; +SEPCOUNTS(:,i1) = 0; +SEPCOUNTS(:,i2) = SEPCOUNTS(:,i2) + diffInSepCounts; + + +%------------------------------------------------------------------------------------ + + +function updateGlobalVariables3(muuttuvat, diffInCounts, ... + adjprior, priorTerm, i2, cliques, separators); +% Suorittaa globaalien muuttujien päivitykset, kun yksilöt 'muuttuvat' +% siirretään koriin i2. Ennen siirtoa yksilöiden on kuuluttava samaan +% koriin. + +global PARTITION; +global COUNTS; global CLIQCOUNTS; +global SUMCOUNTS; global SEPCOUNTS; +%global POP_LOGML; + +i1 = PARTITION(muuttuvat(1)); +PARTITION(muuttuvat) = i2; + +diffInCliqCounts = computeDiffInCliqCounts(cliques, muuttuvat); +diffInSepCounts = computeDiffInCliqCounts(separators, muuttuvat); + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; +CLIQCOUNTS(:,i2) = CLIQCOUNTS(:,i2) + diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; +SEPCOUNTS(:,i2) = SEPCOUNTS(:,i2) + diffInSepCounts; + +%POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + + +%---------------------------------------------------------------------- + + +function inds = returnInOrder(inds, pop, globalRows, data, ... + adjprior, priorTerm) +% Palauttaa yksilöt järjestyksess?siten, ett?ensimmäisen?on +% se, jonka poistaminen populaatiosta pop nostaisi logml:n +% arvoa eniten. + +global COUNTS; global SUMCOUNTS; +ninds = length(inds); +apuTaulu = [inds, zeros(ninds,1)]; + +for i=1:ninds + ind =inds(i); + rows = globalRows(i,1):globalRows(i,2); + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,pop) = COUNTS(:,:,pop)-diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)-diffInSumCounts; + apuTaulu(i, 2) = computePopulationLogml(pop, adjprior, priorTerm); + COUNTS(:,:,pop) = COUNTS(:,:,pop)+diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)+diffInSumCounts; +end +apuTaulu = sortrows(apuTaulu,2); +inds = apuTaulu(ninds:-1:1,1); + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = laskeMuutokset(ind, globalRows, ... + data, adjprior, priorTerm, logml, cliques, separators) +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli yksilöt inds siirretään koriin i. +% diffInCounts on poistettava COUNTS:in siivusta i1 ja lisättäv? +% COUNTS:in siivuun i2, mikäli muutos toteutetaan. +% Huom! Laskee muutoksen vain yhdelle tyhjälle populaatiolle, muiille +% tyhjille tulee muutokseksi 0. + +global COUNTS; global SUMCOUNTS; +global PARTITION; %global POP_LOGML; +global CLIQCOUNTS; global SEPCOUNTS; + +npops = size(COUNTS,3); +muutokset = zeros(npops,1); + +counts = COUNTS; +sumcounts = SUMCOUNTS; + +[emptyPop, pops] = findEmptyPop(npops); + +i1 = PARTITION(ind); + +i2 = [pops(find(pops~=i1))]; +if emptyPop > 0 + i2 =[i2 emptyPop]; +end + +i2 = sort(i2); + +rows = globalRows(ind,1):globalRows(ind,2); +diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); + +diffInCliqCounts = computeDiffInCliqCounts(cliques, ind); +diffInSepCounts = computeDiffInCliqCounts(separators, ind); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; + +for i=i2 + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) + diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) + diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) + diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) + diffInSumCounts; + + muutokset(i) = computeLogml(adjprior, priorTerm) - logml; + + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) - diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) - diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) - diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) - diffInSumCounts; +end + +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) + diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) + diffInSepCounts; + +% Asetetaan muillekin tyhjille populaatioille sama muutos, kuin +% emptyPop:lle + +if emptyPop > 0 + empties = mysetdiff((1:npops), [i2 i1]); + muutokset(empties) = muutokset(emptyPop); +end + +COUNTS = counts; +SUMCOUNTS = sumcounts; + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = laskeMuutokset2(i1, globalRows, ... + data, adjprior, priorTerm, logml, cliques, separators); +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli korin i1 kaikki yksilöt siirretään +% koriin i. +% Laskee muutokset vain yhdelle tyhjälle populaatiolle, muille tulee +% muutokseksi 0. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +global CLIQCOUNTS; global SEPCOUNTS; + +npops = size(COUNTS,3); +muutokset = zeros(npops,1); + +[emptyPop, pops] = findEmptyPop(npops); + +i2 = [pops(find(pops~=i1))]; +if emptyPop > 0 + i2 =[i2 emptyPop]; +end + +inds = find(PARTITION == i1); +ninds = length(inds); + +rows = []; +for i = 1:ninds + rows = [rows globalRows(inds(i),1):globalRows(inds(i),2)]; +end +diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); +diffInCliqCounts = computeDiffInCliqCounts(cliques, inds); +diffInSepCounts = computeDiffInCliqCounts(separators, inds); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +CLIQCOUNTS(:,i1) = 0; +SEPCOUNTS(:,i1) = 0; + +for i=i2 + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) + diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) + diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) + diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) + diffInSumCounts; + + muutokset(i) = computeLogml(adjprior, priorTerm) - logml; + + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) - diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) - diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) - diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) - diffInSumCounts; +end + +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; +CLIQCOUNTS(:,i1) = diffInCliqCounts; +SEPCOUNTS(:,i1) = diffInSepCounts; + + + +%------------------------------------------------------------------------------------ + +function muutokset = laskeMuutokset3(T2, inds2, globalRows, ... + data, adjprior, priorTerm, i1, logml, cliques, separators) +% Palauttaa length(unique(T2))*npops taulun, jossa (i,j):s alkio +% kertoo, mik?olisi muutos logml:ss? jos populaation i1 osapopulaatio +% inds2(find(T2==i)) siirretään koriin j. +% Laskee vain yhden tyhjän populaation, muita kohden muutokseksi jää 0. + + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +global CLIQCOUNTS; global SEPCOUNTS; + +npops = size(COUNTS,3); +npops2 = length(unique(T2)); +muutokset = zeros(npops2, npops); + +for pop2 = 1:npops2 + inds = inds2(find(T2==pop2)); + ninds = length(inds); + if ninds>0 + rows = []; + for i = 1:ninds + ind = inds(i); + rows = [rows; (globalRows(ind,1):globalRows(ind,2))']; + end + diffInCounts = computeDiffInCounts(rows', size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + diffInCliqCounts = computeDiffInCliqCounts(cliques, inds); + diffInSepCounts = computeDiffInCliqCounts(separators, inds); + + COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; + CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; + SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; + + [emptyPop, pops] = findEmptyPop(npops); + i2 = [pops(find(pops~=i1))]; + if emptyPop > 0 + i2 =[i2 emptyPop]; + end + + for i = i2 + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) + diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) + diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) + diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) + diffInSumCounts; + + muutokset(pop2,i) = computeLogml(adjprior, priorTerm) - logml; + + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) - diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) - diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) - diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) - diffInSumCounts; + end + + COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) + diffInCliqCounts; + SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) + diffInSepCounts; + end +end + +%-------------------------------------------------------------------------- +function muutokset = laskeMuutokset5(inds, globalRows, data, ... + adjprior, priorTerm, logml, cliques, separators, i1, i2) + +% Palauttaa length(inds)*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli yksil?i vaihtaisi koria i1:n ja i2:n välill? + +global COUNTS; global SUMCOUNTS; +global PARTITION; +global CLIQCOUNTS; global SEPCOUNTS; + +ninds = length(inds); +muutokset = zeros(ninds,1); + +for i = 1:ninds + ind = inds(i); + + rows = globalRows(ind,1):globalRows(ind,2); + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + if PARTITION(ind)==i1 + pop1 = i1; %mist? + pop2 = i2; %mihin + else + pop1 = i2; + pop2 = i1; + end + + diffInCliqCounts = computeDiffInCliqCounts(cliques, ind); + diffInSepCounts = computeDiffInCliqCounts(separators, ind); + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)-diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)-diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)+diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)+diffInSumCounts; + + CLIQCOUNTS(:,pop1) = CLIQCOUNTS(:,pop1) - diffInCliqCounts; + CLIQCOUNTS(:,pop2) = CLIQCOUNTS(:,pop2) + diffInCliqCounts; + SEPCOUNTS(:,pop1) = SEPCOUNTS(:,pop1) - diffInSepCounts; + SEPCOUNTS(:,pop2) = SEPCOUNTS(:,pop2) + diffInSepCounts; + + muutokset(i) = computeLogml(adjprior, priorTerm) - logml; + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)+diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)+diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)-diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)-diffInSumCounts; + + CLIQCOUNTS(:,pop1) = CLIQCOUNTS(:,pop1) + diffInCliqCounts; + CLIQCOUNTS(:,pop2) = CLIQCOUNTS(:,pop2) - diffInCliqCounts; + SEPCOUNTS(:,pop1) = SEPCOUNTS(:,pop1) + diffInSepCounts; + SEPCOUNTS(:,pop2) = SEPCOUNTS(:,pop2) - diffInSepCounts; +end + +%-------------------------------------------------------------------------- + +function diffInCounts = computeDiffInCounts(rows, max_noalle, nloci, data) +% Muodostaa max_noalle*nloci taulukon, jossa on niiden alleelien +% lukumäärät (vastaavasti kuin COUNTS:issa), jotka ovat data:n +% riveill?rows. + +diffInCounts = zeros(max_noalle, nloci); +for i=rows + row = data(i,:); + notEmpty = find(row>=0); + + if length(notEmpty)>0 + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) = ... + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) + 1; + end +end + + + +%------------------------------------------------------------------------------------ + + +function popLogml = computePopulationLogml(pops, adjprior, priorTerm) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + +global COUNTS; +global SUMCOUNTS; +x = size(COUNTS,1); +y = size(COUNTS,2); +z = length(pops); + +popLogml = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior,[1 1 length(pops)]) + COUNTS(:,:,pops)) ... + ,[x y z]),1),2)) - sum(gammaln(1+SUMCOUNTS(pops,:)),2) - priorTerm; + +%------------------------------------------------------------------------------------ + +function npops = poistaTyhjatPopulaatiot(npops) +% Poistaa tyhjentyneet populaatiot COUNTS:ista ja +% SUMCOUNTS:ista. Päivittää npops:in ja PARTITION:in. + +global COUNTS; +global SUMCOUNTS; +global PARTITION; +global CLIQCOUNTS; +global SEPCOUNTS; + +notEmpty = find(any(SUMCOUNTS,2)); +COUNTS = COUNTS(:,:,notEmpty); +SUMCOUNTS = SUMCOUNTS(notEmpty,:); +CLIQCOUNTS = CLIQCOUNTS(:,notEmpty); +SEPCOUNTS = SEPCOUNTS(:,notEmpty); + +for n=1:length(notEmpty) + apu = find(PARTITION==notEmpty(n)); + PARTITION(apu)=n; +end +npops = length(notEmpty); + + +%---------------------------------------------------------------------------------- +%Seuraavat kolme funktiota liittyvat alkupartition muodostamiseen. + +function initial_partition=admixture_initialization(data_matrix,nclusters,Z) +size_data=size(data_matrix); +nloci=size_data(2)-1; +n=max(data_matrix(:,end)); +T=cluster_own(Z,nclusters); +initial_partition=zeros(size_data(1),1); +for i=1:n + kori=T(i); + here=find(data_matrix(:,end)==i); + for j=1:length(here) + initial_partition(here(j),1)=kori; + end +end + +function T = cluster_own(Z,nclust) +true=logical(1); +false=logical(0); +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); + % maximum number of clusters based on inconsistency + if m <= maxclust + T = (1:m)'; + elseif maxclust==1 + T = ones(m,1); + else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end + end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + + +%--------------------------------------------------------------------------------------- + + +function [newData, rows, alleleCodes, noalle, adjprior, priorTerm] = ... + handleData(raw_data) +% Alkuperäisen datan viimeinen sarake kertoo, milt?yksilölt? +% kyseinen rivi on peräisin. Funktio tutkii ensin, ett?montako +% rivi?maksimissaan on peräisin yhdelt?yksilölt? jolloin saadaan +% tietää onko kyseess?haploidi, diploidi jne... Tämän jälkeen funktio +% lisää tyhji?rivej?niille yksilöille, joilta on peräisin vähemmän +% rivej?kuin maksimimäär? +% Mikäli jonkin alleelin koodi on =0, funktio muuttaa tämän alleelin +% koodi pienimmäksi koodiksi, joka isompi kuin mikään käytöss?oleva koodi. +% Tämän jälkeen funktio muuttaa alleelikoodit siten, ett?yhden lokuksen j +% koodit saavat arvoja välill?1,...,noalle(j). +% +% Muutettu vastaamaan greedyPopMixin handlePopDataa. + +data = raw_data; +nloci=size(raw_data,2)-1; + +dataApu = data(:,1:nloci); +nollat = find(dataApu==0); +if ~isempty(nollat) + isoinAlleeli = max(max(dataApu)); + dataApu(nollat) = isoinAlleeli+1; + data(:,1:nloci) = dataApu; +end +dataApu = []; nollat = []; isoinAlleeli = []; + +noalle=zeros(1,nloci); +alleelitLokuksessa = cell(nloci,1); +for i=1:nloci + alleelitLokuksessaI = unique(data(:,i)); + alleelitLokuksessa{i,1} = alleelitLokuksessaI(find(alleelitLokuksessaI>=0)); + noalle(i) = length(alleelitLokuksessa{i,1}); +end +alleleCodes = zeros(max(noalle),nloci); +for i=1:nloci + alleelitLokuksessaI = alleelitLokuksessa{i,1}; + puuttuvia = max(noalle)-length(alleelitLokuksessaI); + alleleCodes(:,i) = [alleelitLokuksessaI; zeros(puuttuvia,1)]; +end + +for loc = 1:nloci + for all = 1:noalle(loc) + data(find(data(:,loc)==alleleCodes(all,loc)), loc)=all; + end; +end; + +nind = max(data(:,end)); +%rows = cell(nind,1); +rows = zeros(nind,2); +for i=1:nind + rivit = find(data(:,end)==i)'; + rows(i,1) = min(rivit); + rows(i,2) = max(rivit); +end +newData = data; + +adjprior = zeros(max(noalle),nloci); +priorTerm = 0; +for j=1:nloci + adjprior(:,j) = [repmat(1/noalle(j), [noalle(j),1]) ; ones(max(noalle)-noalle(j),1)]; + priorTerm = priorTerm + noalle(j)*gammaln(1/noalle(j)); +end + + +%---------------------------------------------------------------------------------------- + +function [Z, dist] = newGetDistances(data, initRows) + +ninds = size(initRows,1); +nloci = size(data,2)-1; +riviLkm = nchoosek(ninds,2); + +empties = find(data<0); +data(empties)=0; +data = uint8(data); % max(noalle) oltava <256 + +pariTaulu = zeros(riviLkm,2); +aPointer=1; +for a=1:ninds-1 + pariTaulu(aPointer:aPointer+ninds-1-a,1) = ones(ninds-a,1)*a; + pariTaulu(aPointer:aPointer+ninds-1-a,2) = (a+1:ninds)'; + aPointer = aPointer+ninds-a; +end + +%eka = pariTaulu(:,ones(1,rowsFromInd)); +%eka = eka * rowsFromInd; +%miinus = repmat(rowsFromInd-1 : -1 : 0, [riviLkm 1]); +%eka = eka - miinus; + +koot = initRows(:,2) - initRows(:,1); +maxSize = max(koot) + 1; + +rows = zeros(ninds, maxSize); + +for i=1:ninds + apu = initRows(i,1):initRows(i,2); + rows(i, 1:length(apu)) = apu; +end +eka = zeros(riviLkm, maxSize); +toka = zeros(riviLkm, maxSize); + +for i = 1:riviLkm + eka(i, :) = rows(pariTaulu(i, 1), :); + toka(i, :) = rows(pariTaulu(i,2), :); +end + +%eka = uint16(eka); +%toka = uint16(toka); + +summa = zeros(riviLkm,1); +vertailuja = zeros(riviLkm,1); + +clear pariTaulu; clear miinus; + +x = zeros(size(eka)); x = uint8(x); +y = zeros(size(toka)); y = uint8(y); + +for j=1:nloci; + + for k=1:maxSize + I = find(eka(:,k)>0); + x(I,k) = data(eka(I,k),j); + I = find(toka(:,k)>0); + y(I,k) = data(toka(I,k),j); + end + + for a=1:maxSize + for b=1:maxSize + vertailutNyt = double(x(:,a)>0 & y(:,b)>0); + vertailuja = vertailuja + vertailutNyt; + lisays = (x(:,a)~=y(:,b) & vertailutNyt); + summa = summa+double(lisays); + end + end +end + +clear x; clear y; clear vertailutNyt; +nollat = find(vertailuja==0); +dist = zeros(length(vertailuja),1); +dist(nollat) = 1; +muut = find(vertailuja>0); +dist(muut) = summa(muut)./vertailuja(muut); +clear summa; clear vertailuja; + +Z = linkage(dist'); + +%---------------------------------------------------------------------------------------- + + +function [Z, distances]=getDistances(data_matrix,nclusters) + +%finds initial admixture clustering solution with nclusters clusters, uses simple mean Hamming distance +%gives partition in 8-bit format +%allocates all alleles of a single individual into the same basket +%data_matrix contains #Loci+1 columns, last column indicate whose alleles are placed in each row, +%i.e. ranges from 1 to #individuals. For diploids there are 2 rows per individual, for haploids only a single row +%missing values are indicated by zeros in the partition and by negative integers in the data_matrix. + +size_data=size(data_matrix); +nloci=size_data(2)-1; +n=max(data_matrix(:,end)); +distances=zeros(nchoosek(n,2),1); +pointer=1; +for i=1:n-1 + i_data=data_matrix(find(data_matrix(:,end)==i),1:nloci); + for j=i+1:n + d_ij=0; + j_data=data_matrix(find(data_matrix(:,end)==j),1:nloci); + vertailuja = 0; + for k=1:size(i_data,1) + for l=1:size(j_data,1) + here_i=find(i_data(k,:)>=0); + here_j=find(j_data(l,:)>=0); + here_joint=intersect(here_i,here_j); + vertailuja = vertailuja + length(here_joint); + d_ij = d_ij + length(find(i_data(k,here_joint)~=j_data(l,here_joint))); + end + end + d_ij = d_ij / vertailuja; + distances(pointer)=d_ij; + pointer=pointer+1; + end +end + +Z=linkage(distances'); + + + +%---------------------------------------------------------------------------------------- + + +function Z = linkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 | m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end + + +%----------------------------------------------------------------------------------- + + +function popnames = initPopNames(nameFile) + +fid = fopen(nameFile); +if fid == -1 + %File didn't exist + msgbox('Loading of the population names was unsuccessful', ... + 'Error', 'error'); + return; +end; +line = fgetl(fid); +counter = 1; +while (line ~= -1) & ~isempty(line) + names{counter} = line; + line = fgetl(fid); + counter = counter + 1; +end; +fclose(fid); + +popnames = cell(length(names), 2); +for i = 1:length(names) + popnames{i,1} = names(i); + popnames{i,2} = 0; +end + + +%----------------------------------------------------------------------------------- +% Laskee arvot cliqcounts:lle ja sepcounts:lle + +function [cliqcounts, sepcounts] = computeCounts(cliques, separators, npops) + +global PARTITION; +ncliq = size(cliques,1); +nsep = size(separators,1); + +cliqPartition = zeros(ncliq, size(cliques,2)); +sepPartition = zeros(nsep, size(separators, 2)); + +apuCliq = find(cliques > 0); +apuSep = find(separators > 0); + +cliqPartition(apuCliq) = PARTITION(cliques(apuCliq)); +sepPartition(apuSep) = PARTITION(separators(apuSep)); + + +cliqcounts = zeros(ncliq, npops); +for i = 1:npops + cliqcounts(:,i) = sum(cliqPartition == i, 2); +end + + +sepcounts = zeros(nsep, npops); +for i = 1:npops + sepcounts(:,i) = sum(sepPartition == i, 2); +end + +%------------------------------------------------------------------------- + +function diffInCliqCounts = computeDiffInCliqCounts(cliques, inds) +% Laskee muutoksen CLIQCOUNTS:ssa (tai SEPCOUNTS:ssa, jos syötteen? +% separators) kun yksilöt inds siirretään. +% diffInCliqcounts on ncliq*1 taulu, joka on CLIQCOUNTS:n sarakkeesta josta +% yksilöt inds siirretään ja lisättäv?sarakkeeseen, johon yksilöt +% siirretään. + +ncliq = size(cliques,1); +diffInCliqCounts = zeros(ncliq,1); +ninds = length(inds); +for i = 1:ninds + ind = inds(i); + rivit = sum((cliques == ind),2); + diffInCliqCounts = diffInCliqCounts + rivit; +end + +%----------------------------------------------------------------------- + +function [logml, spatialPrior] = computeLogml(adjprior,priorTerm) + +%global GAMMA_LN; +global CLIQCOUNTS; +global SEPCOUNTS; +global PARTITION; + +notEmpty = any(CLIQCOUNTS); +npops = length(find(notEmpty == 1)); +sumcliq=sum(CLIQCOUNTS, 2); +sumsep=sum(SEPCOUNTS, 2); +ncliq = size(CLIQCOUNTS, 1); +nsep = size(SEPCOUNTS, 1); + +cliqsizes = sum(CLIQCOUNTS, 2)'; +sepsizes = sum(SEPCOUNTS, 2)'; +cliqsizes = min([cliqsizes; npops*ones(1,ncliq)])'; +sepsizes = min([sepsizes; npops*ones(1,nsep)])'; + +klikkitn = sum(sum(gammaln(CLIQCOUNTS(:,notEmpty) + repmat(1./cliqsizes, [1 npops])))) ... + - sum(npops*(gammaln(1./cliqsizes))) ... + - sum(gammaln(sumcliq + 1)); + +septn = sum(sum(gammaln(SEPCOUNTS(:,notEmpty) + repmat(1./sepsizes, [1 npops])))) ... + - sum(npops*(gammaln(1./sepsizes))) ... + - sum(gammaln(sumsep + 1)); + + +%klikkitn = sum(sum(gammaln(CLIQCOUNTS + 1/npops))) ... +% - ncliq*npops*(gammaln(1/npops)) ... +% - sum(gammaln(sumcliq + 1)); +%septn = sum(sum(gammaln(SEPCOUNTS + 1/npops))) ... +% - nsep*npops*(gammaln(1/npops)) ... +% - sum(gammaln(sumsep + 1)); + +spatialPrior = (klikkitn - septn); + +%if spatialPrior > 0 +% keyboard +%end + + +global COUNTS; +global SUMCOUNTS; +x = size(COUNTS,1); +y = size(COUNTS,2); +z = size(COUNTS,3); + +popLogml = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior,[1 1 z]) + COUNTS) ... + ,[x y z]),1),2)) - sum(gammaln(1+SUMCOUNTS),2) - priorTerm; + +logml = sum(popLogml) + spatialPrior; + +%-------------------------------------------------------------------------- + + +function initializeGammaln(ninds, rowsFromInd, maxSize) +%Alustaa GAMMALN muuttujan s.e. GAMMALN(i,j)=gammaln((i-1) + 1/j) +global GAMMA_LN; +GAMMA_LN = zeros((1+ninds)*rowsFromInd, maxSize); +for i=1:(ninds+1)*rowsFromInd + for j=1:maxSize + GAMMA_LN(i,j)=gammaln((i-1) + 1/j); + end +end + + +%---------------------------------------------------------------------------- + + +function dist2 = laskeOsaDist(inds2, dist, ninds) +% Muodostaa dist vektorista osavektorin, joka sisältää yksilöiden inds2 +% väliset etäisyydet. ninds=kaikkien yksilöiden lukumäär? + +ninds2 = length(inds2); +apu = zeros(nchoosek(ninds2,2),2); +rivi = 1; +for i=1:ninds2-1 + for j=i+1:ninds2 + apu(rivi, 1) = inds2(i); + apu(rivi, 2) = inds2(j); + rivi = rivi+1; + end +end +apu = (apu(:,1)-1).*ninds - apu(:,1) ./ 2 .* (apu(:,1)-1) + (apu(:,2)-apu(:,1)); +dist2 = dist(apu); + + +%---------------------------------------------------------------------------- + + + +function kunnossa = testaaGenePopData(tiedostonNimi) +% kunnossa == 0, jos data ei ole kelvollinen genePop data. +% Muussa tapauksessa kunnossa == 1. + +kunnossa = 0; +fid = fopen(tiedostonNimi); +line1 = fgetl(fid); %ensimmäinen rivi +line2 = fgetl(fid); %toinen rivi +line3 = fgetl(fid); %kolmas + +if (isequal(line1,-1) | isequal(line2,-1) | isequal(line3,-1)) + disp('Incorrect file format 1168'); fclose(fid); + return +end +if (testaaPop(line1)==1 | testaaPop(line2)==1) + disp('Incorrect file format 1172'); fclose(fid); + return +end +if testaaPop(line3)==1 + %2 rivi tällöin lokusrivi + nloci = rivinSisaltamienMjonojenLkm(line2); + line4 = fgetl(fid); + if isequal(line4,-1) + disp('Incorrect file format 1180'); fclose(fid); + return + end + if ~any(line4==',') + % Rivin nelj?täytyy sisältää pilkku. + disp('Incorrect file format 1185'); fclose(fid); + return + end + pointer = 1; + while ~isequal(line4(pointer),',') %Tiedetään, ett?pysähtyy + pointer = pointer+1; + end + line4 = line4(pointer+1:end); %pilkun jälkeinen osa + nloci2 = rivinSisaltamienMjonojenLkm(line4); + if (nloci2~=nloci) + disp('Incorrect file format 1195'); fclose(fid); + return + end +else + line = fgetl(fid); + lineNumb = 4; + while (testaaPop(line)~=1 & ~isequal(line,-1)) + line = fgetl(fid); + lineNumb = lineNumb+1; + end + if isequal(line,-1) + disp('Incorrect file format 1206'); fclose(fid); + return + end + nloci = lineNumb-2; + line4 = fgetl(fid); %Eka rivi pop sanan jälkeen + if isequal(line4,-1) + disp('Incorrect file format 1212'); fclose(fid); + return + end + if ~any(line4==',') + % Rivin täytyy sisältää pilkku. + disp('Incorrect file format 1217'); fclose(fid); + return + end + pointer = 1; + while ~isequal(line4(pointer),',') %Tiedetään, ett?pysähtyy. + pointer = pointer+1; + end + + line4 = line4(pointer+1:end); %pilkun jälkeinen osa + nloci2 = rivinSisaltamienMjonojenLkm(line4); + if (nloci2~=nloci) + disp('Incorrect file format 1228'); fclose(fid); + return + end +end +kunnossa = 1; +fclose(fid); + +%------------------------------------------------------ + + +function [data, popnames] = lueGenePopData(tiedostonNimi) + +fid = fopen(tiedostonNimi); +line = fgetl(fid); %ensimmäinen rivi +line = fgetl(fid); %toinen rivi +count = rivinSisaltamienMjonojenLkm(line); + +line = fgetl(fid); +lokusRiveja = 1; +while (testaaPop(line)==0) + lokusRiveja = lokusRiveja+1; + line = fgetl(fid); +end + +if lokusRiveja>1 + nloci = lokusRiveja; +else + nloci = count; +end + +popnames = cell(10,2); +data = zeros(100, nloci+1); +nimienLkm=0; +ninds=0; +poimiNimi=1; +digitFormat = -1; +while line ~= -1 + line = fgetl(fid); + + if poimiNimi==1 + %Edellinen rivi oli 'pop' + nimienLkm = nimienLkm+1; + ninds = ninds+1; + if nimienLkm>size(popnames,1); + popnames = [popnames; cell(10,2)]; + end + nimi = lueNimi(line); + if digitFormat == -1 + digitFormat = selvitaDigitFormat(line); + divider = 10^digitFormat; + end + popnames{nimienLkm, 1} = {nimi}; %Näin se on greedyMix:issäkin?!? + popnames{nimienLkm, 2} = ninds; + poimiNimi=0; + + data = addAlleles(data, ninds, line, divider); + + elseif testaaPop(line) + poimiNimi = 1; + + elseif line ~= -1 + ninds = ninds+1; + data = addAlleles(data, ninds, line, divider); + end +end + +data = data(1:ninds*2,:); +popnames = popnames(1:nimienLkm,:); +fclose(fid); + +npops = size(popnames,1); +ind = 1; +for pop = 1:npops + if pop0 + fid = fopen(outPutFile,'a'); +else + fid = -1; + diary('baps4_output.baps'); % save in text anyway. +end + +dispLine; +disp('RESULTS OF GROUP LEVEL MIXTURE ANALYSIS:'); +disp(['Data file: ' inputFile]); +disp(['Number of clustered groups: ' ownNum2Str(ninds)]); +disp(['Number of clusters in optimal partition: ' ownNum2Str(npops)]); +disp(['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); +disp(' '); +if (fid ~= -1) + fprintf(fid,'%s \n', ['RESULTS OF GROUP LEVEL MIXTURE ANALYSIS:']); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Data file: ' inputFile]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of clustered groups: ' ownNum2Str(ninds)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of clusters in optimal partition: ' ownNum2Str(npops)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); fprintf(fid,'\n'); +end + +cluster_count = length(unique(PARTITION)); +disp(['Best Partition: ']); +if (fid ~= -1) + fprintf(fid,'%s \n',['Best Partition: ']); fprintf(fid,'\n'); +end +for m=1:cluster_count + indsInM = find(PARTITION==m); + length_of_beginning = 11 + floor(log10(m)); + cluster_size = length(indsInM); + + if names + text = ['Cluster ' num2str(m) ': {' char(popnames{indsInM(1)})]; + for k = 2:cluster_size + text = [text ', ' char(popnames{indsInM(k)})]; + end; + else + text = ['Cluster ' num2str(m) ': {' num2str(indsInM(1))]; + for k = 2:cluster_size + text = [text ', ' num2str(indsInM(k))]; + end; + end + text = [text '}']; + while length(text)>58 + %Take one line and display it. + new_line = takeLine(text,58); + text = text(length(new_line)+1:end); + disp(new_line); + if (fid ~= -1) + fprintf(fid,'%s \n',[new_line]); + fprintf(fid,'\n'); + end + if length(text)>0 + text = [blanks(length_of_beginning) text]; + else + text = []; + end; + end; + if ~isempty(text) + disp(text); + if (fid ~= -1) + fprintf(fid,'%s \n',[text]); + fprintf(fid,'\n'); + end + end +end + +disp(' '); +disp(' '); +disp('Changes in log(marginal likelihood) if group i is moved to cluster j:'); +if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['Changes in log(marginal likelihood) if group i is moved to cluster j:']); fprintf(fid, '\n'); +end + +if names + nameSizes = zeros(ninds,1); + for i = 1:ninds + nimi = char(popnames{i}); + nameSizes(i) = length(nimi); + end + maxSize = max(nameSizes); + maxSize = max(maxSize, 5); + erotus = maxSize - 5; + alku = blanks(erotus); + ekarivi = [alku 'group' blanks(6+erotus)]; +else + ekarivi = 'group '; +end + +for i = 1:cluster_count + ekarivi = [ekarivi ownNum2Str(i) blanks(8-floor(log10(i)))]; +end +disp(ekarivi); +if (fid ~= -1) + fprintf(fid, '%s \n', [ekarivi]); fprintf(fid, '\n'); +end + +varmuus = zeros(ninds,1); +changesInLogml = LOGDIFF'; +for ind = 1:ninds + %[muutokset, diffInCounts] = laskeMuutokset(ind, globalRows, data, ... + % adjprior, priorTerm, logml, cliques, separators); + muutokset = changesInLogml(:,ind); + if sum(exp(muutokset))>0 + varmuus(ind) = 1 - 1/sum(exp(muutokset)); + else + varmuus(ind) = 0; + end + if names + nimi = char(popnames{ind}); + rivi = [blanks(maxSize - length(nimi)) nimi ':']; + else + rivi = [blanks(4-floor(log10(ind))) ownNum2Str(ind) ':']; + end + for j = 1:npops + rivi = [rivi ' ' logml2String(omaRound(muutokset(j)))]; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [rivi]); fprintf(fid, '\n'); + end +end + +disp(' '); disp(' '); +disp('KL-divergence matrix:'); +dist_mat = zeros(npops, npops); +if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['KL-divergence matrix in PHYLIP format:']); %fprintf(fid, '\n'); +end + +maxnoalle = size(COUNTS,1); +nloci = size(COUNTS,2); +d = zeros(maxnoalle, nloci, npops); +prior = adjprior; +prior(find(prior==1))=0; +nollia = find(all(prior==0)); %Lokukset, joissa oli havaittu vain yht?alleelia. +prior(1,nollia)=1; +for pop1 = 1:npops + d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))+prior) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))+prior),maxnoalle,1); + %dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1); +end +% ekarivi = blanks(7); +% for pop = 1:npops +% ekarivi = [ekarivi num2str(pop) blanks(7-floor(log10(pop)))]; +% end +ekarivi = num2str(npops); +disp(ekarivi); +if (fid ~= -1) + fprintf(fid, '%s \n', [ekarivi]); %fprintf(fid, '\n'); +end + +for pop1 = 1:npops + rivi = [blanks(2-floor(log10(pop1))) num2str(pop1) ' ']; + for pop2 = 1:pop1-1 + dist1 = d(:,:,pop1); dist2 = d(:,:,pop2); + div12 = sum(sum(dist1.*log2((dist1+10^-10) ./ (dist2+10^-10))))/nloci; + div21 = sum(sum(dist2.*log2((dist2+10^-10) ./ (dist1+10^-10))))/nloci; + div = (div12+div21)/2; + % rivi = [rivi kldiv2str(div) ' ']; + dist_mat(pop1,pop2) = div; + end +% disp(rivi); +% if (fid ~= -1) +% fprintf(fid, '%s \n', [rivi]); fprintf(fid, '\n'); +% end +end + +dist_mat = dist_mat + dist_mat'; % make it symmetric +for pop1 = 1:npops + rivi = ['Cluster_' num2str(pop1) ' ']; + for pop2 = 1:npops + rivi = [rivi kldiv2str(dist_mat(pop1,pop2)) ' ']; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [rivi]); %fprintf(fid, '\n'); + end +end + +disp(' '); +disp(' '); +disp('List of sizes of 10 best visited partitions and corresponding log(ml) values'); + +if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['List of sizes of 10 best visited partitions and corresponding log(ml) values']); fprintf(fid, '\n'); +end + +partitionSummaryKaikki = partitionSummary; +partitionSummary =[]; +for i=1:size(partitionSummaryKaikki,3) + partitionSummary = [partitionSummary; partitionSummaryKaikki(:,:,i)]; +end +[I,J] = find(partitionSummaryKaikki(:,2,:)>-1e49); +partitionSummaryKaikki = partitionSummaryKaikki(I,:,:); +%keyboard + + +partitionSummary = sortrows(partitionSummary,2); +partitionSummary = partitionSummary(size(partitionSummary,1):-1:1 , :); +partitionSummary = partitionSummary(find(partitionSummary(:,2)>-1e49),:); +if size(partitionSummary,1)>10 + vikaPartitio = 10; +else + vikaPartitio = size(partitionSummary,1); +end +for part = 1:vikaPartitio + line = [num2str(partitionSummary(part,1)) ' ' num2str(partitionSummary(part,2))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', [line]); fprintf(fid, '\n'); + end +end + +if ~fixedK + + disp(' '); + disp(' '); + disp('Probabilities for number of clusters'); + + if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['Probabilities for number of clusters']); fprintf(fid, '\n'); + end + + npopsTaulu = unique(partitionSummary(:,1)); + len = length(npopsTaulu); + probs = zeros(len,1); + partitionSummary(:,2) = partitionSummary(:,2)-max(partitionSummary(:,2)); + sumtn = sum(exp(partitionSummary(:,2))); + for i=1:len + npopstn = sum(exp(partitionSummary(find(partitionSummary(:,1)==npopsTaulu(i)),2))); + probs(i) = npopstn / sumtn; + end + for i=1:len + if probs(i)>1e-5 + line = [num2str(npopsTaulu(i)) ' ' num2str(probs(i))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', [line]); fprintf(fid, '\n'); + end + end + end +end + +if (fid ~= -1) + fclose(fid); +else + diary off +end + +%--------------------------------------------------------------- + + +function dispLine; +disp('---------------------------------------------------'); + +%-------------------------------------------------------------------- + + +function newline = takeLine(description,width) +%Returns one line from the description: line ends to the first +%space after width:th mark. +newLine = description(1:width); +n = width+1; +while ~isspace(description(n)) & n=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); + + +function mjono = kldiv2str(div) +mjono = ' '; +if abs(div)<100 + %Ei tarvita e-muotoa + mjono(6) = num2str(rem(floor(div*1000),10)); + mjono(5) = num2str(rem(floor(div*100),10)); + mjono(4) = num2str(rem(floor(div*10),10)); + mjono(3) = '.'; + mjono(2) = num2str(rem(floor(div),10)); + arvo = rem(floor(div/10),10); + if arvo>0 + mjono(1) = num2str(arvo); + end + +else + suurinYks = floor(log10(div)); + mjono(6) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(div),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(div),suurinYks); +end + + +%-------------------------------------------------------------------------- + + +function ninds = testaaOnkoKunnollinenBapsData(data) +%Tarkastaa onko viimeisess?sarakkeessa kaikki +%luvut 1,2,...,n johonkin n:ään asti. +%Tarkastaa lisäksi, ett?on vähintään 2 saraketta. +if size(data,1)<2 + ninds = 0; return; +end +lastCol = data(:,end); +ninds = max(lastCol); +if ~isequal((1:ninds)',unique(lastCol)) + ninds = 0; return; +end + + +%-------------------------------------------------------------------------- + +function [cliques, separators, vorPoints, vorCells, pointers] ... + = handleCoords(coordinates); +%Laskee yksilöiden luonnolliset naapurit koordinaateista. +%Naapurit lasketaan lisäämäll?koordinaatteihin pisteit? +%jotta kutakin yksilöä vastaisi rajoitettu voronoi-solu +%Puuttuvat koordinaatit (negatiiviset) tulevat erakkopisteiksi +% +%Määrittää lisäksi yksilöit?vastaavat voronoi tesselaation solut. +%vorPoints:ssa on solujen kulmapisteet ja vorCells:ss?kunkin solun +%kulmapisteiden indeksit. Pointers{i} sisältää solussa i olevien yksilöiden +%indeksit. + + + +ninds = length(coordinates); +[I,J] = find(coordinates>0 | coordinates <0); %Käsitellään vain yksilöit? joilta koordinaatit +I = unique(I); %olemassa +ncoords = length(I); +puuttuvat = setdiff(1:ninds, I); +new_coordinates = addPoints(coordinates(I,:)); %Ympäröidään yksilöt apupisteill? + + +apuData = [new_coordinates(1:ncoords,:) (1:ncoords)']; +apuData = sortrows(apuData,[1 2]); +erot = [diff(apuData(:,1)) diff(apuData(:,2))]; +empties = find(erot(:,1)==0 & erot(:,2)==0); +samat = cell(length(empties),1); +pointer = 0; + +for i = 1:length(empties) + if i == 1 | empties(i) - empties(i-1) > 1 %Tutkitaan onko eri pisteess?kuin edellinen + pointer = pointer+1; + samat{pointer} = [apuData(empties(i),3) apuData(empties(i)+1,3)]; + else + samat{pointer} = [samat{pointer} apuData(empties(i)+1,3)]; + end +end + +samat = samat(1:pointer); + +erot = []; apuData = []; empties = []; + +tri = delaunay(new_coordinates(:,1), new_coordinates(:,2), {'Qt','Qbb','Qc','Qz'}); %Apupisteiden takia ok. +%[rivi,sarake] = find(tri>ncoords); %Jätetään huomiotta apupisteet +%tri(rivi,:) = []; +pituus = tri(:,1); +pituus = length(pituus); +parit = zeros(6*pituus,2); +for i = 1:pituus %Muodostetaan kolmikoista parit + j = 6*(i-1)+1; + parit(j,:) = tri(i,1:2); + parit(j+1,:) = tri(i,1:2:3); + parit(j+2,:) = tri(i,2:3); + parit(j+3:j+5,:) = [parit(j:j+2,2) parit(j:j+2,1)]; +end +parit = unique(parit,'rows'); +[rivi,sarake] = find(parit>ncoords); %Jätetään huomiotta apupisteet +parit(rivi,:) = []; +parit = I(parit); %Otetaan poistetut takaisin mukaan +graph = sparse(parit(:,1),parit(:,2),1, ninds, ninds); + + +%Kopioidaan samassa pisteess?olevien yksilöiden naapurustot +%silt? jolle ne laitettu. + + for i = 1:length(samat); + taulu = I(samat{i}); + [rivi,sarake] = find(graph(taulu,:)>0); + if length(rivi) > 0 + kopioitava = graph(taulu(rivi(1)),:); + for j = 1:length(taulu); + graph(taulu(j),:) = kopioitava; + graph(:,taulu(j)) = kopioitava'; + end + end + end + + %Asetetaan samassa pisteess?olevat yksilöt toistensa naapureiksi + + for i = 1:length(samat) + for j = I(samat{i}) + for k = I(samat{i}) + if k ~= j + graph(j,k) = 1; + end + end + end + end + +%Laskee maksimin klikkien ja separaattorien koolle +%Määritetään myös klikit ja separaattorit + +[ncliq, nsep, cliq, sep] = laskeKlikit(graph, ninds, ninds); + +sumcliq = sum(ncliq); +sumsep = sum(nsep); +maxCliqSize = max(find(sumcliq > 0)); +maxSepSize = max(find(sumsep > 0)); + +cliques = zeros(length(cliq), maxCliqSize); +separators = zeros(length(sep), maxSepSize); + +nollia = zeros(1, length(cliq)); +for i = 1:length(cliq); + klikki = cliq{i}; + if length(klikki)>1 + cliques(i, 1:length(klikki)) = klikki; + else + nollia(i)=1; + end +end +cliques(find(nollia==1), :) = []; + +for i = 1:length(sep); + klikki = sep{i}; + separators(i, 1:length(klikki)) = klikki; +end + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%Määritetään yksilöit?vastaavat voronoi tesselaation solut + +[vorPoints, vorCells] = voronoin(new_coordinates, {'Qbb', 'Qz'}); + +bounded = ones(length(vorCells),1); +for i=1:length(vorCells) + if isempty(vorCells{i}) || length(find(vorCells{i}==1))>0 + bounded(i)=0; + end +end + + + +vorCells = vorCells(find(bounded == 1)); +pointers = cell(length(vorCells),1); +empties = zeros(1,length(vorCells)); +X = coordinates(:,1); +Y = coordinates(:,2); + +for i=1:length(pointers) + vx = vorPoints(vorCells{i},1); + vy = vorPoints(vorCells{i},2); + IN = inpolygon(X,Y,vx,vy); + if any(IN)==0 + empties(i) = 1; + else + pointers{i} = find(IN ==1)'; + end +end + +vorCells = vorCells(find(empties == 0)); +pointers = pointers(find(empties == 0)); + +%-------------------------------------------------------------------------- + +function [ncliques, nseparators, cliques, separators] = ... + laskeKlikit(M, maxCliqSize,maxSepSize) +%Laskee samankokoisten klikkien määrän verkosta M +%ncliques(i)=kokoa i olevien klikkien määr? +%nseparators vastaavasti + +ncliques=zeros(1,maxCliqSize); +nseparators=zeros(1,maxSepSize); + +if isequal(M,[]) + return; +end + +[cliques,separators]=findCliques(M); + +for i=1:length(cliques) + ncliques(length(cliques{i}))=ncliques(length(cliques{i}))+1; +end + +%cliqmax=max(find(ncliques~=0)); +%ncliques=ncliques(1:cliqmax); + +for i=1:length(separators) + nseparators(length(separators{i}))=nseparators(length(separators{i}))+1; +end + +%sepmax=max(find(nseparators~=0)); +%nseparators=nseparators(1:sepmax); + +%-------------------------------------------------------------------------- + +function C = mysetdiff(A,B) +% MYSETDIFF Set difference of two sets of positive integers (much faster than built-in setdiff) +% C = mysetdiff(A,B) +% C = A \ B = { things in A that are not in B } +% +% Original by Kevin Murphy, modified by Leon Peshkin + +if isempty(A) + C = []; + return; +elseif isempty(B) + C = A; + return; +else % both non-empty + bits = zeros(1, max(max(A), max(B))); + bits(A) = 1; + bits(B) = 0; + C = A(logical(bits(A))); +end + + +%-------------------------------------------------------------------------- + +function logml = checkLogml(priorTerm, adjprior, cliques, separators) +% tarkistaa logml:n + +global CLIQCOUNTS; +global SEPCOUNTS; +global PARTITION; + +npops = length(unique(PARTITION)); +[cliqcounts, sepcounts] = computeCounts(cliques, separators, npops); + +CLIQCOUNTS = cliqcounts; +SEPCOUNTS = sepcounts; + + +[logml, spatialPrior] = computeLogml(adjprior, priorTerm); + +disp(['logml: ' logml2String(logml) ', spatial prior: ' logml2String(spatialPrior)]); + +%-------------------------------------------------------------------------- + +function [emptyPop, pops] = findEmptyPop(npops) +% Palauttaa ensimmäisen tyhjän populaation indeksin. Jos tyhji? +% populaatioita ei ole, palauttaa -1:n. + +global PARTITION; + +pops = unique(PARTITION)'; +if (length(pops) ==npops) + emptyPop = -1; +else + popDiff = diff([0 pops npops+1]); + emptyPop = min(find(popDiff > 1)); +end + +%-------------------------------------------------------------------------- + +function viallinen = testaaKoordinaatit(ninds, coordinates) +% Testaa onko koordinaatit kunnollisia. + +viallinen = 1; +if ~isnumeric(coordinates) + return +end + +oikeanKokoinen = (size(coordinates,1) == ninds) & (size(coordinates,2) == 2); +if oikeanKokoinen + viallinen = 0; +end + + +%-------------------------------------------------------------------------- + +function [sumcounts, counts, logml] = ... + initialCounts(partition, data, npops, rowsFromInd, noalle) + +nloci=size(data,2); +ninds = size(data,1)/rowsFromInd; + +counts = zeros(max(noalle),nloci,npops); +sumcounts = zeros(npops,nloci); +for i=1:npops + for j=1:nloci + havainnotLokuksessa = find(partition==i & data(:,j)>=0); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(havainnotLokuksessa,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + +%-------------------------------------------------------------------------- + +function [popnames2, rowsFromInd] = findOutRowsFromInd(popnames, rows) + +ploidisuus = questdlg('Specify the type of individuals in the data: ',... + 'Individual type?', 'Haploid', 'Diploid', 'Tetraploid', ... + 'Diploid'); + +switch ploidisuus +case 'Haploid' + rowsFromInd = 1; +case 'Diploid' + rowsFromInd = 2; +case 'Tetraploid' + rowsFromInd = 4; +end + +if ~isempty(popnames) + for i = 1:size(rows,1) + popnames2{i,1} = popnames{i,1}; + rivi = rows(i,1):rows(i,2); + popnames2{i,2} = (rivi(rowsFromInd))/rowsFromInd; + end +else + popnames2 = []; +end + +%-------------------------------------------------------------------------- + +function fiksaaPartitioYksiloTasolle(rows, rowsFromInd) + +global PARTITION; +totalRows = 0; +for ind = 1:size(rows,1) + totalRows = totalRows + (rows(ind,2)-rows(ind,1)+1); +end +partitio2 = zeros(totalRows/rowsFromInd,1); + +for ind = 1:size(rows,1) + kaikkiRivit = rows(ind,1):rows(ind,2); + for riviNumero = rowsFromInd:rowsFromInd:length(kaikkiRivit) + %for riviNumero = rowsFromInd:rowsFromInd:length(rows{ind}) + %rivi = rows{ind}(riviNumero); + rivi = kaikkiRivit(riviNumero); + partitio2(rivi/rowsFromInd) = PARTITION(ind); + end +end +PARTITION = partitio2; diff --git a/matlab/parallel/spatial_parallel.m b/matlab/parallel/spatial_parallel.m new file mode 100644 index 0000000..d44b9de --- /dev/null +++ b/matlab/parallel/spatial_parallel.m @@ -0,0 +1,2175 @@ +function spatial_parallel(options) +% SPATICIAL_PARALLEL is the command line version of the baps partition with +% spaticial models. +% Input: options is a struct generated by parallel.m + +%-------------------------------------------------------------------------- +%- Syntax check out +%-------------------------------------------------------------------------- +outp = [options.outputMat '.txt']; +inp = [options.dataFile ' & ' options.coordinateFile]; + +if strcmp(options.fixedK, 'yes') + fixedK = 1; +else + fixedK = 0; +end + +%-------------------------------------------------------------------------- +%- Get data file location +%-------------------------------------------------------------------------- +switch options.dataType + case 'numeric' + %------------------------------------------------------------------ + %- Get name and index file location + %------------------------------------------------------------------ + try + data = load(options.dataFile); + catch + disp('*** ERROR: Incorrect BAPS numerical data.'); + return + end + ninds = testaaOnkoKunnollinenBapsData(data); %TESTAUS + if (ninds==0) + disp('*** ERROR: Incorrect BAPS numerical data.'); + return; + end + + try + coordinates = load(options.coordinateFile); + catch + disp('*** ERROR: Incorrect BAPS coordinate data.'); + return + end + viallinen = testaaKoordinaatit(ninds, coordinates); + if viallinen + disp('*** ERROR: Incorrect coordinates.'); + return + end + + if ~isempty(options.nameFile) && ~isempty(options.indexFile) + popnames = initPopNames(options.nameFile{1}, options.indexFile{1}); + else + popnames = []; + end + + disp('Pre-processing the data. This may take several minutes.'); + + [data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(data); + [Z,dist] = newGetDistances(data,rowsFromInd); + [cliques, separators, vorPoints, vorCells, pointers] = ... + handleCoords(coordinates); + + case 'genepop' + kunnossa = testaaGenePopData(options.dataFile); + if kunnossa == 0 + return + end + [data,popnames] = lueGenePopData(options.dataFile); + ninds = max(data(:,end)); + coordinates = load(options.coordinateFile); + viallinen = testaaKoordinaatit(ninds, coordinates); + + if viallinen + disp('*** ERROR: Incorrect coordinates'); + return + end + + disp('Pre-processing the data. This may take several minutes.'); + + [cliques, separators, vorPoints, vorCells, pointers] = ... + handleCoords(coordinates); + [data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(data); + [Z,dist] = newGetDistances(data,rowsFromInd); + + case 'matlab' + struct_array = load(options.dataFile); + if isfield(struct_array,'c') %Matlab versio + c = struct_array.c; + if ~isfield(c,'dist') + disp('*** ERROR: Incorrect matlab format.'); + return + end + elseif isfield(struct_array,'dist') %Mideva versio + c = struct_array; + else + disp('*** ERROR: Incorrect matlab format.'); + return; + end + + data = double(c.data); rowsFromInd = c.rowsFromInd; alleleCodes = c.alleleCodes; + noalle = c.noalle; adjprior = c.adjprior; priorTerm = c.priorTerm; + dist = c.dist; popnames = c.popnames; Z = c.Z; + if isfield(c, 'cliques') + cliques = c.cliques; separators = c.separators; + vorPoints = c.vorPoints; vorCells = c.vorCells; + pointers = c.pointers; coordinates = c.coordinates; + clear c; + else + % loading separate coordinate file + ninds = max(data(:,end)); + coordinates = load(options.coordinateFile); + viallinen = testaaKoordinaatit(ninds, coordinates); + + if viallinen + disp('*** ERROR: Incorrect coordinates.'); + return + end + + disp('Pre-processing the data. This may take several minutes.'); + [cliques, separators, vorPoints, vorCells, pointers] = ... + handleCoords(coordinates); + end + otherwise + error('*** ERROR: data type is not specified or unknown.'); +end + +% --------------------------------------------------------- +% - Stochastic search algorithm +% --------------------------------------------------------- +global PARTITION; global COUNTS; +global SUMCOUNTS; +global SEPCOUNTS; global CLIQCOUNTS; +clearGlobalVars; +npopstext = []; + +npopstextExtra = options.initialK; +if length(npopstextExtra)>=255 + npopstextExtra = npopstextExtra(1:255); + npopstext = [npopstext ' ' npopstextExtra]; + teksti = 'The input field length limit (255 characters) was reached. Input more values: '; +else + % ----------------------------------------------------- + % Set the limit of the input value. + % Modified by Jing Tang, 30.12.2005 + if max(npopstextExtra) > size(data,1) + error('Values larger than the sample size are not accepted. '); + else + npopstext = [npopstext ' ' num2str(npopstextExtra)]; + end +end + +clear ready; clear teksti; +if isempty(npopstext) || length(npopstext)==1 + return +else + npopsTaulu = str2num(npopstext); + ykkoset = find(npopsTaulu==1); + npopsTaulu(ykkoset) = []; % Mikäli ykkösi?annettu ylärajaksi, ne poistetaan. + if isempty(npopsTaulu) + return + end + clear ykkoset; +end + +c.data=data; +c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; +c.dist=dist; c.Z=Z; c.rowsFromInd = rowsFromInd; +c.cliques = cliques; c.separators = separators; + +ninds = length(unique(data(:,end))); +ekat = (1:rowsFromInd:ninds*rowsFromInd)'; +c.rows = [ekat ekat+rowsFromInd-1]; + +if fixedK + % Only the first value of npopsTaulu is used + npops = npopsTaulu(1); + nruns = length(npopsTaulu); + [logml, npops, partitionSummary]=spatialMix_fixK(c,npops,nruns); +else + [logml, npops, partitionSummary]=spatialMix(c,npopsTaulu); +end + +if logml==1 + return +end + +data = noIndex(data,noalle); + +[varmuus,changesInLogml] = writeMixtureInfo(logml, rowsFromInd, data, adjprior, priorTerm, ... + outp,inp,partitionSummary, popnames, cliques, separators, fixedK); + + +viewMixPartition(PARTITION, popnames); +if isequal(popnames, []) + names = pointers; +else + names = cell(size(pointers)); + indices = zeros(size(popnames(:,2))); + for i=1:length(popnames(:,2)); + indices(i) = popnames{i,2}; + end + for i = 1:length(pointers) + inds = pointers{i}; + namesInCell = []; + for j = 1:length(inds) + ind = inds(j); + I = find(indices > ind); + if isempty(I) + nameIndex = length(indices); + else + nameIndex = min(I) -1; + end + name = popnames{nameIndex}; + namesInCell = [namesInCell name]; + end + names{i} = namesInCell; + end +end +vorPlot(vorPoints, vorCells, PARTITION, pointers, coordinates, names); + +if exist('baps4_output.baps','file') + copyfile('baps4_output.baps',outp) + delete('baps4_output.baps') +end + +c.PARTITION = PARTITION; c.COUNTS = COUNTS; c.SUMCOUNTS = SUMCOUNTS; +c.alleleCodes = alleleCodes; c.adjprior = adjprior; c.popnames = popnames; +c.rowsFromInd = rowsFromInd; c.data = data; c.npops = npops; +c.noalle = noalle; c.mixtureType = 'spatial'; +c.pointers = pointers; c.vorPoints = vorPoints; c.vorCells = vorCells; +c.coordinates = coordinates; c.varmuus = varmuus; c.names = names; +c.changesInLogml = changesInLogml; % added by jing - 22.11.2006 +c.logml = logml; % added for result comparing. + +fprintf(1,'Saving the result...') +try +% save(options.outputMat, 'c'); + save(options.outputMat, 'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + fprintf(1,'Finished.\n'); +catch + display('*** ERROR in saving the result.'); +end + +% --------------------------------------------------------- +% - Subfunctions +% --------------------------------------------------------- + + +%---------------------------------------------------------- +function clearGlobalVars + +global COUNTS; COUNTS = []; +global SUMCOUNTS; SUMCOUNTS = []; + +global SEPCOUNTS; SEPCOUNTS = []; +global CLIQCOUNTS; CLIQCOUNTS = []; + +%------------------------------------------------------------------------------------- + + +function rows = computeRows(rowsFromInd, inds, ninds) +% On annettu yksilöt inds. Funktio palauttaa vektorin, joka +% sisältää niiden rivien numerot, jotka sisältävät yksilöiden +% dataa. + +rows = inds(:, ones(1,rowsFromInd)); +rows = rows*rowsFromInd; +miinus = repmat(rowsFromInd-1 : -1 : 0, [ninds 1]); +rows = rows - miinus; +rows = reshape(rows', [1,rowsFromInd*ninds]); + + +%-------------------------------------------------------------------------- + + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex) +% Tiedetään, ett?annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ss?ei viel?ole +% annettua logml arvoa, niin lisätään worstIndex:in kohtaan uusi logml ja +% nykyist?partitiota vastaava nclusters:in arvo. Muutoin ei tehd?mitään. + +apu = find(abs(partitionSummary(:,2)-logml)<1e-5); +if isempty(apu) + % Nyt löydetty partitio ei ole viel?kirjattuna summaryyn. + global PARTITION; + npops = length(unique(PARTITION)); + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end + + +%-------------------------------------------------------------------------- + + +function [suurin, i2] = arvoSeuraavaTila(muutokset, logml) +% Suorittaa yksilön seuraavan tilan arvonnan + +y = logml + muutokset; % siirron jälkeiset logml:t +y = y - max(y); +y = exp(y); +summa = sum(y); +y = y/summa; +y = cumsum(y); + +i2 = rand_disc(y); % uusi kori +suurin = muutokset(i2); + + +%-------------------------------------------------------------------------------------- + + +function svar=rand_disc(CDF) +%returns an index of a value from a discrete distribution using inversion method +slump=rand; +har=find(CDF>slump); +svar=har(1); + + +%------------------------------------------------------------------------------------- + + +function updateGlobalVariables(ind, i2, rowsFromInd, diffInCounts, ... + cliques, separators, adjprior, priorTerm) +% Suorittaa globaalien muuttujien muutokset, kun yksil?ind +% siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; +global CLIQCOUNTS; +global SEPCOUNTS; + +i1 = PARTITION(ind); +PARTITION(ind)=i2; + +diffInCliqCounts = computeDiffInCliqCounts(cliques, ind); +diffInSepCounts = computeDiffInCliqCounts(separators, ind); + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; +CLIQCOUNTS(:,i2) = CLIQCOUNTS(:,i2) + diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; +SEPCOUNTS(:,i2) = SEPCOUNTS(:,i2) + diffInSepCounts; + +%POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + + +%--------------------------------------------------------------------------------- + + +function updateGlobalVariables2(i1, i2, rowsFromInd, diffInCounts, ... + cliques, separators, adjprior, priorTerm); +% Suorittaa globaalien muuttujien muutokset, kun kaikki +% korissa i1 olevat yksilöt siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +%global POP_LOGML; +global CLIQCOUNTS; +global SEPCOUNTS; + +inds = find(PARTITION==i1); +PARTITION(inds) = i2; + +diffInCliqCounts = CLIQCOUNTS(:,i1); +diffInSepCounts = SEPCOUNTS(:,i1); + + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +CLIQCOUNTS(:,i1) = 0; +CLIQCOUNTS(:,i2) = CLIQCOUNTS(:,i2) + diffInCliqCounts; +SEPCOUNTS(:,i1) = 0; +SEPCOUNTS(:,i2) = SEPCOUNTS(:,i2) + diffInSepCounts; + + +%------------------------------------------------------------------------------------ + + +function updateGlobalVariables3(muuttuvat, rowsFromInd, diffInCounts, ... + adjprior, priorTerm, i2, cliques, separators); +% Suorittaa globaalien muuttujien päivitykset, kun yksilöt 'muuttuvat' +% siirretään koriin i2. Ennen siirtoa yksilöiden on kuuluttava samaan +% koriin. + +global PARTITION; +global COUNTS; global CLIQCOUNTS; +global SUMCOUNTS; global SEPCOUNTS; +%global POP_LOGML; + +i1 = PARTITION(muuttuvat(1)); +PARTITION(muuttuvat) = i2; + +diffInCliqCounts = computeDiffInCliqCounts(cliques, muuttuvat); +diffInSepCounts = computeDiffInCliqCounts(separators, muuttuvat); + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; +CLIQCOUNTS(:,i2) = CLIQCOUNTS(:,i2) + diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; +SEPCOUNTS(:,i2) = SEPCOUNTS(:,i2) + diffInSepCounts; + + +%POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + + +%---------------------------------------------------------------------- + + +function inds = returnInOrder(inds, pop, rowsFromInd, data, adjprior, priorTerm) +% Palauttaa yksilöt järjestyksess?siten, ett?ensimmäisen?on +% se, jonka poistaminen populaatiosta pop nostaisi logml:n +% arvoa eniten. + +global COUNTS; global SUMCOUNTS; +ninds = length(inds); +apuTaulu = [inds, zeros(ninds,1)]; + +for i=1:ninds + ind = inds(i); + rows = (ind-1)*rowsFromInd+1 : ind*rowsFromInd; + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,pop) = COUNTS(:,:,pop)-diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)-diffInSumCounts; + apuTaulu(i, 2) = computePopulationLogml(pop, adjprior, priorTerm); + COUNTS(:,:,pop) = COUNTS(:,:,pop)+diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)+diffInSumCounts; +end +apuTaulu = sortrows(apuTaulu,2); +inds = apuTaulu(ninds:-1:1,1); + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = laskeMuutokset(ind, rowsFromInd, ... + data, adjprior, priorTerm, logml, cliques, separators) +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli yksil?ind siirretään koriin i. +% diffInCounts on poistettava COUNTS:in siivusta i1 ja lisättäv? +% COUNTS:in siivuun i2, mikäli muutos toteutetaan. + + +global COUNTS; global SUMCOUNTS; +global PARTITION; %global POP_LOGML; +global CLIQCOUNTS; global SEPCOUNTS; + +npops = size(COUNTS,3); +muutokset = zeros(npops,1); + +[emptyPop, pops] = findEmptyPop(npops); + +i1 = PARTITION(ind); +i2 = [pops(find(pops~=i1))]; +if emptyPop > 0 + i2 =[i2 emptyPop]; +end + +rows = (ind-1)*rowsFromInd+1 : ind*rowsFromInd; +diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); + +diffInCliqCounts = computeDiffInCliqCounts(cliques, ind); +diffInSepCounts = computeDiffInCliqCounts(separators, ind); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; + +for i=i2 + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) + diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) + diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) + diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) + diffInSumCounts; + + muutokset(i) = computeLogml(adjprior, priorTerm) - logml; + + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) - diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) - diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) - diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) - diffInSumCounts; +end + +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) + diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) + diffInSepCounts; + +% Asetetaan muillekin tyhjille populaatioille sama muutos, kuin +% emptyPop:lle + +if emptyPop > 0 + empties = mysetdiff((1:npops), [i2 i1]); + muutokset(empties) = muutokset(emptyPop); +end + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = laskeMuutokset2(i1, rowsFromInd, ... + data, adjprior, priorTerm, logml, cliques, separators); +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli korin i1 kaikki yksilöt siirretään +% koriin i. +% Laskee muutokset vain yhdelle tyhjälle populaatiolle, muille tulee +% muutokseksi 0. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +global CLIQCOUNTS; global SEPCOUNTS; + +npops = size(COUNTS,3); +muutokset = zeros(npops,1); + +[emptyPop, pops] = findEmptyPop(npops); + +i2 = [pops(find(pops~=i1))]; +if emptyPop > 0 + i2 =[i2 emptyPop]; +end + +inds = find(PARTITION == i1); +rows = computeRows(rowsFromInd, inds, length(inds)); + +diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); +diffInCliqCounts = computeDiffInCliqCounts(cliques, inds); +diffInSepCounts = computeDiffInCliqCounts(separators, inds); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +CLIQCOUNTS(:,i1) = 0; +SEPCOUNTS(:,i1) = 0; + +for i=i2 + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) + diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) + diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) + diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) + diffInSumCounts; + + muutokset(i) = computeLogml(adjprior, priorTerm) - logml; + + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) - diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) - diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) - diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) - diffInSumCounts; +end + +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; +CLIQCOUNTS(:,i1) = diffInCliqCounts; +SEPCOUNTS(:,i1) = diffInSepCounts; + + + +%------------------------------------------------------------------------------------ + +function muutokset = laskeMuutokset3(T2, inds2, rowsFromInd, ... + data, adjprior, priorTerm, i1, logml, cliques, separators) +% Palauttaa length(unique(T2))*npops taulun, jossa (i,j):s alkio +% kertoo, mik?olisi muutos logml:ss? jos populaation i1 osapopulaatio +% inds2(find(T2==i)) siirretään koriin j. +% Laskee vain yhden tyhjän populaation, muita kohden muutokseksi jää 0. + + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +global CLIQCOUNTS; global SEPCOUNTS; + +npops = size(COUNTS,3); +npops2 = length(unique(T2)); +muutokset = zeros(npops2, npops); + +for pop2 = 1:npops2 + inds = inds2(find(T2==pop2)); + ninds = length(inds); + if ninds>0 + rows = computeRows(rowsFromInd, inds, ninds); + + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + diffInCliqCounts = computeDiffInCliqCounts(cliques, inds); + diffInSepCounts = computeDiffInCliqCounts(separators, inds); + + COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; + CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; + SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; + + [emptyPop, pops] = findEmptyPop(npops); + i2 = [pops(find(pops~=i1))]; + if emptyPop > 0 + i2 =[i2 emptyPop]; + end + + for i = i2 + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) + diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) + diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) + diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) + diffInSumCounts; + + muutokset(pop2,i) = computeLogml(adjprior, priorTerm) - logml; + + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) - diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) - diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) - diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) - diffInSumCounts; + end + + COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) + diffInCliqCounts; + SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) + diffInSepCounts; + end +end + +%-------------------------------------------------------------------------- + +function muutokset = laskeMuutokset5(inds, rowsFromInd, data, adjprior, ... + priorTerm, logml, cliques, separators, i1, i2) + +% Palauttaa length(inds)*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli yksil?i vaihtaisi koria i1:n ja i2:n välill? + +global COUNTS; global SUMCOUNTS; +global PARTITION; +global CLIQCOUNTS; global SEPCOUNTS; + +ninds = length(inds); +muutokset = zeros(ninds,1); +cliqsize = size(CLIQCOUNTS,2); +sepsize = size(SEPCOUNTS, 2); + +for i = 1:ninds + ind = inds(i); + if PARTITION(ind)==i1 + pop1 = i1; %mist? + pop2 = i2; %mihin + else + pop1 = i2; + pop2 = i1; + end + rows = (ind-1)*rowsFromInd+1 : ind*rowsFromInd; + + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + diffInCliqCounts = computeDiffInCliqCounts(cliques, ind); + diffInSepCounts = computeDiffInCliqCounts(separators, ind); + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)-diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)-diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)+diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)+diffInSumCounts; + + CLIQCOUNTS(:,pop1) = CLIQCOUNTS(:,pop1) - diffInCliqCounts; + CLIQCOUNTS(:,pop2) = CLIQCOUNTS(:,pop2) + diffInCliqCounts; + SEPCOUNTS(:,pop1) = SEPCOUNTS(:,pop1) - diffInSepCounts; + SEPCOUNTS(:,pop2) = SEPCOUNTS(:,pop2) + diffInSepCounts; + + muutokset(i) = computeLogml(adjprior, priorTerm) - logml; + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)+diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)+diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)-diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)-diffInSumCounts; + + CLIQCOUNTS(:,pop1) = CLIQCOUNTS(:,pop1) + diffInCliqCounts; + CLIQCOUNTS(:,pop2) = CLIQCOUNTS(:,pop2) - diffInCliqCounts; + SEPCOUNTS(:,pop1) = SEPCOUNTS(:,pop1) + diffInSepCounts; + SEPCOUNTS(:,pop2) = SEPCOUNTS(:,pop2) - diffInSepCounts; + +end + +%-------------------------------------------------------------------------- + +function diffInCounts = computeDiffInCounts(rows, max_noalle, nloci, data) +% Muodostaa max_noalle*nloci taulukon, jossa on niiden alleelien +% lukumäärät (vastaavasti kuin COUNTS:issa), jotka ovat data:n +% riveill?rows. + +diffInCounts = zeros(max_noalle, nloci); +for i=rows + row = data(i,:); + notEmpty = find(row>=0); + + if length(notEmpty)>0 + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) = ... + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) + 1; + end +end + + + +%------------------------------------------------------------------------------------ + + +function popLogml = computePopulationLogml(pops, adjprior, priorTerm) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + +global COUNTS; +global SUMCOUNTS; +x = size(COUNTS,1); +y = size(COUNTS,2); +z = length(pops); + +popLogml = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior,[1 1 length(pops)]) + COUNTS(:,:,pops)) ... + ,[x y z]),1),2)) - sum(gammaln(1+SUMCOUNTS(pops,:)),2) - priorTerm; + +%------------------------------------------------------------------------------------ + +function npops = poistaTyhjatPopulaatiot(npops) +% Poistaa tyhjentyneet populaatiot COUNTS:ista ja +% SUMCOUNTS:ista. Päivittää npops:in ja PARTITION:in. + +global COUNTS; +global SUMCOUNTS; +global PARTITION; +global CLIQCOUNTS; +global SEPCOUNTS; + +notEmpty = find(any(SUMCOUNTS,2)); +COUNTS = COUNTS(:,:,notEmpty); +SUMCOUNTS = SUMCOUNTS(notEmpty,:); +CLIQCOUNTS = CLIQCOUNTS(:,notEmpty); +SEPCOUNTS = SEPCOUNTS(:,notEmpty); + +for n=1:length(notEmpty) + apu = find(PARTITION==notEmpty(n)); + PARTITION(apu)=n; +end +npops = length(notEmpty); + + +%---------------------------------------------------------------------------------- +%Seuraavat kolme funktiota liittyvat alkupartition muodostamiseen. + +function initial_partition=admixture_initialization(data_matrix,nclusters,Z) +size_data=size(data_matrix); +nloci=size_data(2)-1; +n=max(data_matrix(:,end)); +T=cluster_own(Z,nclusters); +initial_partition=zeros(size_data(1),1); +for i=1:n + kori=T(i); + here=find(data_matrix(:,end)==i); + for j=1:length(here) + initial_partition(here(j),1)=kori; + end +end + +function T = cluster_own(Z,nclust) +true=logical(1); +false=logical(0); +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); + % maximum number of clusters based on inconsistency + if m <= maxclust + T = (1:m)'; + elseif maxclust==1 + T = ones(m,1); + else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end + end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + + +%--------------------------------------------------------------------------------------- + + +function [newData, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = ... + handleData(raw_data) +% Alkuperäisen datan viimeinen sarake kertoo, milt?yksilölt? +% kyseinen rivi on peräisin. Funktio tutkii ensin, ett?montako +% rivi?maksimissaan on peräisin yhdelt?yksilölt? jolloin saadaan +% tietää onko kyseess?haploidi, diploidi jne... Tämän jälkeen funktio +% lisää tyhji?rivej?niille yksilöille, joilta on peräisin vähemmän +% rivej?kuin maksimimäär? +% Mikäli jonkin alleelin koodi on =0, funktio muuttaa tämän alleelin +% koodi pienimmäksi koodiksi, joka isompi kuin mikään käytöss?oleva koodi. +% Tämän jälkeen funktio muuttaa alleelikoodit siten, ett?yhden lokuksen j +% koodit saavat arvoja välill?1,...,noalle(j). +data = raw_data; +nloci=size(raw_data,2)-1; + +dataApu = data(:,1:nloci); +nollat = find(dataApu==0); +if ~isempty(nollat) + isoinAlleeli = max(max(dataApu)); + dataApu(nollat) = isoinAlleeli+1; + data(:,1:nloci) = dataApu; +end +dataApu = []; nollat = []; isoinAlleeli = []; + +noalle=zeros(1,nloci); +alleelitLokuksessa = cell(nloci,1); +for i=1:nloci + alleelitLokuksessaI = unique(data(:,i)); + alleelitLokuksessa{i,1} = alleelitLokuksessaI(find(alleelitLokuksessaI>=0)); + noalle(i) = length(alleelitLokuksessa{i,1}); +end +alleleCodes = zeros(max(noalle),nloci); +for i=1:nloci + alleelitLokuksessaI = alleelitLokuksessa{i,1}; + puuttuvia = max(noalle)-length(alleelitLokuksessaI); + alleleCodes(:,i) = [alleelitLokuksessaI; zeros(puuttuvia,1)]; +end + +for loc = 1:nloci + for all = 1:noalle(loc) + data(find(data(:,loc)==alleleCodes(all,loc)), loc)=all; + end; +end; + +nind = max(data(:,end)); +nrows = size(data,1); +ncols = size(data,2); +rowsFromInd = zeros(nind,1); +for i=1:nind + rowsFromInd(i) = length(find(data(:,end)==i)); +end +maxRowsFromInd = max(rowsFromInd); +a = -999; +emptyRow = repmat(a, 1, ncols); +lessThanMax = find(rowsFromInd < maxRowsFromInd); +missingRows = maxRowsFromInd*nind - nrows; +data = [data; zeros(missingRows, ncols)]; +pointer = 1; +for ind=lessThanMax' %Käy läpi ne yksilöt, joilta puuttuu rivej? + miss = maxRowsFromInd-rowsFromInd(ind); % Tält?yksilölt?puuttuvien lkm. + for j=1:miss + rowToBeAdded = emptyRow; + rowToBeAdded(end) = ind; + data(nrows+pointer, :) = rowToBeAdded; + pointer = pointer+1; + end +end +data = sortrows(data, ncols); % Sorttaa yksilöiden mukaisesti +newData = data; +rowsFromInd = maxRowsFromInd; + +adjprior = zeros(max(noalle),nloci); +priorTerm = 0; +for j=1:nloci + adjprior(:,j) = [repmat(1/noalle(j), [noalle(j),1]) ; ones(max(noalle)-noalle(j),1)]; + priorTerm = priorTerm + noalle(j)*gammaln(1/noalle(j)); +end + + +%---------------------------------------------------------------------------------------- + + +function [Z, dist] = newGetDistances(data, rowsFromInd) + +ninds = max(data(:,end)); +nloci = size(data,2)-1; +riviLkm = nchoosek(ninds,2); + +empties = find(data<0); +data(empties)=0; +data = uint8(data); % max(noalle) oltava <256 + +pariTaulu = zeros(riviLkm,2); +aPointer=1; +for a=1:ninds-1 + pariTaulu(aPointer:aPointer+ninds-1-a,1) = ones(ninds-a,1)*a; + pariTaulu(aPointer:aPointer+ninds-1-a,2) = (a+1:ninds)'; + aPointer = aPointer+ninds-a; +end + +eka = pariTaulu(:,ones(1,rowsFromInd)); +eka = eka * rowsFromInd; +miinus = repmat(rowsFromInd-1 : -1 : 0, [riviLkm 1]); +eka = eka - miinus; + +toka = pariTaulu(:,ones(1,rowsFromInd)*2); +toka = toka * rowsFromInd; +toka = toka - miinus; + +%eka = uint16(eka); +%toka = uint16(toka); + +summa = zeros(riviLkm,1); +vertailuja = zeros(riviLkm,1); + +clear pariTaulu; clear miinus; + +x = zeros(size(eka)); x = uint8(x); +y = zeros(size(toka)); y = uint8(y); + +for j=1:nloci; + + for k=1:rowsFromInd + x(:,k) = data(eka(:,k),j); + y(:,k) = data(toka(:,k),j); + end + + for a=1:rowsFromInd + for b=1:rowsFromInd + vertailutNyt = double(x(:,a)>0 & y(:,b)>0); + vertailuja = vertailuja + vertailutNyt; + lisays = (x(:,a)~=y(:,b) & vertailutNyt); + summa = summa+double(lisays); + end + end +end + +clear x; clear y; clear vertailutNyt; +nollat = find(vertailuja==0); +dist = zeros(length(vertailuja),1); +dist(nollat) = 1; +muut = find(vertailuja>0); +dist(muut) = summa(muut)./vertailuja(muut); +clear summa; clear vertailuja; + +Z = linkage(dist'); + + +%---------------------------------------------------------------------------------------- + + +function [Z, distances]=getDistances(data_matrix,nclusters) + +%finds initial admixture clustering solution with nclusters clusters, uses simple mean Hamming distance +%gives partition in 8-bit format +%allocates all alleles of a single individual into the same basket +%data_matrix contains #Loci+1 columns, last column indicate whose alleles are placed in each row, +%i.e. ranges from 1 to #individuals. For diploids there are 2 rows per individual, for haploids only a single row +%missing values are indicated by zeros in the partition and by negative integers in the data_matrix. + +size_data=size(data_matrix); +nloci=size_data(2)-1; +n=max(data_matrix(:,end)); +distances=zeros(nchoosek(n,2),1); +pointer=1; +for i=1:n-1 + i_data=data_matrix(find(data_matrix(:,end)==i),1:nloci); + for j=i+1:n + d_ij=0; + j_data=data_matrix(find(data_matrix(:,end)==j),1:nloci); + vertailuja = 0; + for k=1:size(i_data,1) + for l=1:size(j_data,1) + here_i=find(i_data(k,:)>=0); + here_j=find(j_data(l,:)>=0); + here_joint=intersect(here_i,here_j); + vertailuja = vertailuja + length(here_joint); + d_ij = d_ij + length(find(i_data(k,here_joint)~=j_data(l,here_joint))); + end + end + d_ij = d_ij / vertailuja; + distances(pointer)=d_ij; + pointer=pointer+1; + end +end + +Z=linkage(distances'); + + + +%---------------------------------------------------------------------------------------- + + +function Z = linkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 | m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end + + +%----------------------------------------------------------------------------------- + + +function popnames = initPopNames(nameFile, indexFile) +%Palauttaa tyhjän, mikäli nimitiedosto ja indeksitiedosto +% eivät olleet yht?pitki? + +popnames = []; +indices = load(indexFile); + +fid = fopen(nameFile); +if fid == -1 + %File didn't exist + msgbox('Loading of the population names was unsuccessful', ... + 'Error', 'error'); + return; +end; +line = fgetl(fid); +counter = 1; +while (line ~= -1) & ~isempty(line) + names{counter} = line; + line = fgetl(fid); + counter = counter + 1; +end; +fclose(fid); + +if length(names) ~= length(indices) + disp('The number of population names must be equal to the number of '); + disp('entries in the file specifying indices of the first individuals of '); + disp('each population.'); + return; +end + +popnames = cell(length(names), 2); +for i = 1:length(names) + popnames{i,1} = names(i); + popnames{i,2} = indices(i); +end + + +%----------------------------------------------------------------------------------- +% Laskee arvot cliqcounts:lle ja sepcounts:lle + +function [cliqcounts, sepcounts] = computeCounts(cliques, separators, npops) + +global PARTITION; +ncliq = size(cliques,1); +nsep = size(separators,1); + +cliqPartition = zeros(ncliq, size(cliques,2)); +sepPartition = zeros(nsep, size(separators, 2)); + +apuCliq = find(cliques > 0); +apuSep = find(separators > 0); + +cliqPartition(apuCliq) = PARTITION(cliques(apuCliq)); +sepPartition(apuSep) = PARTITION(separators(apuSep)); + + +cliqcounts = zeros(ncliq, npops); +for i = 1:npops + cliqcounts(:,i) = sum(cliqPartition == i, 2); +end + + +sepcounts = zeros(nsep, npops); +for i = 1:npops + sepcounts(:,i) = sum(sepPartition == i, 2); +end + +%------------------------------------------------------------------------- + +function diffInCliqCounts = computeDiffInCliqCounts(cliques, inds) +% Laskee muutoksen CLIQCOUNTS:ssa (tai SEPCOUNTS:ssa, jos syötteen? +% separators) kun yksilöt inds siirretään. +% diffInCliqcounts on ncliq*1 taulu, joka on CLIQCOUNTS:n sarakkeesta josta +% yksilöt inds siirretään ja lisättäv?sarakkeeseen, johon yksilöt +% siirretään. + +ncliq = size(cliques,1); +diffInCliqCounts = zeros(ncliq,1); +ninds = length(inds); +for i = 1:ninds + ind = inds(i); + rivit = sum((cliques == ind),2); + diffInCliqCounts = diffInCliqCounts + rivit; +end + + +%----------------------------------------------------------------------- + +function [logml, spatialPrior] = computeLogml(adjprior,priorTerm) + +%global GAMMA_LN; +global CLIQCOUNTS; +global SEPCOUNTS; +%global PARTITION; + +notEmpty = any(CLIQCOUNTS); +npops = length(find(notEmpty == 1)); +sumcliq=sum(CLIQCOUNTS, 2); +sumsep=sum(SEPCOUNTS, 2); +ncliq = size(CLIQCOUNTS, 1); +nsep = size(SEPCOUNTS, 1); + +cliqsizes = sum(CLIQCOUNTS, 2)'; +sepsizes = sum(SEPCOUNTS, 2)'; +cliqsizes = min([cliqsizes; npops*ones(1,ncliq)])'; +sepsizes = min([sepsizes; npops*ones(1,nsep)])'; + +klikkitn = sum(sum(gammaln(CLIQCOUNTS(:,notEmpty) + repmat(1./cliqsizes, [1 npops])))) ... + - sum(npops*(gammaln(1./cliqsizes))) ... + - sum(gammaln(sumcliq + 1)); + +septn = sum(sum(gammaln(SEPCOUNTS(:,notEmpty) + repmat(1./sepsizes, [1 npops])))) ... + - sum(npops*(gammaln(1./sepsizes))) ... + - sum(gammaln(sumsep + 1)); + +spatialPrior = (klikkitn - septn); + +global COUNTS; +global SUMCOUNTS; +x = size(COUNTS,1); +y = size(COUNTS,2); +z = size(COUNTS,3); + +popLogml = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior,[1 1 z]) + COUNTS) ... + ,[x y z]),1),2)) - sum(gammaln(1+SUMCOUNTS),2) - priorTerm; + +logml = sum(popLogml) + spatialPrior; + +%-------------------------------------------------------------------------- + + +function initializeGammaln(ninds, rowsFromInd, maxSize) +%Alustaa GAMMALN muuttujan s.e. GAMMALN(i,j)=gammaln((i-1) + 1/j) +global GAMMA_LN; +GAMMA_LN = zeros((1+ninds)*rowsFromInd, maxSize); +for i=1:(ninds+1)*rowsFromInd + for j=1:maxSize + GAMMA_LN(i,j)=gammaln((i-1) + 1/j); + end +end + + +%---------------------------------------------------------------------------- + + +function dist2 = laskeOsaDist(inds2, dist, ninds) +% Muodostaa dist vektorista osavektorin, joka sisältää yksilöiden inds2 +% väliset etäisyydet. ninds=kaikkien yksilöiden lukumäär? + +ninds2 = length(inds2); +apu = zeros(nchoosek(ninds2,2),2); +rivi = 1; +for i=1:ninds2-1 + for j=i+1:ninds2 + apu(rivi, 1) = inds2(i); + apu(rivi, 2) = inds2(j); + rivi = rivi+1; + end +end +apu = (apu(:,1)-1).*ninds - apu(:,1) ./ 2 .* (apu(:,1)-1) + (apu(:,2)-apu(:,1)); +dist2 = dist(apu); + + +%---------------------------------------------------------------------------- + + + +function kunnossa = testaaGenePopData(tiedostonNimi) +% kunnossa == 0, jos data ei ole kelvollinen genePop data. +% Muussa tapauksessa kunnossa == 1. + +kunnossa = 0; +fid = fopen(tiedostonNimi); +line1 = fgetl(fid); %ensimmäinen rivi +line2 = fgetl(fid); %toinen rivi +line3 = fgetl(fid); %kolmas + +if (isequal(line1,-1) | isequal(line2,-1) | isequal(line3,-1)) + disp('Incorrect file format 1168'); fclose(fid); + return +end +if (testaaPop(line1)==1 | testaaPop(line2)==1) + disp('Incorrect file format 1172'); fclose(fid); + return +end +if testaaPop(line3)==1 + %2 rivi tällöin lokusrivi + nloci = rivinSisaltamienMjonojenLkm(line2); + line4 = fgetl(fid); + if isequal(line4,-1) + disp('Incorrect file format 1180'); fclose(fid); + return + end + if ~any(line4==',') + % Rivin nelj?täytyy sisältää pilkku. + disp('Incorrect file format 1185'); fclose(fid); + return + end + pointer = 1; + while ~isequal(line4(pointer),',') %Tiedetään, ett?pysähtyy + pointer = pointer+1; + end + line4 = line4(pointer+1:end); %pilkun jälkeinen osa + nloci2 = rivinSisaltamienMjonojenLkm(line4); + if (nloci2~=nloci) + disp('Incorrect file format 1195'); fclose(fid); + return + end +else + line = fgetl(fid); + lineNumb = 4; + while (testaaPop(line)~=1 & ~isequal(line,-1)) + line = fgetl(fid); + lineNumb = lineNumb+1; + end + if isequal(line,-1) + disp('Incorrect file format 1206'); fclose(fid); + return + end + nloci = lineNumb-2; + line4 = fgetl(fid); %Eka rivi pop sanan jälkeen + if isequal(line4,-1) + disp('Incorrect file format 1212'); fclose(fid); + return + end + if ~any(line4==',') + % Rivin täytyy sisältää pilkku. + disp('Incorrect file format 1217'); fclose(fid); + return + end + pointer = 1; + while ~isequal(line4(pointer),',') %Tiedetään, ett?pysähtyy. + pointer = pointer+1; + end + + line4 = line4(pointer+1:end); %pilkun jälkeinen osa + nloci2 = rivinSisaltamienMjonojenLkm(line4); + if (nloci2~=nloci) + disp('Incorrect file format 1228'); fclose(fid); + return + end +end +kunnossa = 1; +fclose(fid); + +%------------------------------------------------------ + + +function [data, popnames] = lueGenePopData(tiedostonNimi) + +fid = fopen(tiedostonNimi); +line = fgetl(fid); %ensimmäinen rivi +line = fgetl(fid); %toinen rivi +count = rivinSisaltamienMjonojenLkm(line); + +line = fgetl(fid); +lokusRiveja = 1; +while (testaaPop(line)==0) + lokusRiveja = lokusRiveja+1; + line = fgetl(fid); +end + +if lokusRiveja>1 + nloci = lokusRiveja; +else + nloci = count; +end + +popnames = cell(10,2); +data = zeros(100, nloci+1); +nimienLkm=0; +ninds=0; +poimiNimi=1; +digitFormat = -1; +while line ~= -1 + line = fgetl(fid); + + if poimiNimi==1 + %Edellinen rivi oli 'pop' + nimienLkm = nimienLkm+1; + ninds = ninds+1; + if nimienLkm>size(popnames,1); + popnames = [popnames; cell(10,2)]; + end + nimi = lueNimi(line); + if digitFormat == -1 + digitFormat = selvitaDigitFormat(line); + divider = 10^digitFormat; + end + popnames{nimienLkm, 1} = {nimi}; %Näin se on greedyMix:issäkin?!? + popnames{nimienLkm, 2} = ninds; + poimiNimi=0; + + data = addAlleles(data, ninds, line, divider); + + elseif testaaPop(line) + poimiNimi = 1; + + elseif line ~= -1 + ninds = ninds+1; + data = addAlleles(data, ninds, line, divider); + end +end + +data = data(1:ninds*2,:); +popnames = popnames(1:nimienLkm,:); +fclose(fid); + + +%------------------------------------------------------- + +function nimi = lueNimi(line) +%Palauttaa line:n alusta sen osan, joka on ennen pilkkua. +n = 1; +merkki = line(n); +nimi = ''; +while ~isequal(merkki,',') + nimi = [nimi merkki]; + n = n+1; + merkki = line(n); +end + +%------------------------------------------------------- + +function df = selvitaDigitFormat(line) +% line on ensimmäinen pop-sanan jälkeinen rivi +% Genepop-formaatissa olevasta datasta. funktio selvittää +% rivin muodon perusteella, ovatko datan alleelit annettu +% 2 vai 3 numeron avulla. + +n = 1; +merkki = line(n); +while ~isequal(merkki,',') + n = n+1; + merkki = line(n); +end + +while ~any(merkki == '0123456789'); + n = n+1; + merkki = line(n); +end +numeroja = 0; +while any(merkki == '0123456789'); + numeroja = numeroja+1; + n = n+1; + merkki = line(n); +end + +df = numeroja/2; + + +%------------------------------------------------------ + + +function count = rivinSisaltamienMjonojenLkm(line) +% Palauttaa line:n sisältämien mjonojen lukumäärän. +% Mjonojen väliss?täytyy olla välilyönti. +count = 0; +pit = length(line); +tila = 0; %0, jos odotetaan välilyöntej? 1 jos odotetaan muita merkkej? +for i=1:pit + merkki = line(i); + if (isspace(merkki) & tila==0) + %Ei tehd?mitään. + elseif (isspace(merkki) & tila==1) + tila = 0; + elseif (~isspace(merkki) & tila==0) + tila = 1; + count = count+1; + elseif (~isspace(merkki) & tila==1) + %Ei tehd?mitään + end +end + +%------------------------------------------------------- + +function pal = testaaPop(rivi) +% pal=1, mikäli rivi alkaa jollain seuraavista +% kirjainyhdistelmist? Pop, pop, POP. Kaikissa muissa +% tapauksissa pal=0. + +if length(rivi)<3 + pal = 0; + return +end +if (all(rivi(1:3)=='Pop') | ... + all(rivi(1:3)=='pop') | ... + all(rivi(1:3)=='POP')) + pal = 1; + return +else + pal = 0; + return +end + + +%-------------------------------------------------------- + + +function data = addAlleles(data, ind, line, divider) +% Lisaa BAPS-formaatissa olevaan datataulukkoon +% yksilöä ind vastaavat rivit. Yksilön alleelit +% luetaan genepop-formaatissa olevasta rivist? +% line. Jos data on 3 digit formaatissa on divider=1000. +% Jos data on 2 digit formaatissa on divider=100. + +nloci = size(data,2)-1; +if size(data,1) < 2*ind + data = [data; zeros(100,nloci+1)]; +end + +k=1; +merkki=line(k); +while ~isequal(merkki,',') + k=k+1; + merkki=line(k); +end +line = line(k+1:end); +clear k; clear merkki; + +alleeliTaulu = sscanf(line,'%d'); + +if length(alleeliTaulu)~=nloci + disp('Incorrect data format.'); +end + +for j=1:nloci + ekaAlleeli = floor(alleeliTaulu(j)/divider); + if ekaAlleeli==0 ekaAlleeli=-999; end; + tokaAlleeli = rem(alleeliTaulu(j),divider); + if tokaAlleeli==0 tokaAlleeli=-999; end + + data(2*ind-1,j) = ekaAlleeli; + data(2*ind,j) = tokaAlleeli; +end + +data(2*ind-1,end) = ind; +data(2*ind,end) = ind; + +%------------------------------------------------------------------- + + +function [varmuus,changesInLogml] = writeMixtureInfo(logml, rowsFromInd, data, adjprior, ... + priorTerm, outPutFile, inputFile, partitionSummary, popnames, ... + cliques, separators, fixedK) + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global LOGDIFF; +ninds = size(data,1)/rowsFromInd; +npops = size(COUNTS,3); +names = (size(popnames,1) == ninds); %Tarkistetaan ett?nimet viittaavat yksilöihin + +if length(outPutFile)>0 + fid = fopen(outPutFile,'w'); +else + fid = -1; + diary('baps4_output.baps'); % save in text anyway. +end + +dispLine; +disp('RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:'); +disp(['Data file: ' inputFile]); +disp(['Number of clustered individuals: ' ownNum2Str(ninds)]); +disp(['Number of groups in optimal partition: ' ownNum2Str(npops)]); +disp(['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); +disp(' '); +if (fid ~= -1) + fprintf(fid,'%s \n', ['RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:']); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Data file: ' inputFile]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of clustered individuals: ' ownNum2Str(ninds)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of groups in optimal partition: ' ownNum2Str(npops)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); fprintf(fid,'\n'); +end + +cluster_count = length(unique(PARTITION)); +disp(['Best Partition: ']); +if (fid ~= -1) + fprintf(fid,'%s \n',['Best Partition: ']); fprintf(fid,'\n'); +end +for m=1:cluster_count + indsInM = find(PARTITION==m); + length_of_beginning = 11 + floor(log10(m)); + cluster_size = length(indsInM); + + if names + text = ['Cluster ' num2str(m) ': {' char(popnames{indsInM(1)})]; + for k = 2:cluster_size + text = [text ', ' char(popnames{indsInM(k)})]; + end; + else + text = ['Cluster ' num2str(m) ': {' num2str(indsInM(1))]; + for k = 2:cluster_size + text = [text ', ' num2str(indsInM(k))]; + end; + end + text = [text '}']; + while length(text)>58 + %Take one line and display it. + new_line = takeLine(text,58); + text = text(length(new_line)+1:end); + disp(new_line); + if (fid ~= -1) + fprintf(fid,'%s \n',[new_line]); + fprintf(fid,'\n'); + end + if length(text)>0 + text = [blanks(length_of_beginning) text]; + else + text = []; + end; + end; + if ~isempty(text) + disp(text); + if (fid ~= -1) + fprintf(fid,'%s \n',[text]); + fprintf(fid,'\n'); + end + end +end + +disp(' '); +disp(' '); +disp('Changes in log(marginal likelihood) if indvidual i is moved to group j:'); +if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['Changes in log(marginal likelihood) if indvidual i is moved to group j:']); fprintf(fid, '\n'); +end + +if names + nameSizes = zeros(ninds,1); + for i = 1:ninds + nimi = char(popnames{i}); + nameSizes(i) = length(nimi); + end + maxSize = max(nameSizes); + maxSize = max(maxSize, 5); + erotus = maxSize - 5; + alku = blanks(erotus); + ekarivi = [alku ' ind' blanks(6+erotus)]; +else + ekarivi = ' ind '; +end + +for i = 1:cluster_count + ekarivi = [ekarivi ownNum2Str(i) blanks(8-floor(log10(i)))]; +end +disp(ekarivi); +if (fid ~= -1) + fprintf(fid, '%s \n', [ekarivi]); fprintf(fid, '\n'); +end + +ninds = size(data,1)/rowsFromInd; +varmuus = zeros(ninds,1); + +changesInLogml = LOGDIFF'; +for ind = 1:ninds + %[muutokset, diffInCounts] = laskeMuutokset(ind, rowsFromInd, data, ... + % adjprior, priorTerm, logml, cliques, separators); + %changesInLogml(:,ind) = muutokset; + muutokset = changesInLogml(:,ind); + if sum(exp(muutokset))>0 + varmuus(ind) = 1 - 1/sum(exp(muutokset)); + else + varmuus(ind) = 0; + end + if names + nimi = char(popnames{ind}); + rivi = [blanks(maxSize - length(nimi)) nimi ':']; + else + rivi = [blanks(4-floor(log10(ind))) ownNum2Str(ind) ':']; + end + for j = 1:npops + rivi = [rivi ' ' logml2String(omaRound(muutokset(j)))]; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [rivi]); fprintf(fid, '\n'); + end +end + +disp(' '); disp(' '); +disp('KL-divergence matrix in PHYLIP format:'); +dist_mat = zeros(npops, npops); +if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['KL-divergence matrix in PHYLIP format:']); %fprintf(fid, '\n'); +end + +maxnoalle = size(COUNTS,1); +nloci = size(COUNTS,2); +d = zeros(maxnoalle, nloci, npops); +prior = adjprior; +prior(find(prior==1))=0; +nollia = find(all(prior==0)); %Lokukset, joissa oli havaittu vain yht?alleelia. +prior(1,nollia)=1; +for pop1 = 1:npops + d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))+prior) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))+prior),maxnoalle,1); + %dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1); +end +% ekarivi = blanks(7); +% for pop = 1:npops +% ekarivi = [ekarivi num2str(pop) blanks(7-floor(log10(pop)))]; +% end +ekarivi = num2str(npops); +disp(ekarivi); + +if (fid ~= -1) + fprintf(fid, '%s \n', [ekarivi]); %fprintf(fid, '\n'); +end + +for pop1 = 1:npops + rivi = [blanks(2-floor(log10(pop1))) num2str(pop1) ' ']; + for pop2 = 1:pop1-1 + dist1 = d(:,:,pop1); dist2 = d(:,:,pop2); + div12 = sum(sum(dist1.*log2((dist1+10^-10) ./ (dist2+10^-10))))/nloci; + div21 = sum(sum(dist2.*log2((dist2+10^-10) ./ (dist1+10^-10))))/nloci; + div = (div12+div21)/2; + % rivi = [rivi kldiv2str(div) ' ']; + dist_mat(pop1,pop2) = div; + end +% disp(rivi); +% if (fid ~= -1) +% fprintf(fid, '%s \n', [rivi]); fprintf(fid, '\n'); +% end +end + +dist_mat = dist_mat + dist_mat'; % make it symmetric +for pop1 = 1:npops + rivi = ['Cluster_' num2str(pop1) ' ']; + for pop2 = 1:npops + rivi = [rivi kldiv2str(dist_mat(pop1,pop2)) ' ']; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [rivi]); %fprintf(fid, '\n'); + end +end + +disp(' '); +disp(' '); +disp('List of sizes of 10 best visited partitions and corresponding log(ml) values'); + +if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['List of sizes of 10 best visited partitions and corresponding log(ml) values']); fprintf(fid, '\n'); +end + +partitionSummaryKaikki = partitionSummary; +partitionSummary =[]; +for i=1:size(partitionSummaryKaikki,3) + partitionSummary = [partitionSummary; partitionSummaryKaikki(:,:,i)]; +end +[I,J] = find(partitionSummaryKaikki(:,2,:)>-1e49); +partitionSummaryKaikki = partitionSummaryKaikki(I,:,:); + +partitionSummary = sortrows(partitionSummary,2); +partitionSummary = partitionSummary(size(partitionSummary,1):-1:1 , :); +partitionSummary = partitionSummary(find(partitionSummary(:,2)>-1e49),:); +if size(partitionSummary,1)>10 + vikaPartitio = 10; +else + vikaPartitio = size(partitionSummary,1); +end +for part = 1:vikaPartitio + line = [num2str(partitionSummary(part,1)) ' ' num2str(partitionSummary(part,2))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', [line]); fprintf(fid, '\n'); + end +end + +if ~fixedK + disp(' '); + disp(' '); + disp('Probabilities for number of clusters'); + + if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['Probabilities for number of clusters']); fprintf(fid, '\n'); + end + + npopsTaulu = unique(partitionSummary(:,1)); + len = length(npopsTaulu); + probs = zeros(len,1); + partitionSummary(:,2) = partitionSummary(:,2)-max(partitionSummary(:,2)); + sumtn = sum(exp(partitionSummary(:,2))); + for i=1:len + npopstn = sum(exp(partitionSummary(find(partitionSummary(:,1)==npopsTaulu(i)),2))); + probs(i) = npopstn / sumtn; + end + for i=1:len + if probs(i)>1e-5 + line = [num2str(npopsTaulu(i)) ' ' num2str(probs(i))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', [line]); fprintf(fid, '\n'); + end + end + end +end + +if (fid ~= -1) + fclose(fid); +else + diary off +end + +%--------------------------------------------------------------- + + +function dispLine; +disp('---------------------------------------------------'); + +%-------------------------------------------------------------------- + + +function newline = takeLine(description,width) +%Returns one line from the description: line ends to the first +%space after width:th mark. +newLine = description(1:width); +n = width+1; +while ~isspace(description(n)) & n=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); + + +function mjono = kldiv2str(div) +mjono = ' '; +if abs(div)<100 + %Ei tarvita e-muotoa + mjono(6) = num2str(rem(floor(div*1000),10)); + mjono(5) = num2str(rem(floor(div*100),10)); + mjono(4) = num2str(rem(floor(div*10),10)); + mjono(3) = '.'; + mjono(2) = num2str(rem(floor(div),10)); + arvo = rem(floor(div/10),10); + if arvo>0 + mjono(1) = num2str(arvo); + end + +else + suurinYks = floor(log10(div)); + mjono(6) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(div),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(div),suurinYks); +end + + +%-------------------------------------------------------------------------- + + +function ninds = testaaOnkoKunnollinenBapsData(data) +%Tarkastaa onko viimeisess?sarakkeessa kaikki +%luvut 1,2,...,n johonkin n:ään asti. +%Tarkastaa lisäksi, ett?on vähintään 2 saraketta. +if size(data,1)<2 + ninds = 0; return; +end +lastCol = data(:,end); +ninds = max(lastCol); +if ~isequal((1:ninds)',unique(lastCol)) + ninds = 0; return; +end + + +%-------------------------------------------------------------------------- + +function [cliques, separators, vorPoints, vorCells, pointers] = ... + handleCoords(coordinates) +%Laskee yksilöiden luonnolliset naapurit koordinaateista. +%Naapurit lasketaan lisäämäll?koordinaatteihin pisteit? +%jotta kutakin yksilöä vastaisi rajoitettu voronoi-solu +%Puuttuvat koordinaatit (0,0) tulevat erakkopisteiksi +% +%Määrittää lisäksi yksilöit?vastaavat voronoi tesselaation solut. +%vorPoints:ssa on solujen kulmapisteet ja vorCells:ss?kunkin solun +%kulmapisteiden indeksit. Pointers{i} sisältää solussa i olevien yksilöiden +%indeksit. + + + +ninds = length(coordinates); +[I,J] = find(coordinates>0 | coordinates <0); %Käsitellään vain yksilöit? joilta koordinaatit +I = unique(I); %olemassa +ncoords = length(I); +new_coordinates = addPoints(coordinates(I,:)); %Ympäröidään yksilöt apupisteill? + +apuData = [new_coordinates(1:ncoords,:) (1:ncoords)']; +apuData = sortrows(apuData,[1 2]); +erot = [diff(apuData(:,1)) diff(apuData(:,2))]; +empties = find(erot(:,1)==0 & erot(:,2)==0); +samat = cell(length(empties),1); +pointer = 0; + +for i = 1:length(empties) + if i == 1 | empties(i) - empties(i-1) > 1 %Tutkitaan onko eri pisteess?kuin edellinen + pointer = pointer+1; + samat{pointer} = [apuData(empties(i),3) apuData(empties(i)+1,3)]; + else + samat{pointer} = [samat{pointer} apuData(empties(i)+1,3)]; + end +end + +samat = samat(1:pointer); +erot = []; apuData = []; empties = []; + +tri = delaunay(new_coordinates(:,1), new_coordinates(:,2), {'Qt','Qbb','Qc','Qz'}); %Apupisteiden takia ok. +%[rivi,sarake] = find(tri>ncoords); %Jätetään huomiotta apupisteet +%tri(rivi,:) = []; +pituus = tri(:,1); +pituus = length(pituus); +parit = zeros(6*pituus,2); +for i = 1:pituus %Muodostetaan kolmikoista parit + j = 6*(i-1)+1; + parit(j,:) = tri(i,1:2); + parit(j+1,:) = tri(i,1:2:3); + parit(j+2,:) = tri(i,2:3); + parit(j+3:j+5,:) = [parit(j:j+2,2) parit(j:j+2,1)]; +end +parit = unique(parit,'rows'); +[rivi,sarake] = find(parit>ncoords); %Jätetään huomiotta apupisteet +parit(rivi,:) = []; +parit = I(parit); %Otetaan poistetut takaisin mukaan +graph = sparse(parit(:,1),parit(:,2),1, ninds, ninds); + + +%Kopioidaan samassa pisteess?olevien yksilöiden naapurustot +%silt? jolle ne laitettu. + +for i = 1:length(samat); + taulu = I(samat{i}); + [rivi,sarake] = find(graph(taulu,:)>0); + if length(rivi) > 0 + kopioitava = graph(taulu(rivi(1)),:); + for j = 1:length(taulu); + graph(taulu(j),:) = kopioitava; + graph(:,taulu(j)) = kopioitava'; + end + end +end + +%Asetetaan samassa pisteess?olevat yksilöt toistensa naapureiksi + +for i = 1:length(samat) + for j = I(samat{i}) + for k = I(samat{i}) + if k ~= j + graph(j,k) = 1; + end + end + end +end + +%Laskee maksimin klikkien ja separaattorien koolle +%Määritetään myös klikit ja separaattorit + +[ncliq, nsep, cliq, sep] = laskeKlikit(graph, ninds, ninds); + +sumcliq = sum(ncliq); +sumsep = sum(nsep); +maxCliqSize = max(find(sumcliq > 0)); +maxSepSize = max(find(sumsep > 0)); + +cliques = zeros(length(cliq), maxCliqSize); +separators = zeros(length(sep), maxSepSize); + +nollia = zeros(1, length(cliq)); +for i = 1:length(cliq); + klikki = cliq{i}; + if length(klikki)>1 + cliques(i, 1:length(klikki)) = klikki; + else + nollia(i)=1; + end +end +cliques(find(nollia==1), :) = []; + + +for i = 1:length(sep); + klikki = sep{i}; + separators(i, 1:length(klikki)) = klikki; +end + + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%Määritetään yksilöit?vastaavat voronoi tesselaation solut + + + +[vorPoints, vorCells] = voronoin(new_coordinates, {'Qbb', 'Qz'}); + + +bounded = ones(length(vorCells),1); +for i=1:length(vorCells) + if (isempty(vorCells{i})) | (length(find(vorCells{i}==1))>0) + bounded(i)=0; + end +end + + + +vorCells = vorCells(bounded == 1); + +pointers = cell(length(vorCells),1); +empties = zeros(1,length(vorCells)); +X = coordinates(:,1); +Y = coordinates(:,2); + +for i=1:length(pointers) + vx = vorPoints(vorCells{i},1); + vy = vorPoints(vorCells{i},2); + IN = inpolygon(X,Y,vx,vy); + if any(IN)==0 + empties(i) = 1; + else + pointers{i} = find(IN ==1); + end +end + + +%figure +%hold on +% +%for i = 1:length(vorCells) +% vx = vorPoints(vorCells{i},1); +% vy = vorPoints(vorCells{i},2); +% k = convhull(vx,vy); +% if any(pointers{i}) +% patch(vx(k), vy(k),'y'); +% else +% plot(vx(k), vy(k)); +% end +%end + +%plot(coordinates(:,1), coordinates(:,2), 'r*'); +%plot(new_coordinates(ninds+1:end,1), new_coordinates(ninds+1:end,2), 'b+'); +%axis([-2 7 -2 8]); + +vorCells = vorCells(find(empties == 0)); +pointers = pointers(find(empties == 0)); + + + + +%-------------------------------------------------------------------------- + +function [ncliques, nseparators, cliques, separators] = ... + laskeKlikit(M, maxCliqSize,maxSepSize) +%Laskee samankokoisten klikkien määrän verkosta M +%ncliques(i)=kokoa i olevien klikkien määr? +%nseparators vastaavasti + +ncliques=zeros(1,maxCliqSize); +nseparators=zeros(1,maxSepSize); + +if isequal(M,[]) + return; +end + +[cliques,separators]=findCliques(M); + +for i=1:length(cliques) + ncliques(length(cliques{i}))=ncliques(length(cliques{i}))+1; +end + +%cliqmax=max(find(ncliques~=0)); +%ncliques=ncliques(1:cliqmax); + +for i=1:length(separators) + nseparators(length(separators{i}))=nseparators(length(separators{i}))+1; +end + +%sepmax=max(find(nseparators~=0)); +%nseparators=nseparators(1:sepmax); + +%-------------------------------------------------------------------------- + +function C = mysetdiff(A,B) +% MYSETDIFF Set difference of two sets of positive integers (much faster than built-in setdiff) +% C = mysetdiff(A,B) +% C = A \ B = { things in A that are not in B } +% +% Original by Kevin Murphy, modified by Leon Peshkin + +if isempty(A) + C = []; + return; +elseif isempty(B) + C = A; + return; +else % both non-empty + bits = zeros(1, max(max(A), max(B))); + bits(A) = 1; + bits(B) = 0; + C = A(logical(bits(A))); +end + + +%-------------------------------------------------------------------------- + +function logml = checkLogml(priorTerm, adjprior, cliques, separators) +% tarkistaa logml:n + +global CLIQCOUNTS; +global SEPCOUNTS; +global PARTITION; + +npops = length(unique(PARTITION)); +[cliqcounts, sepcounts] = computeCounts(cliques, separators, npops); + +CLIQCOUNTS = cliqcounts; +SEPCOUNTS = sepcounts; + + +[logml, spatialPrior] = computeLogml(adjprior, priorTerm); + +disp(['logml: ' logml2String(logml) ', spatial prior: ' logml2String(spatialPrior)]); + +%-------------------------------------------------------------------------- + +function [emptyPop, pops] = findEmptyPop(npops) +% Palauttaa ensimmäisen tyhjän populaation indeksin. Jos tyhji? +% populaatioita ei ole, palauttaa -1:n. + +global PARTITION; +pops = unique(PARTITION)'; +if (length(pops) ==npops) + emptyPop = -1; +else + popDiff = diff([0 pops npops+1]); + emptyPop = min(find(popDiff > 1)); +end + +%-------------------------------------------------------------------------- + +function viallinen = testaaKoordinaatit(ninds, coordinates) +% Testaa onko koordinaatit kunnollisia. + +viallinen = 1; +if ~isnumeric(coordinates) + return +end + +oikeanKokoinen = (size(coordinates,1) == ninds) & (size(coordinates,2) == 2); +if oikeanKokoinen + viallinen = 0; +end + +%-------------------------------------------------------------------------- + +function varmuus = laskeVarmuus(rowsFromInd, data, adjprior, priorTerm, ... + logml, cliques, separators, ninds); + +varmuus = zeros(ninds,1); + +for ind=1:ninds + muutokset = laskeMuutokset(ind, rowsFromInd, data, adjprior, ... + priorTerm, logml, cliques, separators); + varmuus(ind) = 1/sum(exp(muutokset)); + +end + + + diff --git a/matlab/parallel/tulostaAdmixtureTiedot.m b/matlab/parallel/tulostaAdmixtureTiedot.m new file mode 100644 index 0000000..cf84ff9 --- /dev/null +++ b/matlab/parallel/tulostaAdmixtureTiedot.m @@ -0,0 +1,63 @@ +%-------------------------------------------------------------------------- + + +function tulostaAdmixtureTiedot(proportions, uskottavuus, alaRaja, niter) +h0 = findobj('Tag','filename1_text'); +inputf = get(h0,'String'); +h0 = findobj('Tag','filename2_text'); +outf = get(h0,'String'); clear h0; + +if length(outf)>0 + fid = fopen(outf,'a'); +else + fid = -1; + diary('baps4_output.baps'); % save in text anyway. +end + +ninds = length(uskottavuus); +npops = size(proportions,2); +disp(' '); +dispLine; +disp('RESULTS OF ADMIXTURE ANALYSIS BASED'); +disp('ON MIXTURE CLUSTERING OF INDIVIDUALS'); +disp(['Data file: ' inputf]); +disp(['Number of individuals: ' num2str(ninds)]); +disp(['Results based on ' num2str(niter) ' simulations from posterior allele frequencies.']); +disp(' '); +if fid ~= -1 + fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['--------------------------------------------']); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['RESULTS OF ADMIXTURE ANALYSIS BASED']); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['ON MIXTURE CLUSTERING OF INDIVIDUALS']); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['Data file: ' inputf]); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['Number of individuals: ' num2str(ninds)]); fprintf(fid, '\n'); + fprintf(fid,'%s \n', ['Results based on ' num2str(niter) ' simulations from posterior allele frequencies.']); fprintf(fid, '\n'); + fprintf(fid, '\n'); +end + +ekaRivi = blanks(6); +for pop = 1:npops + ekaRivi = [ekaRivi blanks(3-floor(log10(pop))) num2str(pop) blanks(2)]; +end +ekaRivi = [ekaRivi blanks(1) 'p']; % Added on 29.08.06 +disp(ekaRivi); +for ind = 1:ninds + rivi = [num2str(ind) ':' blanks(4-floor(log10(ind)))]; + if any(proportions(ind,:)>0) + for pop = 1:npops-1 + rivi = [rivi proportion2str(proportions(ind,pop)) blanks(2)]; + end + rivi = [rivi proportion2str(proportions(ind,npops)) ': ']; + rivi = [rivi ownNum2Str(uskottavuus(ind))]; + end + disp(rivi); + if fid ~= -1 + fprintf(fid,'%s \n',[rivi]); fprintf(fid,'\n'); + end +end +if fid ~= -1 + fclose(fid); +else + diary off +end + diff --git a/matlab/parallel/uipickfiles.m b/matlab/parallel/uipickfiles.m new file mode 100644 index 0000000..b3120ab --- /dev/null +++ b/matlab/parallel/uipickfiles.m @@ -0,0 +1,803 @@ +function out = uipickfiles(varargin) +%uipickfiles: GUI program to select file(s) and/or directories. +% +% Syntax: +% files = uipickfiles('PropertyName',PropertyValue,...) +% +% The current directory can be changed by operating in the file navigator: +% double-clicking on a directory in the list to move further down the tree, +% using the popup menu to move up the tree, typing a path in the box to +% move to any directory or right-clicking on the path box to revisit a +% previously-listed directory. +% +% Files can be added to the list by double-clicking or selecting files +% (non-contiguous selections are possible with the control key) and +% pressing the Add button. Files in the list can be removed or re-ordered. +% When finished, a press of the Done button will return the full paths to +% the selected files in a cell array, structure array or character array. +% If the Cancel button is pressed then zero is returned. +% +% The following optional property/value pairs can be specified as arguments +% to control the indicated behavior: +% +% Property Value +% ---------- ---------------------------------------------------------- +% FilterSpec String to specify starting directory and/or file filter. +% Ex: 'C:\bin' will start up in that directory. '*.txt' +% will list only files ending in '.txt'. 'c:\bin\*.txt' will +% do both. Default is to start up in the current directory +% and list all files. Can be changed with the GUI. +% +% REFilter String containing a regular expression used to filter the +% file list. Ex: '\.m$|\.mat$' will list files ending in +% '.m' and '.mat'. Default is empty string. Can be used +% with FilterSpec and both filters are applied. Can be +% changed with the GUI. +% +% Prompt String containing a prompt appearing in the title bar of +% the figure. Default is 'Select files'. +% +% NumFiles Scalar or vector specifying number of files that must be +% selected. A scalar specifies an exact value; a two-element +% vector can be used to specify a range, [min max]. The +% function will not return unless the specified number of +% files have been chosen. Default is [] which accepts any +% number of files. +% +% Output String specifying the data type of the output: 'cell', +% 'struct' or 'char'. Specifying 'cell' produces a cell +% array of strings, the strings containing the full paths of +% the chosen files. 'Struct' returns a structure array like +% the result of the dir function except that the 'name' field +% contains a full path instead of just the file name. 'Char' +% returns a character array of the full paths. This is most +% useful when you have just one file and want it in a string +% instead of a cell array containing just one string. The +% default is 'cell'. +% +% All properties and values are case-insensitive and need only be +% unambiguous. For example, +% +% files = uipickfiles('num',1,'out','ch') +% +% is valid usage. + +% Version: 1.0, 25 April 2006 +% Author: Douglas M. Schwarz +% Email: dmschwarz=ieee*org, dmschwarz=urgrad*rochester*edu +% Real_email = regexprep(Email,{'=','*'},{'@','.'}) + + +% Define properties and set default values. +prop.filterspec = '*'; +prop.refilter = ''; +prop.prompt = 'Select files'; +prop.numfiles = []; +prop.output = 'cell'; + +% Process inputs and set prop fields. +properties = fieldnames(prop); +arg_index = 1; +while arg_index <= nargin + arg = varargin{arg_index}; + if ischar(arg) + prop_index = find(strncmpi(arg,properties,length(arg))); + if length(prop_index) == 1 + prop.(properties{prop_index}) = varargin{arg_index + 1}; + else + error('Property ''%s'' does not exist or is ambiguous.',arg) + end + arg_index = arg_index + 2; + elseif isstruct(arg) + arg_fn = fieldnames(arg); + for i = 1:length(arg_fn) + prop_index = find(strncmpi(arg_fn{i},properties,... + length(arg_fn{i}))); + if length(prop_index) == 1 + prop.(properties{prop_index}) = arg.(arg_fn{i}); + else + error('Property ''%s'' does not exist or is ambiguous.',... + arg_fn{i}) + end + end + arg_index = arg_index + 1; + else + error(['Properties must be specified by property/value pairs',... + ' or structures.']) + end +end + +% Validate FilterSpec property. +if isempty(prop.filterspec) + prop.filterspec = '*'; +end +if ~ischar(prop.filterspec) + error('FilterSpec property must contain a string.') +end + +% Validate REFilter property. +if ~ischar(prop.refilter) + error('REFilter property must contain a string.') +end + +% Validate Prompt property. +if ~ischar(prop.prompt) + error('Prompt property must contain a string.') +end + +% Validate NumFiles property. +if numel(prop.numfiles) > 2 || any(prop.numfiles < 0) + error('NumFiles must be empty, a scalar or two-element vector.') +end +prop.numfiles = unique(prop.numfiles); +if isequal(prop.numfiles,1) + numstr = 'Select exactly 1 file.'; +elseif length(prop.numfiles) == 1 + numstr = sprintf('Select exactly %d files.',prop.numfiles); +else + numstr = sprintf('Select %d to %d files.',prop.numfiles); +end + +% Validate Output property. +legal_outputs = {'cell','struct','char'}; +out_idx = find(strncmpi(prop.output,legal_outputs,length(prop.output))); +if length(out_idx) == 1 + prop.output = legal_outputs{out_idx}; +else + error(['Value of ''Output'' property, ''%s'', is illegal or '... + 'ambiguous.'],prop.output) +end + + +% Initialize file lists. +[current_dir,f,e] = fileparts(prop.filterspec); +filter = [f,e]; +if isempty(current_dir) + current_dir = pwd; +end +if isempty(filter) + filter = '*'; +end +re_filter = prop.refilter; +full_filter = fullfile(current_dir,filter); +path_cell = path2cell(current_dir); +fdir = filtered_dir(full_filter,re_filter); +filenames = {fdir.name}'; +filenames = annotate_file_names(filenames,fdir); + +% Initialize some data. +file_picks = {}; +full_file_picks = {}; +dir_picks = dir(' '); % Create empty directory structure. +show_full_path = false; +nodupes = true; +history = {current_dir}; + +% Create figure. +gray = get(0,'DefaultUIControlBackgroundColor'); +fig = figure('Position',[0 0 740 445],... + 'Color',gray,... + 'WindowStyle','modal',... + 'Resize','off',... + 'NumberTitle','off',... + 'Name',prop.prompt,... + 'IntegerHandle','off',... + 'CloseRequestFcn',@cancel,... + 'CreateFcn',{@movegui,'center'}); + +% Create uicontrols. +uicontrol('Style','frame',... + 'Position',[255 260 110 70]) +uicontrol('Style','frame',... + 'Position',[275 135 110 100]) + +navlist = uicontrol('Style','listbox',... + 'Position',[10 10 250 320],... + 'String',filenames,... + 'Value',[],... + 'BackgroundColor','w',... + 'Callback',@clicknav,... + 'Max',2); +pickslist = uicontrol('Style','listbox',... + 'Position',[380 10 350 320],... + 'String',{},... + 'BackgroundColor','w',... + 'Callback',@clickpicks,... + 'Max',2); + +openbut = uicontrol('Style','pushbutton',... + 'Position',[270 300 80 20],... + 'String','Open',... + 'Enable','off',... + 'Callback',@open); +arrow = [2 2 2 2 2 2 2 2 1 2 2 2;... + 2 2 2 2 2 2 2 2 2 0 2 2;... + 2 2 2 2 2 2 2 2 2 2 0 2;... + 0 0 0 0 0 0 0 0 0 0 0 0;... + 2 2 2 2 2 2 2 2 2 2 0 2;... + 2 2 2 2 2 2 2 2 2 0 2 2;... + 2 2 2 2 2 2 2 2 1 2 2 2]; +arrow(arrow == 2) = NaN; +arrow_im = NaN*ones(16,76); +arrow_im(6:12,45:56) = arrow/2; +im = repmat(arrow_im,[1 1 3]); +addbut = uicontrol('Style','pushbutton',... + 'Position',[270 270 80 20],... + 'String','Add ',... + 'Enable','off',... + 'CData',im,... + 'Callback',@add); + +removebut = uicontrol('Style','pushbutton',... + 'Position',[290 205 80 20],... + 'String','Remove',... + 'Enable','off',... + 'Callback',@remove); +moveupbut = uicontrol('Style','pushbutton',... + 'Position',[290 175 80 20],... + 'String','Move Up',... + 'Enable','off',... + 'Callback',@moveup); +movedownbut = uicontrol('Style','pushbutton',... + 'Position',[290 145 80 20],... + 'String','Move Down',... + 'Enable','off',... + 'Callback',@movedown); + +uicontrol('Position',[10 380 250 16],... + 'Style','text',... + 'String','Current Directory',... + 'HorizontalAlignment','center') +dir_popup = uicontrol('Style','popupmenu',... + 'Position',[10 335 250 20],... + 'BackgroundColor','w',... + 'String',path_cell(end:-1:1),... + 'Value',1,... + 'Callback',@dirpopup); +hist_cm = uicontextmenu; +pathbox = uicontrol('Position',[10 360 250 20],... + 'Style','edit',... + 'BackgroundColor','w',... + 'String',current_dir,... + 'HorizontalAlignment','left',... + 'Callback',@change_path,... + 'UIContextMenu',hist_cm); +hist_menus = []; +hist_cb = @history_cb; +hist_menus = make_history_cm(hist_cb,hist_cm,hist_menus,history); + +uicontrol('Position',[10 425 80 16],... + 'Style','text',... + 'String','File Filter',... + 'HorizontalAlignment','left') +uicontrol('Position',[100 425 160 16],... + 'Style','text',... + 'String','Reg. Exp. Filter',... + 'HorizontalAlignment','left') +showallfiles = uicontrol('Position',[270 405 100 20],... + 'Style','checkbox',... + 'String','Show All Files',... + 'Value',0,... + 'HorizontalAlignment','left',... + 'Callback',@togglefilter); +filter_ed = uicontrol('Position',[10 405 80 20],... + 'Style','edit',... + 'BackgroundColor','w',... + 'String',filter,... + 'HorizontalAlignment','left',... + 'Callback',@setfilspec); +refilter_ed = uicontrol('Position',[100 405 160 20],... + 'Style','edit',... + 'BackgroundColor','w',... + 'String',re_filter,... + 'HorizontalAlignment','left',... + 'Callback',@setrefilter); + +viewfullpath = uicontrol('Style','checkbox',... + 'Position',[380 335 230 20],... + 'String','Show full paths',... + 'Value',show_full_path,... + 'HorizontalAlignment','left',... + 'Callback',@showfullpath); +remove_dupes = uicontrol('Style','checkbox',... + 'Position',[380 360 230 20],... + 'String','Remove duplicates (as per full path)',... + 'Value',nodupes,... + 'HorizontalAlignment','left',... + 'Callback',@removedupes); +uicontrol('Position',[380 405 350 20],... + 'Style','text',... + 'String','Selected Files',... + 'HorizontalAlignment','center') +uicontrol('Position',[280 80 80 30],'String','Done',... + 'Callback',@done); +uicontrol('Position',[280 30 80 30],'String','Cancel',... + 'Callback',@cancel); + +if ~isempty(prop.numfiles) + uicontrol('Position',[380 385 350 16],... + 'Style','text',... + 'String',numstr,... + 'ForegroundColor','r',... + 'HorizontalAlignment','center') +end + +set(fig,'HandleVisibility','off') + +uiwait(fig) + +% Compute desired output. +switch prop.output + case 'cell' + out = full_file_picks; + case 'struct' + out = dir_picks(:); + case 'char' + out = char(full_file_picks); + case 'cancel' + out = 0; +end + +% -------------------- Callback functions -------------------- + + function add(varargin) + values = get(navlist,'Value'); + for i = 1:length(values) + dir_pick = fdir(values(i)); + pick = dir_pick.name; + pick_full = fullfile(current_dir,pick); + dir_pick.name = pick_full; + if ~nodupes || ~any(strcmp(full_file_picks,pick_full)) + file_picks{end + 1} = pick; + full_file_picks{end + 1} = pick_full; + dir_picks(end + 1) = dir_pick; + end + end + if show_full_path + set(pickslist,'String',full_file_picks,'Value',[]); + else + set(pickslist,'String',file_picks,'Value',[]); + end + set([removebut,moveupbut,movedownbut],'Enable','off'); + end + + function remove(varargin) + values = get(pickslist,'Value'); + file_picks(values) = []; + full_file_picks(values) = []; + dir_picks(values) = []; + top = get(pickslist,'ListboxTop'); + num_above_top = sum(values < top); + top = top - num_above_top; + num_picks = length(file_picks); + new_value = min(min(values) - num_above_top,num_picks); + if num_picks == 0 + new_value = []; + set([removebut,moveupbut,movedownbut],'Enable','off') + end + if show_full_path + set(pickslist,'String',full_file_picks,'Value',new_value,... + 'ListboxTop',top) + else + set(pickslist,'String',file_picks,'Value',new_value,... + 'ListboxTop',top) + end + end + + function open(varargin) + values = get(navlist,'Value'); + if fdir(values).isdir + if strcmp(fdir(values).name,'.') + return + elseif strcmp(fdir(values).name,'..') + set(dir_popup,'Value',min(2,length(path_cell))) + dirpopup(); + return + end + current_dir = fullfile(current_dir,fdir(values).name); + history{end+1} = current_dir; + history = unique(history); + hist_menus = make_history_cm(hist_cb,hist_cm,hist_menus,... + history); + full_filter = fullfile(current_dir,filter); + path_cell = path2cell(current_dir); + fdir = filtered_dir(full_filter,re_filter); + filenames = {fdir.name}'; + filenames = annotate_file_names(filenames,fdir); + set(dir_popup,'String',path_cell(end:-1:1),'Value',1) + set(pathbox,'String',current_dir) + set(navlist,'ListboxTop',1,'Value',[],'String',filenames) + set(addbut,'Enable','off') + set(openbut,'Enable','off') + end + end + + function clicknav(varargin) + value = get(navlist,'Value'); + nval = length(value); + dbl_click_fcn = @add; + switch nval + case 0 + set([addbut,openbut],'Enable','off') + case 1 + set(addbut,'Enable','on'); + if fdir(value).isdir + set(openbut,'Enable','on') + dbl_click_fcn = @open; + else + set(openbut,'Enable','off') + end + otherwise + set(addbut,'Enable','on') + set(openbut,'Enable','off') + end + if strcmp(get(fig,'SelectionType'),'open') + dbl_click_fcn(); + end + end + + function clickpicks(varargin) + value = get(pickslist,'Value'); + if isempty(value) + set([removebut,moveupbut,movedownbut],'Enable','off') + else + set(removebut,'Enable','on') + if min(value) == 1 + set(moveupbut,'Enable','off') + else + set(moveupbut,'Enable','on') + end + if max(value) == length(file_picks) + set(movedownbut,'Enable','off') + else + set(movedownbut,'Enable','on') + end + end + if strcmp(get(fig,'SelectionType'),'open') + remove(); + end + end + + function dirpopup(varargin) + value = get(dir_popup,'Value'); + len = length(path_cell); + path_cell = path_cell(1:end-value+1); + if ispc && value == len + current_dir = ''; + full_filter = filter; + fdir = struct('name',getdrives,'date',datestr(now),... + 'bytes',0,'isdir',1); + else + current_dir = cell2path(path_cell); + history{end+1} = current_dir; + history = unique(history); + hist_menus = make_history_cm(hist_cb,hist_cm,hist_menus,... + history); + full_filter = fullfile(current_dir,filter); + fdir = filtered_dir(full_filter,re_filter); + end + filenames = {fdir.name}'; + filenames = annotate_file_names(filenames,fdir); + set(dir_popup,'String',path_cell(end:-1:1),'Value',1) + set(pathbox,'String',current_dir) + set(navlist,'String',filenames,'Value',[]) + set(addbut,'Enable','off') + end + + function change_path(varargin) + proposed_path = get(pathbox,'String'); + % Process any directories named '..'. + proposed_path_cell = path2cell(proposed_path); + ddots = strcmp(proposed_path_cell,'..'); + ddots(find(ddots) - 1) = true; + proposed_path_cell(ddots) = []; + proposed_path = cell2path(proposed_path_cell); + % Check for existance of directory. + if ~exist(proposed_path,'dir') + uiwait(errordlg(['Directory "',proposed_path,... + '" does not exist.'],'','modal')) + return + end + current_dir = proposed_path; + history{end+1} = current_dir; + history = unique(history); + hist_menus = make_history_cm(hist_cb,hist_cm,hist_menus,history); + full_filter = fullfile(current_dir,filter); + path_cell = path2cell(current_dir); + fdir = filtered_dir(full_filter,re_filter); + filenames = {fdir.name}'; + filenames = annotate_file_names(filenames,fdir); + set(dir_popup,'String',path_cell(end:-1:1),'Value',1) + set(pathbox,'String',current_dir) + set(navlist,'String',filenames,'Value',[]) + set(addbut,'Enable','off') + set(openbut,'Enable','off') + end + + function showfullpath(varargin) + show_full_path = get(viewfullpath,'Value'); + if show_full_path + set(pickslist,'String',full_file_picks) + else + set(pickslist,'String',file_picks) + end + end + + function removedupes(varargin) + nodupes = get(remove_dupes,'Value'); + if nodupes + num_picks = length(full_file_picks); + [unused,rev_order] = unique(full_file_picks(end:-1:1)); + order = sort(num_picks + 1 - rev_order); + full_file_picks = full_file_picks(order); + file_picks = file_picks(order); + if show_full_path + set(pickslist,'String',full_file_picks,'Value',[]) + else + set(pickslist,'String',file_picks,'Value',[]) + end + set([removebut,moveupbut,movedownbut],'Enable','off') + end + end + + function moveup(varargin) + value = get(pickslist,'Value'); + set(removebut,'Enable','on') + n = length(file_picks); + omega = 1:n; + index = zeros(1,n); + index(value - 1) = omega(value); + index(setdiff(omega,value - 1)) = omega(setdiff(omega,value)); + file_picks = file_picks(index); + full_file_picks = full_file_picks(index); + value = value - 1; + if show_full_path + set(pickslist,'String',full_file_picks,'Value',value) + else + set(pickslist,'String',file_picks,'Value',value) + end + if min(value) == 1 + set(moveupbut,'Enable','off') + end + set(movedownbut,'Enable','on') + end + + function movedown(varargin) + value = get(pickslist,'Value'); + set(removebut,'Enable','on') + n = length(file_picks); + omega = 1:n; + index = zeros(1,n); + index(value + 1) = omega(value); + index(setdiff(omega,value + 1)) = omega(setdiff(omega,value)); + file_picks = file_picks(index); + full_file_picks = full_file_picks(index); + value = value + 1; + if show_full_path + set(pickslist,'String',full_file_picks,'Value',value) + else + set(pickslist,'String',file_picks,'Value',value) + end + if max(value) == n + set(movedownbut,'Enable','off') + end + set(moveupbut,'Enable','on') + end + + function togglefilter(varargin) + value = get(showallfiles,'Value'); + if value + filter = '*'; + re_filter = ''; + set([filter_ed,refilter_ed],'Enable','off') + else + filter = get(filter_ed,'String'); + re_filter = get(refilter_ed,'String'); + set([filter_ed,refilter_ed],'Enable','on') + end + full_filter = fullfile(current_dir,filter); + fdir = filtered_dir(full_filter,re_filter); + filenames = {fdir.name}'; + filenames = annotate_file_names(filenames,fdir); + set(navlist,'String',filenames,'Value',[]) + set(addbut,'Enable','off') + end + + function setfilspec(varargin) + filter = get(filter_ed,'String'); + if isempty(filter) + filter = '*'; + set(filter_ed,'String',filter) + end + % Process file spec if a subdirectory was included. + [p,f,e] = fileparts(filter); + if ~isempty(p) + newpath = fullfile(current_dir,p,''); + set(pathbox,'String',newpath) + filter = [f,e]; + if isempty(filter) + filter = '*'; + end + set(filter_ed,'String',filter) + change_path(); + end + full_filter = fullfile(current_dir,filter); + fdir = filtered_dir(full_filter,re_filter); + filenames = {fdir.name}'; + filenames = annotate_file_names(filenames,fdir); + set(navlist,'String',filenames,'Value',[]) + set(addbut,'Enable','off') + end + + function setrefilter(varargin) + re_filter = get(refilter_ed,'String'); + fdir = filtered_dir(full_filter,re_filter); + filenames = {fdir.name}'; + filenames = annotate_file_names(filenames,fdir); + set(navlist,'String',filenames,'Value',[]) + set(addbut,'Enable','off') + end + + function done(varargin) + % Optional shortcut: click on a file and press 'Done'. +% if isempty(full_file_picks) && strcmp(get(addbut,'Enable'),'on') +% add(); +% end + numfiles = length(full_file_picks); + if ~isempty(prop.numfiles) + if numfiles < prop.numfiles(1) + msg = {'Too few files selected.',numstr}; + uiwait(errordlg(msg,'','modal')) + return + elseif numfiles > prop.numfiles(end) + msg = {'Too many files selected.',numstr}; + uiwait(errordlg(msg,'','modal')) + return + end + end + delete(fig) + end + + function cancel(varargin) + prop.output = 'cancel'; + delete(fig) + end + + function history_cb(varargin) + current_dir = history{varargin{3}}; + full_filter = fullfile(current_dir,filter); + path_cell = path2cell(current_dir); + fdir = filtered_dir(full_filter,re_filter); + filenames = {fdir.name}'; + filenames = annotate_file_names(filenames,fdir); + set(dir_popup,'String',path_cell(end:-1:1),'Value',1) + set(pathbox,'String',current_dir) + set(navlist,'ListboxTop',1,'Value',[],'String',filenames) + set(addbut,'Enable','off') + set(openbut,'Enable','off') + end +end + + +% -------------------- Subfunctions -------------------- + +function c = path2cell(p) +% Turns a path string into a cell array of path elements. +c = strread(p,'%s','delimiter','\\/'); +if ispc + c = [{'My Computer'};c]; +else + c = [{filesep};c(2:end)]; +end +end + + +function p = cell2path(c) +% Turns a cell array of path elements into a path string. +if ispc + p = fullfile(c{2:end},''); +else + p = fullfile(c{:},''); +end +end + + +function d = filtered_dir(full_filter,re_filter) +% Like dir, but applies filters and sorting. +p = fileparts(full_filter); +if isempty(p) && full_filter(1) == '/' + p = '/'; +end +if exist(full_filter,'dir') + c = cell(0,1); + dfiles = struct('name',c,'date',c,'bytes',c,'isdir',c); +else + dfiles = dir(full_filter); +end +if ~isempty(dfiles) + dfiles([dfiles.isdir]) = []; +end +ddir = dir(p); +ddir = ddir([ddir.isdir]); +% Additional regular expression filter. +if nargin > 1 && ~isempty(re_filter) + if ispc + no_match = cellfun('isempty',regexpi({dfiles.name},re_filter)); + else + no_match = cellfun('isempty',regexp({dfiles.name},re_filter)); + end + dfiles(no_match) = []; +end +% Set navigator style: +% 1 => mix file and directory names +% 2 => means list all files before all directories +% 3 => means list all directories before all files +% 4 => same as 2 except put . and .. directories first +if isunix + style = 4; +else + style = 4; +end +switch style + case 1 + d = [dfiles;ddir]; + [unused,index] = sort({d.name}); + d = d(index); + case 2 + [unused,index1] = sort({dfiles.name}); + [unused,index2] = sort({ddir.name}); + d = [dfiles(index1);ddir(index2)]; + case 3 + [unused,index1] = sort({dfiles.name}); + [unused,index2] = sort({ddir.name}); + d = [ddir(index2);dfiles(index1)]; + case 4 + [unused,index1] = sort({dfiles.name}); + dot1 = find(strcmp({ddir.name},'.')); + dot2 = find(strcmp({ddir.name},'..')); + ddot1 = ddir(dot1); + ddot2 = ddir(dot2); + ddir([dot1,dot2]) = []; + [unused,index2] = sort({ddir.name}); + d = [ddot1;ddot2;dfiles(index1);ddir(index2)]; +end +end + + +function drives = getdrives +% Returns a cell array of drive names on Windows. +letters = char('A':'Z'); +num_letters = length(letters); +drives = cell(1,num_letters); +for i = 1:num_letters + if exist([letters(i),':\'],'dir'); + drives{i} = [letters(i),':']; + end +end +drives(cellfun('isempty',drives)) = []; +end + + +function filenames = annotate_file_names(filenames,dir_listing) +% Adds a trailing filesep character to directory names. +fs = filesep; +for i = 1:length(filenames) + if dir_listing(i).isdir + filenames{i} = [filenames{i},fs]; + end +end +end + + +function hist_menus = make_history_cm(cb,hist_cm,hist_menus,history) +% Make context menu for history. +if ~isempty(hist_menus) + delete(hist_menus) +end +num_hist = length(history); +hist_menus = zeros(1,num_hist); +for i = 1:num_hist + hist_menus(i) = uimenu(hist_cm,'Label',history{i},... + 'Callback',{cb,i}); +end +end diff --git a/matlab/spatial/addPoints.m b/matlab/spatial/addPoints.m new file mode 100644 index 0000000..b9c9ddf --- /dev/null +++ b/matlab/spatial/addPoints.m @@ -0,0 +1,86 @@ +function uusiData=addPoints(data) +%Lisää koordinaatipisteiden joukkoon pisteitä, jotta jokainen datapiste +%kuuluisi äärelliseen voronoi soluun voronoi tessellaatiota +%muodostettaessa. Apupisteet lisätään muodostamalla hila +%koordinaattipisteiden päälle ja ottamalla voronoi tessellaatio hilasta. Ne +%hilan pisteet, joita vastaavien solujen sisällä ei ole yhtään +%koordinaattipistettä, jäävät apupisteiksi + +x = data(:,1); +y = data(:,2); + +xmax = max(x); +xmin = min(x); +ymax = max(y); +ymin = min(y); + +npist = size(unique(data, 'rows'),1); +nstep = ceil(npist^0.4) + 7; +xstep = (xmax-xmin)/(nstep-7); +ystep = (ymax-ymin)/(nstep-7); + +apuPisteet = zeros(nstep^2,2); + +for i=1:nstep + apuPisteet((i-1)*nstep+1 : i*nstep,1) = xmin + (i-4)*xstep; + apuPisteet((i-1)*nstep+1 : i*nstep,2) = ymin + ((1:nstep)-4)*ystep; +end + + + +[V,C] = voronoin(apuPisteet,{'Qt','Qbb','Qc','Qz'}); + +if 0 + figure + hold on + for i=1:length(C) + if isempty(find(C{i} == 1)) + X = V(C{i},:); + hull = convhull(X(:,1),X(:,2)); + plot(X(hull,1), X(hull,2)); + end + end + axis([-2 7 -2 8]); + plot(data(:,1), data(:,2), 'r*'); + plot(apuPisteet(:,1), apuPisteet(:,2), 'b+'); + + hold off +end +empty = zeros(nstep^2,1); + +for i = 1:length(C) + if isempty(find(C{i} == 1)) %Tutkitaan vain rajoitetut solut + vx = V(C{i},1); + vy = V(C{i},2); + IN = any(inpolygon(x,y,vx,vy)); + if IN == 0 + empty(i) = 1; + end + + end +end + +empty = find(empty == 1); +C = C(empty); + +apuPisteet = apuPisteet(empty, :); + +if 0 + figure + hold on + for i=1:length(C) + if isempty(find(C{i} == 1)) + X = V(C{i},:); + hull = convhull(X(:,1),X(:,2)); + plot(X(hull,1), X(hull,2)); + end + end + plot(data(:,1), data(:,2), 'r*'); + plot(apuPisteet(:,1), apuPisteet(:,2), 'b+'); + axis([-2 7 -2 8]); + hold off +end + +uusiData = [data; apuPisteet]; + + diff --git a/matlab/spatial/findCliques.m b/matlab/spatial/findCliques.m new file mode 100644 index 0000000..90effd6 --- /dev/null +++ b/matlab/spatial/findCliques.m @@ -0,0 +1,509 @@ +function [cliques, separators, G] = findCliques(M) +%Muuttaa graafin M kolmioituvaksi ja laskee siitä klikit ja +%separaattorit. +%Hyödynnetään Kevin Murphyn algoritmeja Graph Theory toolboxista. +%Päivitetty 12.8.2005 + +order=elim_order(M,ones(length(M))); +[G,cliques]=triangulate(M,order); +[jtree,root]=cliques_to_jtree(cliques,ones(length(M))); +ncliq=length(cliques); +separators=cell(ncliq-1,1); %n-solmuisessa puussa n-1 viivaa + +jono=zeros(length(ncliq)); +jono(1)=root; +i=1; +pointer=2; %Seuraava tyhjä paikka + +while ~isempty(find(jono~=0)) %Puun leveyssuuntainen läpikäynti + lapset=find(jtree(jono(i),:)~=0); + jtree(:,jono(i))=0; %Klikki käsitelty + jono(pointer:pointer+length(lapset)-1)=lapset; + for j=1:length(lapset) + ehdokas = myintersect(cliques{jono(i)},cliques{lapset(j)}); + kelpaa = 1; + for k = 1:(pointer+j-3) + % Tutkitaan, että separaattoriehdokasta ei vielä käsitelty + if isequal(ehdokas,separators{k}) + kelpaa = 0; + end + end + if kelpaa + separators{pointer+j-2} = ehdokas; + end + end + jono(i)=0; + pointer=pointer+length(lapset); + i=i+1; +end + +notEmpty=zeros(ncliq-1,1); +for i=1:ncliq-1 + if ~isempty(separators{i}) + notEmpty(i)=1; + end +end +notEmpty=find(notEmpty==1); +separators=separators(notEmpty); + + +%-------------------------------------------------------------------------- +%-------------------------------------------------------------------------- + +function order = elim_order(G, node_sizes) +% BEST_FIRST_ELIM_ORDER Greedily search for an optimal elimination order. +% order = best_first_elim_order(moral_graph, node_sizes) +% +% Find an order in which to eliminate nodes from the graph in such a way as to try and minimize the +% weight of the resulting triangulated graph. The weight of a graph is the sum of the weights of each +% of its cliques; the weight of a clique is the product of the weights of each of its members; the +% weight of a node is the number of values it can take on. +% +% Since this is an NP-hard problem, we use the following greedy heuristic: +% at each step, eliminate that node which will result in the addition of the least +% number of fill-in edges, breaking ties by choosing the node that induces the lighest clique. +% For details, see +% - Kjaerulff, "Triangulation of graphs -- algorithms giving small total state space", +% Univ. Aalborg tech report, 1990 (www.cs.auc.dk/~uk) +% - C. Huang and A. Darwiche, "Inference in Belief Networks: A procedural guide", +% Intl. J. Approx. Reasoning, 11, 1994 +% + +% Warning: This code is pretty old and could probably be made faster. + +n = length(G); +%if nargin < 3, stage = { 1:n }; end % no constraints + +% For long DBNs, it may be useful to eliminate all the nodes in slice t before slice t+1. +% This will ensure that the jtree has a repeating structure (at least away from both edges). +% This is why we have stages. +% See the discussion of splicing jtrees on p68 of +% Geoff Zweig's PhD thesis, Dept. Comp. Sci., UC Berkeley, 1998. +% This constraint can increase the clique size significantly. + +MG = G; % copy the original graph +uneliminated = ones(1,n); +order = zeros(1,n); +%t = 1; % Counts which time slice we are on +for i=1:n + U = find(uneliminated); + %valid = myintersect(U, stage{t}); + valid = U; + % Choose the best node from the set of valid candidates + min_fill = zeros(1,length(valid)); + min_weight = zeros(1,length(valid)); + for j=1:length(valid) + k = valid(j); + nbrs = myintersect(neighbors(G, k), U); + l = length(nbrs); + M = MG(nbrs,nbrs); + min_fill(j) = l^2 - sum(M(:)); % num. added edges + min_weight(j) = prod(node_sizes([k nbrs])); % weight of clique + end + lightest_nbrs = find(min_weight==min(min_weight)); + % break ties using min-fill heuristic + best_nbr_ndx = argmin(min_fill(lightest_nbrs)); + j = lightest_nbrs(best_nbr_ndx); % we will eliminate the j'th element of valid + %j1s = find(score1==min(score1)); + %j = j1s(argmin(score2(j1s))); + k = valid(j); + uneliminated(k) = 0; + order(i) = k; + ns = myintersect(neighbors(G, k), U); + if ~isempty(ns) + G(ns,ns) = 1; + G = setdiag(G,0); + end + %if ~any(logical(uneliminated(stage{t}))) % are we allowed to the next slice? + % t = t + 1; + %end +end + +%-------------------------------------------------------------------------- + +function [G, cliques, fill_ins] = triangulate(G, order) +% TRIANGULATE Ensure G is triangulated (chordal), i.e., every cycle of length > 3 has a chord. +% [G, cliques, fill_ins, cliques_containing_node] = triangulate(G, order) +% +% cliques{i} is the i'th maximal complete subgraph of the triangulated graph. +% fill_ins(i,j) = 1 iff we add a fill-in arc between i and j. +% +% To find the maximal cliques, we save each induced cluster (created by adding connecting +% neighbors) that is not a subset of any previously saved cluster. (A cluster is a complete, +% but not necessarily maximal, set of nodes.) + +MG = G; +n = length(G); +eliminated = zeros(1,n); +cliques = {}; +for i=1:n + u = order(i); + U = find(~eliminated); % uneliminated + nodes = myintersect(neighbors(G,u), U); % look up neighbors in the partially filled-in graph + nodes = myunion(nodes, u); % the clique will always contain at least u + G(nodes,nodes) = 1; % make them all connected to each other + G = setdiag(G,0); + eliminated(u) = 1; + + exclude = 0; + for c=1:length(cliques) + if mysubset(nodes,cliques{c}) % not maximal + exclude = 1; + break; + end + end + if ~exclude + cnum = length(cliques)+1; + cliques{cnum} = nodes; + end +end + +%fill_ins = sparse(triu(max(0, G - MG), 1)); +fill_ins=1; + +%-------------------------------------------------------------------------- + +function [jtree, root, B, w] = cliques_to_jtree(cliques, ns) +% MK_JTREE Make an optimal junction tree. +% [jtree, root, B, w] = mk_jtree(cliques, ns) +% +% A junction tree is a tree that satisfies the jtree property, which says: +% for each pair of cliques U,V with intersection S, all cliques on the path between U and V +% contain S. (This ensures that local propagation leads to global consistency.) +% +% We can create a junction tree by computing the maximal spanning tree of the junction graph. +% (The junction graph connects all cliques, and the weight of an edge (i,j) is +% |C(i) intersect C(j)|, where C(i) is the i'th clique.) +% +% The best jtree is the maximal spanning tree which minimizes the sum of the costs on each edge, +% where cost(i,j) = w(C(i)) + w(C(j)), and w(C) is the weight of clique C, +% which is the total number of values C can take on. +% +% For details, see +% - Jensen and Jensen, "Optimal Junction Trees", UAI 94. +% +% Input: +% cliques{i} = nodes in clique i +% ns(i) = number of values node i can take on +% Output: +% jtree(i,j) = 1 iff cliques i and j aer connected +% root = the clique that should be used as root +% B(i,j) = 1 iff node j occurs in clique i +% w(i) = weight of clique i + + + +num_cliques = length(cliques); +w = zeros(num_cliques, 1); +B = sparse(num_cliques, 1); +for i=1:num_cliques + B(i, cliques{i}) = 1; + w(i) = prod(ns(cliques{i})); +end + + +% C1(i,j) = length(intersect(cliques{i}, cliques{j})); +% The length of the intersection of two sets is the dot product of their bit vector representation. +C1 = B*B'; +C1 = setdiag(C1, 0); + +% C2(i,j) = w(i) + w(j) +num_cliques = length(w); +W = repmat(w, 1, num_cliques); +C2 = W + W'; +C2 = setdiag(C2, 0); + +jtree = sparse(minimum_spanning_tree(-C1, C2)); % Using -C1 gives *maximum* spanning tree + +% The root is arbitrary, but since the first pass is towards the root, +% we would like this to correspond to going forward in time in a DBN. +root = num_cliques; + +%-------------------------------------------------------------------------- + + +function C = myintersect(A,B) +% MYINTERSECT Intersection of two sets of positive integers (much faster than built-in intersect) +% C = myintersect(A,B) + +A = A(:)'; B = B(:)'; + +if isempty(A) + ma = 0; +else + ma = max(A); +end + +if isempty(B) + mb = 0; +else + mb = max(B); +end + +if ma==0 | mb==0 + C = []; +else + %bits = sparse(1, max(ma,mb)); + bits = zeros(1, max(ma,mb)); + bits(A) = 1; + C = B(logical(bits(B))); +end + +%sum( bitget( bitand( cliquesb(i), cliquesb(j) ), 1:52 ) ); + +%-------------------------------------------------------------------------- + +function ns = neighbors(adj_mat, i) +% NEIGHBORS Find the parents and children of a node in a graph. +% ns = neighbors(adj_mat, i) + +%ns = myunion(children(adj_mat, i), parents(adj_mat, i)); +ns = find(adj_mat(i,:)); + +%-------------------------------------------------------------------------- + +function C = myunion(A,B) +% MYUNION Union of two sets of positive integers (much faster than built-in union) +% C = myunion(A,B) + +if isempty(A) + ma = 0; +else + ma = max(A); +end + +if isempty(B) + mb = 0; +else + mb = max(B); +end + +if ma==0 & mb==0 + C = []; +elseif ma==0 & mb>0 + C = B; +elseif ma>0 & mb==0 + C = A; +else + %bits = sparse(1, max(ma,mb)); + bits = zeros(1, max(ma,mb)); + bits(A) = 1; + bits(B) = 1; + C = find(bits); +end + +%-------------------------------------------------------------------------- + + +function ps = parents(adj_mat, i) +% PARENTS Return the list of parents of node i +% ps = parents(adj_mat, i) + +ps = find(adj_mat(:,i))'; + +%-------------------------------------------------------------------------- + +function cs = children(adj_mat, i, t) +% CHILDREN Return the indices of a node's children in sorted order +% c = children(adj_mat, i, t) +% +% t is an optional argument: if present, dag is assumed to be a 2-slice DBN + +if nargin < 3 + cs = find(adj_mat(i,:)); +else + if t==1 + cs = find(adj_mat(i,:)); + else + ss = length(adj_mat)/2; + j = i+ss; + cs = find(adj_mat(j,:)) + (t-2)*ss; + end +end + +%-------------------------------------------------------------------------- + +function p=mysubset(small,large) +% MYSUBSET Is the small set of +ve integers a subset of the large set? +% p = mysubset(small, large) + +% Surprisingly, this is not built-in. + +if isempty(small) + p = 1; % isempty(large); +else + p = length(myintersect(small,large)) == length(small); +end + +%-------------------------------------------------------------------------- + +function A = minimum_spanning_tree(C1, C2) +% +% Find the minimum spanning tree using Prim's algorithm. +% C1(i,j) is the primary cost of connecting i to j. +% C2(i,j) is the (optional) secondary cost of connecting i to j, used to break ties. +% We assume that absent edges have 0 cost. +% To find the maximum spanning tree, used -1*C. +% See Aho, Hopcroft & Ullman 1983, "Data structures and algorithms", p 237. + +% Prim's is O(V^2). Kruskal's algorithm is O(E log E) and hence is more efficient +% for sparse graphs, but is implemented in terms of a priority queue. + +% We partition the nodes into those in U and those not in U. +% closest(i) is the vertex in U that is closest to i in V-U. +% lowcost(i) is the cost of the edge (i, closest(i)), or infinity is i has been used. +% In Aho, they say C(i,j) should be "some appropriate large value" if the edge is missing. +% We set it to infinity. +% However, since lowcost is initialized from C, we must distinguish absent edges from used nodes. + +n = length(C1); +if nargin==1, C2 = zeros(n); end +A = zeros(n); + +closest = ones(1,n); +used = zeros(1,n); % contains the members of U +used(1) = 1; % start with node 1 +C1(find(C1==0))=inf; +C2(find(C2==0))=inf; +lowcost1 = C1(1,:); +lowcost2 = C2(1,:); + +for i=2:n + ks = find(lowcost1==min(lowcost1)); + k = ks(argmin(lowcost2(ks))); + A(k, closest(k)) = 1; + A(closest(k), k) = 1; + lowcost1(k) = inf; + lowcost2(k) = inf; + used(k) = 1; + NU = find(used==0); + for ji=1:length(NU) + for j=NU(ji) + if C1(k,j) < lowcost1(j) + lowcost1(j) = C1(k,j); + lowcost2(j) = C2(k,j); + closest(j) = k; + end + end + end +end + +%-------------------------------------------------------------------------- + +function indices = argmin(v) +% ARGMIN Return as a subscript vector the location of the smallest element of a multidimensional array v. +% indices = argmin(v) +% +% Returns the first minimum in the case of ties. +% Example: +% X = [2 8 4; 7 3 9]; +% argmin(X) = [1 1], i.e., row 1 column 1 + +[m i] = min(v(:)); +indices = ind2subv(mysize(v), i); + +%-------------------------------------------------------------------------- + +function M = setdiag(M, v) +% SETDIAG Set the diagonal of a matrix to a specified scalar/vector. +% M = set_diag(M, v) + +n = length(M); +if length(v)==1 + v = repmat(v, 1, n); +end + +% e.g., for 3x3 matrix, elements are numbered +% 1 4 7 +% 2 5 8 +% 3 6 9 +% so diagnoal = [1 5 9] + + +J = 1:n+1:n^2; +M(J) = v; + +%------------------------------------------------------------------------- + +function sz = mysize(M) +% MYSIZE Like the built-in size, except it returns n if M is a vector of length n, and 1 if M is a scalar. +% sz = mysize(M) +% +% The behavior is best explained by examples +% - M = rand(1,1), mysize(M) = 1, size(M) = [1 1] +% - M = rand(2,1), mysize(M) = 2, size(M) = [2 1] +% - M = rand(1,2), mysize(M) = 2, size(M) = [1 2] +% - M = rand(2,2,1), mysize(M) = [2 2], size(M) = [2 2] +% - M = rand(1,2,1), mysize(M) = 2, size(M) = [1 2] + +if myisvector(M) + sz = length(M); +else + sz = size(M); +end + +%-------------------------------------------------------------------------- + +function sub = ind2subv(siz, ndx) +% IND2SUBV Like the built-in ind2sub, but returns the answer as a row vector. +% sub = ind2subv(siz, ndx) +% +% siz and ndx can be row or column vectors. +% sub will be of size length(ndx) * length(siz). +% +% Example +% ind2subv([2 2 2], 1:8) returns +% [1 1 1 +% 2 1 1 +% ... +% 2 2 2] +% That is, the leftmost digit toggle fastest. +% +% See also SUBV2IND + +n = length(siz); + +if n==0 + sub = ndx; + return; +end + +if all(siz==2) + sub = dec2bitv(ndx-1, n); + sub = sub(:,n:-1:1)+1; + return; +end + +cp = [1 cumprod(siz(:)')]; +ndx = ndx(:) - 1; +sub = zeros(length(ndx), n); +for i = n:-1:1 % i'th digit + sub(:,i) = floor(ndx/cp(i))+1; + ndx = rem(ndx,cp(i)); +end + +%%%%%%%%%% + +function bits = dec2bitv(d,n) +% DEC2BITV Convert a decimal integer to a bit vector. +% bits = dec2bitv(d,n) is just like the built-in dec2bin, except the answer is a vector, not a string. +% n is an optional minimum length on the bit vector. +% If d is a vector, each row of the output array will be a bit vector. + + +if (nargin<2) + n=1; % Need at least one digit even for 0. +end +d = d(:); + +[f,e]=log2(max(d)); % How many digits do we need to represent the numbers? +bits=rem(floor(d*pow2(1-max(n,e):0)),2); + + +%------------------------------------------------------------------------ + +function r = myisvector(V) +%Kuten isvector(V) + +A = size(V); +r = (length(A) == 2) & (min(A) == 1); diff --git a/matlab/spatial/handleIndiFastaCase.m b/matlab/spatial/handleIndiFastaCase.m new file mode 100644 index 0000000..6041906 --- /dev/null +++ b/matlab/spatial/handleIndiFastaCase.m @@ -0,0 +1,307 @@ +function handleIndiFastaCase(cc,dist,Z) +% specicially written to handle FASTA file format of individual clustering +% Lu Cheng, 15.12.2012 + +OUTPUT_FILE = 'baps6_output.baps'; + +teksti = 'Input upper bound to the number of populations (only one value): '; +npopstextExtra = inputdlg(teksti ,'Input maximum number of populations',1,{'20'}); +if isempty(npopstextExtra) % Painettu Cancel:ia + return +else + nMaxPops = str2num(npopstextExtra{1}); + nMaxPops = nMaxPops(1); +end + +initPart = cluster_own(Z,nMaxPops); + +roundTypes = [2*ones(1,nMaxPops) ... + 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ... + 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ... + 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 ... + 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 1 1 1 1 ... + 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 ... + 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 ... + 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4]; + +[partition, logml, partitionSummary, logmldiff] = model_search_parallel(cc, initPart, dist, roundTypes); +cc.PARTITION = partition; +cc.npops = length(unique(partition)); +cc.logml = logml; +cc.partitionSummary = partitionSummary; +cc.logmldiff = logmldiff; + +if cc.npops==nMaxPops + choice = questdlg(sprintf('%d populations discovered, which is the same as input. We suggest you to set a larger number. Do you want to quit?', cc.npops),... + 'Yes''No','Yes'); + if strcmp(choice,'Yes') + return + end +end + +writeMixtureInfo(cc); + +popnames = cc.popnames; +pointers = cc.pointers; +vorPoints = cc.vorPoints; +vorCells = cc.vorCells; +coordinates = cc.coordinates; +heds = cc.heds; + +viewMixPartition(partition, popnames); + +if isequal(popnames, []) + names = pointers; +else + names = cell(size(pointers)); + indices = zeros(size(popnames(:,2))); + for i=1:length(popnames(:,2)); + indices(i) = popnames{i,2}; + end + for i = 1:length(pointers) + inds = pointers{i}; + namesInCell = []; + for j = 1:length(inds) + ind = inds(j); + I = find(indices > ind); + if isempty(I) + nameIndex = length(indices); + else + nameIndex = min(I) -1; + end + name = popnames{nameIndex}; + namesInCell = [namesInCell name]; + end + names{i} = namesInCell; + end +end +vorPlot(vorPoints, vorCells, partition, pointers, coordinates, names); + +talle = questdlg(['Do you want to save the mixture populations ' ... + 'so that you can use them later in admixture analysis or plot ' ... + 'additional images?'], ... + 'Save results?','Yes','No','Yes'); +if isequal(talle,'Yes') + %%waitALittle; % Hetki odotusta, jotta muistaa kysy?.. + [filename, pathname] = uiputfile('*.mat','Save results as'); + + if (filename == 0) & (pathname == 0) + % Cancel was pressed + return + else % copy 'baps4_output.baps' into the text file with the same name. + if exist(OUTPUT_FILE,'file') + copyfile(OUTPUT_FILE,[pathname filename '.txt']) + delete(OUTPUT_FILE) + end + end + + % added by Lu Cheng, 05.12.2012 + tmpFile = [pathname filename '.mapfile.txt']; + fid = fopen(tmpFile,'w+'); + fprintf(fid,'Name\tLatitude\tLongitude\tDescription\tLabel\n'); + if exist('heds','var') + for i=1:length(heds) + fprintf(fid,'%s\t%.10f\t%.10f\t%s_%d\t%d\n',heds{i},coordinates(i,1),coordinates(i,2),... + heds{i},partition(i),partition(i)); + end + else + for i=1:ninds + fprintf(fid,'%d\t%.10f\t%.10f\t%d_%d\t%d\n',i,coordinates(i,1),coordinates(i,2),... + i,partition(i),partition(i)); + end + end + fclose(fid); + +% save([pathname filename], 'c'); + format_type = 'FASTA'; + save([pathname filename], 'cc','dist','Z','format_type','-v7.3'); +else + if exist(OUTPUT_FILE,'file') + delete(OUTPUT_FILE) + end +end + + + +%%%%%%%%%%%%% +function writeMixtureInfo(c) + +outputFile = 'baps6_output.baps'; + +% output the semi-supervised clustering results to the outputFile +% modified by Lu Cheng, 28.03.2010 + +ninds = length(c.PARTITION); +npops = c.npops; +popnames = c.popnames; +logml = c.logml; +partition = c.PARTITION; +partitionSummary = c.partitionSummary; + +if isempty(popnames) + popnames = c.heds; +end + +if ~isempty(outputFile) + fid = fopen(outputFile,'w+'); +else + fid = -1; + %diary('baps5_semi_output.baps'); % save in text anyway. +end + +dispLine; +disp('RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:'); +disp(['Number of clustered individuals: ' ownNum2Str(ninds)]); +disp(['Number of groups in optimal partition: ' ownNum2Str(npops)]); +disp(['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); +disp(' '); +if (fid ~= -1) + fprintf(fid,'%10s\n', ['RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:']); + fprintf(fid,'%20s\n', ['Number of clustered individuals: ' ownNum2Str(ninds)]); + fprintf(fid,'%20s\n', ['Number of groups in optimal partition: ' ownNum2Str(npops)]); + fprintf(fid,'%20s\n\n', ['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); +end + +disp('Best Partition: '); +if (fid ~= -1) + fprintf(fid,'%s \n','Best Partition: '); +end +for m=1:npops + indsInM = find(partition==m); + + if isempty(indsInM) + continue; + end + + length_of_beginning = 11 + floor(log10(m)); + cluster_size = length(indsInM); + + text = ['Cluster ' num2str(m) ': {' char(popnames{indsInM(1)})]; + for k = 2:cluster_size + text = [text ', ' char(popnames{indsInM(k)})]; + end; + text = [text '}']; + + while length(text)>58 + %Take one line and display it. + new_line = takeLine(text,58); + text = text(length(new_line)+1:end); + disp(new_line); + if (fid ~= -1) + fprintf(fid,'%s \n',new_line); + end + if length(text)>0 + text = [blanks(length_of_beginning) text]; + else + text = []; + end; + end; + + if ~isempty(text) + disp(text); + if (fid ~= -1) + fprintf(fid,'%s \n',text); + end + end; +end + +names = true; + +logmldiff = c.logmldiff; +if npops == 1 + logmldiff = []; +else + disp(' '); + disp(' '); + disp('Changes in log(marginal likelihood) if indvidual i is moved to cluster j:'); + + if (fid ~= -1) + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', 'Changes in log(marginal likelihood) if indvidual i is moved to cluster j:'); fprintf(fid, '\n'); + end + + text = sprintf('%10s','ind'); + for ii = 1:npops + tmpstr = sprintf('\t%10s',num2str(ii)); + text = [text tmpstr]; + end + + disp(text); + if (fid ~= -1) + fprintf(fid, '%s \n', text); + end + + for ii = 1:ninds + text = sprintf('%10s',popnames{ii}); + for jj = 1:npops + tmpstr = sprintf('\t%10s',num2str(logmldiff(ii,jj),'%10.6f')); + text = [text tmpstr]; + end + + if ii<100 + disp(text); + elseif ii==101 + disp('.......................................'); + disp('..........see output file..............'); + end + if (fid ~= -1) + fprintf(fid, '%s \n', text); + end + text = []; + end +end + +disp(' '); +disp(' '); +disp('List of sizes of 10 best visited partitions and corresponding log(ml) values'); + +if (fid ~= -1) + fprintf(fid, '%s \n\n', ' '); + fprintf(fid, '%s \n', 'List of sizes of 10 best visited partitions and corresponding log(ml) values'); fprintf(fid, '\n'); +end + +partitionSummaryKaikki = partitionSummary; +partitionSummary =[]; +for i=1:size(partitionSummaryKaikki,3) + partitionSummary = [partitionSummary; partitionSummaryKaikki(:,:,i)]; +end +% [I,J] = find(partitionSummaryKaikki(:,2,:)>-1e49); +% partitionSummaryKaikki = partitionSummaryKaikki(I,:,:); + +partitionSummary = sortrows(partitionSummary,2); +partitionSummary = partitionSummary(size(partitionSummary,1):-1:1 , :); +partitionSummary = partitionSummary(logical(partitionSummary(:,2)>-1e49),:); +if size(partitionSummary,1)>10 + vikaPartitio = 10; +else + vikaPartitio = size(partitionSummary,1); +end +for part = 1:vikaPartitio + line = [num2str(partitionSummary(part,1),'%20d') ' ' num2str(partitionSummary(part,2),'%20.6f')]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', line); + end +end + +if (fid ~= -1) + fclose(fid); +else + diary off +end + + +%%%%%%%%%% + + +%-------------------------------------------------------------- +function newline = takeLine(description,width) +%Returns one line from the description: line ends to the first +%space after width:th mark. +% newLine = description(1:width); +n = width+1; +while ~isspace(description(n)) && n ind); + if isempty(I) + nameIndex = length(indices); + else + nameIndex = min(I) -1; + end + name = popnames{nameIndex}; + namesInCell = [namesInCell name]; + end + names{i} = namesInCell; + end +end +viewMixPartition(partition, popnames); +vorPlot(vorPoints, vorCells, partition, pointers, coordinates, names); + +talle = questdlg(['Do you want to save the mixture populations ' ... + 'so that you can use them later in admixture analysis or plot ' ... + 'additional images?'], ... + 'Save results?','Yes','No','Yes'); +if isequal(talle,'Yes') + %%waitALittle; % Hetki odotusta, jotta muistaa kysy?.. + [filename, pathname] = uiputfile('*.mat','Save results as'); + + if (filename == 0) & (pathname == 0) + % Cancel was pressed + return + else % copy 'baps4_output.baps' into the text file with the same name. + if exist(OUTPUT_FILE,'file') + copyfile(OUTPUT_FILE,[pathname filename '.txt']) + delete(OUTPUT_FILE) + end + end + + % added by Lu Cheng, 05.12.2012 + tmpFile = [pathname filename '.mapfile.txt']; + fid = fopen(tmpFile,'w+'); + fprintf(fid,'GroupLabel\tLatitude\tLongitude\tDescription\tLabel\n'); + for i=1:nPregroup + fprintf(fid,'%d\t%.10f\t%.10f\t%d_%d\t%d\n',i,coordinates(i,1),coordinates(i,2),... + i,partition(i),partition(i)); + end + fclose(fid); + +% save([pathname filename], 'c'); + format_type = 'FASTA'; + save([pathname filename], 'cc','partition','pgDist','pgPart','format_type','-v7.3'); +else + if exist(OUTPUT_FILE,'file') + delete(OUTPUT_FILE) + end +end + + + +%%%%%%%%%%%%% +function writeMixtureInfo(c) + +outputFile = 'baps6_output.baps'; + +% output the semi-supervised clustering results to the outputFile +% modified by Lu Cheng, 28.03.2010 + +ninds = length(c.PARTITION); +npops = c.npops; +popnames = c.popnames; +logml = c.logml; +partition = c.PARTITION; +partitionSummary = c.partitionSummary; + +if isempty(popnames) + popnames = cell(c.nPregroup,1); + for i=1:c.nPregroup + popnames{i} = num2str(i); + end +end + +if ~isempty(outputFile) + fid = fopen(outputFile,'w+'); +else + fid = -1; + %diary('baps5_semi_output.baps'); % save in text anyway. +end + +dispLine; +disp('RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:'); +disp(['Number of clustered individuals: ' ownNum2Str(ninds)]); +disp(['Number of groups in optimal partition: ' ownNum2Str(npops)]); +disp(['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); +disp(' '); +if (fid ~= -1) + fprintf(fid,'%10s\n', ['RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:']); + fprintf(fid,'%20s\n', ['Number of clustered individuals: ' ownNum2Str(ninds)]); + fprintf(fid,'%20s\n', ['Number of groups in optimal partition: ' ownNum2Str(npops)]); + fprintf(fid,'%20s\n\n', ['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); +end + +disp('Best Partition: '); +if (fid ~= -1) + fprintf(fid,'%s \n','Best Partition: '); +end +for m=1:npops + indsInM = unique(c.groupPartition(partition==m)); + + if isempty(indsInM) + continue; + end + + length_of_beginning = 11 + floor(log10(m)); + cluster_size = length(indsInM); + + text = ['Cluster ' num2str(m) ': {' char(popnames{indsInM(1)})]; + for k = 2:cluster_size + text = [text ', ' char(popnames{indsInM(k)})]; + end; + text = [text '}']; + + while length(text)>58 + %Take one line and display it. + new_line = takeLine(text,58); + text = text(length(new_line)+1:end); + disp(new_line); + if (fid ~= -1) + fprintf(fid,'%s \n',new_line); + end + if length(text)>0 + text = [blanks(length_of_beginning) text]; + else + text = []; + end; + end; + + if ~isempty(text) + disp(text); + if (fid ~= -1) + fprintf(fid,'%s \n',text); + end + end; +end + +names = true; + +logmldiff = c.logmldiff; +if npops == 1 + logmldiff = []; +else + disp(' '); + disp(' '); + disp('Changes in log(marginal likelihood) if pregroup i is moved to cluster j:'); + + if (fid ~= -1) + fprintf(fid, '%s \n', ' '); fprintf(fid, '\n'); + fprintf(fid, '%s \n', 'Changes in log(marginal likelihood) if indvidual i is moved to cluster j:'); fprintf(fid, '\n'); + end + + text = sprintf('%10s','ind'); + for ii = 1:npops + tmpstr = sprintf('\t%10s',num2str(ii)); + text = [text tmpstr]; + end + + disp(text); + if (fid ~= -1) + fprintf(fid, '%s \n', text); + end + + for ii = 1:c.nPregroup + text = sprintf('%10s',popnames{ii}); + for jj = 1:npops + tmpstr = sprintf('\t%10s',num2str(logmldiff(ii,jj),'%10.6f')); + text = [text tmpstr]; + end + + if ii<100 + disp(text); + elseif ii==101 + disp('.......................................'); + disp('..........see output file..............'); + end + if (fid ~= -1) + fprintf(fid, '%s \n', text); + end + text = []; + end +end + +disp(' '); +disp(' '); +disp('List of sizes of 10 best visited partitions and corresponding log(ml) values'); + +if (fid ~= -1) + fprintf(fid, '%s \n\n', ' '); + fprintf(fid, '%s \n', 'List of sizes of 10 best visited partitions and corresponding log(ml) values'); fprintf(fid, '\n'); +end + +partitionSummaryKaikki = partitionSummary; +partitionSummary =[]; +for i=1:size(partitionSummaryKaikki,3) + partitionSummary = [partitionSummary; partitionSummaryKaikki(:,:,i)]; +end +% [I,J] = find(partitionSummaryKaikki(:,2,:)>-1e49); +% partitionSummaryKaikki = partitionSummaryKaikki(I,:,:); + +partitionSummary = sortrows(partitionSummary,2); +partitionSummary = partitionSummary(size(partitionSummary,1):-1:1 , :); +partitionSummary = partitionSummary(logical(partitionSummary(:,2)>-1e49),:); +if size(partitionSummary,1)>10 + vikaPartitio = 10; +else + vikaPartitio = size(partitionSummary,1); +end +for part = 1:vikaPartitio + line = [num2str(partitionSummary(part,1),'%20d') ' ' num2str(partitionSummary(part,2),'%20.6f')]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', line); + end +end + +if (fid ~= -1) + fclose(fid); +else + diary off +end + + +%%%%%%%%%% + + +%-------------------------------------------------------------- +function newline = takeLine(description,width) +%Returns one line from the description: line ends to the first +%space after width:th mark. +% newLine = description(1:width); +n = width+1; +while ~isspace(description(n)) && n1e-5) + % Tapahtui muutos + muutoksia = 1; + kivaluku = kivaluku+1; + updateGlobalVariables(ind, i2, rowsFromInd, diffInCounts,... + adjprior, priorTerm); + logml = logml+maxMuutos; + + end + end + + elseif round==2 %Populaation yhdistäminen toiseen. + maxMuutos = 0; + for pop = 1:npops + [muutokset, diffInCounts] = laskeMuutokset2(pop, rowsFromInd, ... + data, adjprior, priorTerm); + [isoin, indeksi] = max(muutokset); + if isoin>maxMuutos + maxMuutos = isoin; + i1 = pop; + i2 = indeksi; + diffInCountsBest = diffInCounts; + end + end + + if maxMuutos>1e-5 + muutoksia = 1; + updateGlobalVariables2(i1,i2,rowsFromInd, diffInCountsBest, ... + adjprior, priorTerm); + logml = logml + maxMuutos; + end + + + elseif round==3 | round==4 %Populaation jakaminen osiin. + maxMuutos = 0; + ninds = size(data,1)/rowsFromInd; + for pop = 1:npops + inds2 = find(PARTITION_IN==pop); + ninds2 = length(inds2); + if ninds2>5 + dist2 = laskeOsaDist(inds2, dist, ninds); + Z2 = linkage(dist2'); + if round==3 + npops2 = min(20, floor(ninds2 / 5)); %Moneenko osaan jaetaan + elseif round==4 + npops2 = 2; + end + T2 = cluster_own(Z2, npops2); + muutokset = laskeMuutokset3(T2, inds2, rowsFromInd, data, ... + adjprior, priorTerm, pop); + [isoin, indeksi] = max(muutokset(1:end)); + if isoin>maxMuutos + maxMuutos = isoin; + muuttuvaPop2 = rem(indeksi,npops2); + if muuttuvaPop2==0, muuttuvaPop2 = npops2; end + muuttuvat = inds2(find(T2==muuttuvaPop2)); + i2 = ceil(indeksi/npops2); + end + end + end + if maxMuutos>1e-5 + muutoksia = 1; + rows = computeRows(rowsFromInd, muuttuvat, length(muuttuvat)); + diffInCounts = computeDiffInCounts(rows, size(COUNTS_IN,1), ... + size(COUNTS_IN,2), data); + i1 = PARTITION_IN(muuttuvat(1)); + updateGlobalVariables3(muuttuvat, rowsFromInd, diffInCounts, ... + adjprior, priorTerm, i2); + logml = logml + maxMuutos; + + end + + elseif round == 5 | round == 6 + pop=0; + muutettu = 0; + poplogml = POP_LOGML_IN; + partition = PARTITION_IN; + counts = COUNTS_IN; + sumcounts = SUMCOUNTS_IN; + + while (pop < npops & muutettu == 0) + pop = pop+1; + totalMuutos = 0; + inds = find(PARTITION_IN==pop); + if round == 5 + aputaulu = [inds rand(length(inds),1)]; + aputaulu = sortrows(aputaulu,2); + inds = aputaulu(:,1)'; + elseif round == 6 + inds = returnInOrder(inds, pop, rowsFromInd, data, adjprior, priorTerm); + end + + i=0; + + while (length(inds)>0 & i 1e-5 + i=length(inds); + end + end + end + + if totalMuutos>1e-5 + muutettu=1; + muutoksia = 1; % Ulompi kirjanpito. + else + % Missään vaiheessa tila ei parantunut. + % Perutaan kaikki muutokset. + PARTITION_IN = partition; + SUMCOUNTS_IN = sumcounts; + POP_LOGML_IN = poplogml; + COUNTS_IN = counts; + logml = logml - totalMuutos; + end + end + clear partition; clear sumcounts; clear counts; clear poplogml; + end + end + + + if muutoksia == 0 + if vaihe==1 + vaihe = 2; + elseif vaihe==2 + vaihe = 3; + elseif vaihe==3 + vaihe = 4; + elseif vaihe==4; + vaihe = 5; + elseif vaihe==5 + ready = 1; + end + else + muutoksia = 0; + end + + if ready==0 + if vaihe==1 + roundTypes=[1]; + elseif vaihe==2 + roundTypes = [2]; + elseif vaihe==3 + roundTypes=[5]; + elseif vaihe==4 + roundTypes=[4 3 1]; + elseif vaihe + roundTypes=[6 2 3 4 1]; + end + end +end + +partition = PARTITION_IN; +counts = COUNTS_IN; +sumcounts = SUMCOUNTS_IN; + + + +%------------------------------------------------------------------------------------- + +function [sumcounts, counts, logml] = ... + initialCounts(partition, data, npops, rowsFromInd, noalle) + +nloci=size(data,2); +ninds = size(data,1)/rowsFromInd; + +counts = zeros(max(noalle),nloci,npops); +sumcounts = zeros(npops,nloci); +for i=1:npops + for j=1:nloci + havainnotLokuksessa = find(partition==i & data(:,j)>=0); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(havainnotLokuksessa,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + +initializeGammaln(ninds, rowsFromInd, max(noalle)); + +logml = computeLogml(counts, sumcounts, noalle, data, rowsFromInd); + + +%----------------------------------------------------------------------- + + +function logml=computeLogml(counts, sumcounts, noalle, data, rowsFromInd) +nloci = size(counts,2); +npops = size(counts,3); +adjnoalle = zeros(max(noalle),nloci); +for j=1:nloci + adjnoalle(1:noalle(j),j)=noalle(j); + if (noalle(j)0 + rows = computeRows(rowsFromInd, inds, ninds); + diffInCounts = computeDiffInCounts(rows, size(COUNTS_IN,1), size(COUNTS_IN,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS_IN(:,:,i1) = COUNTS_IN(:,:,i1)-diffInCounts; + SUMCOUNTS_IN(i1,:) = SUMCOUNTS_IN(i1,:)-diffInSumCounts; + new_i1_logml = computePopulationLogml(i1, adjprior, priorTerm); + COUNTS_IN(:,:,i1) = COUNTS_IN(:,:,i1)+diffInCounts; + SUMCOUNTS_IN(i1,:) = SUMCOUNTS_IN(i1,:)+diffInSumCounts; + + i2 = [1:i1-1 , i1+1:npops]; + i2_logml = POP_LOGML_IN(i2)'; + + COUNTS_IN(:,:,i2) = COUNTS_IN(:,:,i2)+repmat(diffInCounts, [1 1 npops-1]); + SUMCOUNTS_IN(i2,:) = SUMCOUNTS_IN(i2,:)+repmat(diffInSumCounts,[npops-1 1]); + new_i2_logml = computePopulationLogml(i2, adjprior, priorTerm)'; + COUNTS_IN(:,:,i2) = COUNTS_IN(:,:,i2)-repmat(diffInCounts, [1 1 npops-1]); + SUMCOUNTS_IN(i2,:) = SUMCOUNTS_IN(i2,:)-repmat(diffInSumCounts,[npops-1 1]); + + muutokset(pop2,i2) = new_i1_logml - i1_logml ... + + new_i2_logml - i2_logml; + end +end + + +%------------------------------------------------------------------------------------ + +function diffInCounts = computeDiffInCounts(rows, max_noalle, nloci, data) +% Muodostaa max_noalle*nloci taulukon, jossa on niiden alleelien +% lukumäärät (vastaavasti kuin COUNTS_IN:issa), jotka ovat data:n +% riveillä rows. + +diffInCounts = zeros(max_noalle, nloci); +for i=rows + row = data(i,:); + notEmpty = find(row>=0); + + if length(notEmpty)>0 + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) = ... + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) + 1; + end +end + + + +%------------------------------------------------------------------------------------ + + +function popLogml = computePopulationLogml(pops, adjprior, priorTerm) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + +global COUNTS_IN; +global SUMCOUNTS_IN; +x = size(COUNTS_IN,1); +y = size(COUNTS_IN,2); +z = length(pops); + +popLogml = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior,[1 1 length(pops)]) + COUNTS_IN(:,:,pops)) ... + ,[x y z]),1),2)) - sum(gammaln(1+SUMCOUNTS_IN(pops,:)),2) - priorTerm; + + +%---------------------------------------------------------------------------- + + +function dist2 = laskeOsaDist(inds2, dist, ninds) +% Muodostaa dist vektorista osavektorin, joka sisältää yksilöiden inds2 +% väliset etäisyydet. ninds=kaikkien yksilöiden lukumäärä. + +ninds2 = length(inds2); +apu = zeros(nchoosek(ninds2,2),2); +rivi = 1; +for i=1:ninds2-1 + for j=i+1:ninds2 + apu(rivi, 1) = inds2(i); + apu(rivi, 2) = inds2(j); + rivi = rivi+1; + end +end +apu = (apu(:,1)-1).*ninds - apu(:,1) ./ 2 .* (apu(:,1)-1) + (apu(:,2)-apu(:,1)); +dist2 = dist(apu); + + +%---------------------------------------------------------------------------------------- + + +function Z = linkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 | m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end \ No newline at end of file diff --git a/matlab/spatial/initSpatialMultiMixture.m b/matlab/spatial/initSpatialMultiMixture.m new file mode 100644 index 0000000..4916db1 --- /dev/null +++ b/matlab/spatial/initSpatialMultiMixture.m @@ -0,0 +1,18 @@ +function [partition, counts, sumcounts] = initSpatialMultiMixture(initData, ... + npops, Z, rows, noalle, dist, adjprior, priorTerm, fixedK); +% Etsii spatial multimixturelle alkutilan baps 3.1:n ahneella algoritmilla. +% toimii! + +global PARTITION; global COUNTS; +global SUMCOUNTS; global POP_LOGML; + +c.data = initData; c.Z = Z; c.rows=rows; c.rowsFromInd=0; c.noalle=noalle; +c.dist = dist; c.adjprior = adjprior; c.priorTerm = priorTerm; + +indMix_fixK(c,npops,1,0); + +partition = PARTITION; counts = COUNTS; sumcounts = SUMCOUNTS; + + + + diff --git a/matlab/spatial/private/calcLogmlChanges.m b/matlab/spatial/private/calcLogmlChanges.m new file mode 100644 index 0000000..22f4eb8 --- /dev/null +++ b/matlab/spatial/private/calcLogmlChanges.m @@ -0,0 +1,82 @@ +function changes = calcLogmlChanges(inds, cqData, nCqCodes, spData, nSpCodes, locCliques, locSeparators, logml) +% compute the logml change if the given inds are moved to another cluster +% the input inds are supposed to come from the same cluster +% changes is a npops*1 vector +% Lu Cheng, 15.12.2012 + +global CQ_COUNTS; global SUM_CQ_COUNTS; +global SP_COUNTS; global SUM_SP_COUNTS; +global PARTITION; + +global LOC_CQ_COUNTS; +global LOC_SP_COUNTS; + +npops = size(CQ_COUNTS,3); +changes = zeros(npops,1); +indsToBeMoved = inds; + +if isempty(indsToBeMoved), return, end + +i1 = PARTITION(indsToBeMoved(1)); +[diffCqCounts diffCqSumCounts]= computeDiffInCounts(indsToBeMoved, cqData, nCqCodes); +[diffSpCounts diffSpSumCounts]= computeDiffInCounts(indsToBeMoved, spData, nSpCodes); + +diffLocCqCounts = computeDiffInCliqCounts(locCliques, indsToBeMoved); +diffLocSpCounts = computeDiffInCliqCounts(locSeparators, indsToBeMoved); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1) - diffCqCounts; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1) - diffSpCounts; + +SUM_CQ_COUNTS(:,i1) = SUM_CQ_COUNTS(:,i1) - diffCqSumCounts; +SUM_SP_COUNTS(:,i1) = SUM_SP_COUNTS(:,i1) - diffSpSumCounts; + +LOC_CQ_COUNTS(:,i1) = LOC_CQ_COUNTS(:,i1) - diffLocCqCounts; +LOC_SP_COUNTS(:,i1) = LOC_SP_COUNTS(:,i1) - diffLocSpCounts; + +% PARTITION(inds) = -1; +updateLogmlTable(i1); + +for i2 = 1:npops + if i2 ~= i1 + CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2) + diffCqCounts; + SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2) + diffSpCounts; + + SUM_CQ_COUNTS(:,i2) = SUM_CQ_COUNTS(:,i2) + diffCqSumCounts; + SUM_SP_COUNTS(:,i2) = SUM_SP_COUNTS(:,i2) + diffSpSumCounts; + + LOC_CQ_COUNTS(:,i2) = LOC_CQ_COUNTS(:,i2) + diffLocCqCounts; + LOC_SP_COUNTS(:,i2) = LOC_SP_COUNTS(:,i2) + diffLocSpCounts; + +% PARTITION(inds) = i2; + updateLogmlTable(i2); + logml_new = computeTotalLogml(); + changes(i2) = logml_new - logml; + + CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2) - diffCqCounts; + SUM_CQ_COUNTS(:,i2) = SUM_CQ_COUNTS(:,i2) - diffCqSumCounts; + + SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2) - diffSpCounts; + SUM_SP_COUNTS(:,i2) = SUM_SP_COUNTS(:,i2) - diffSpSumCounts; + + LOC_CQ_COUNTS(:,i2) = LOC_CQ_COUNTS(:,i2) - diffLocCqCounts; + LOC_SP_COUNTS(:,i2) = LOC_SP_COUNTS(:,i2) - diffLocSpCounts; + +% PARTITION(inds) = -1; + updateLogmlTable(i2); + end +end + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1) + diffCqCounts; +SUM_CQ_COUNTS(:,i1) = SUM_CQ_COUNTS(:,i1) + diffCqSumCounts; + +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1) + diffSpCounts; +SUM_SP_COUNTS(:,i1) = SUM_SP_COUNTS(:,i1) + diffSpSumCounts; + +LOC_CQ_COUNTS(:,i1) = LOC_CQ_COUNTS(:,i1) + diffLocCqCounts; +LOC_SP_COUNTS(:,i1) = LOC_SP_COUNTS(:,i1) + diffLocSpCounts; + +% PARTITION(inds) = i1; +updateLogmlTable(i1); + + +%--------------------------------------------------------------------- \ No newline at end of file diff --git a/matlab/spatial/private/clearGlobalVars.m b/matlab/spatial/private/clearGlobalVars.m new file mode 100644 index 0000000..a6fd932 --- /dev/null +++ b/matlab/spatial/private/clearGlobalVars.m @@ -0,0 +1,18 @@ +function clearGlobalVars +% Lu Cheng, 15.12.2012 + +global COUNTS; COUNTS = []; +global SUMCOUNTS; SUMCOUNTS = []; +global PARTITION; PARTITION = []; +global LOGML_TABLE; LOGML_TABLE = []; +global ADDITION_DIFFERENCE; ADDITION_DIFFERENCE = []; +global REMOVAL_DIFFERENCE; REMOVAL_DIFFERENCE = []; +global JOIN_DIFFERENCE; JOIN_DIFFERENCE = []; +global CQ_COUNTS; CQ_COUNTS = []; +global SP_COUNTS; SP_COUNTS = []; +global SUM_CQ_COUNTS; SUM_CQ_COUNTS = []; +global SUM_SP_COUNTS; SUM_SP_COUNTS = []; +global CQ_PRIOR; CQ_PRIOR = []; +global SP_PRIOR; SP_PRIOR = []; +global LOC_SP_COUNTS; LOC_SP_COUNTS = []; +global LOC_CQ_COUNTS; LOC_CQ_COUNTS = []; \ No newline at end of file diff --git a/matlab/spatial/private/cluster_own.m b/matlab/spatial/private/cluster_own.m new file mode 100644 index 0000000..6d9dab6 --- /dev/null +++ b/matlab/spatial/private/cluster_own.m @@ -0,0 +1,52 @@ +function T = cluster_own(Z,nclust) +% search down the dendogram from the root, until nclust clusters are found +% comments added by Lu Cheng +% 04.01.2011 + +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); + +% maximum number of clusters based on inconsistency +if m <= maxclust + T = (1:m)'; +elseif maxclust==1 + T = ones(m,1); +else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end +end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + diff --git a/matlab/spatial/private/computeCounts.m b/matlab/spatial/private/computeCounts.m new file mode 100644 index 0000000..a03967f --- /dev/null +++ b/matlab/spatial/private/computeCounts.m @@ -0,0 +1,27 @@ +function [cliqcounts, sepcounts] = computeCounts(cliques, separators, npops) + +global PARTITION; +ncliq = size(cliques,1); +nsep = size(separators,1); + +cliqPartition = zeros(ncliq, size(cliques,2)); +sepPartition = zeros(nsep, size(separators, 2)); + +apuCliq = find(cliques > 0); +apuSep = find(separators > 0); + +cliqPartition(apuCliq) = PARTITION(cliques(apuCliq)); +sepPartition(apuSep) = PARTITION(separators(apuSep)); + + +cliqcounts = zeros(ncliq, npops); +for i = 1:npops + cliqcounts(:,i) = sum(cliqPartition == i, 2); +end + +sepcounts = zeros(nsep, npops); +for i = 1:npops + sepcounts(:,i) = sum(sepPartition == i, 2); +end + +%------------------------------------------------------------------------- \ No newline at end of file diff --git a/matlab/spatial/private/computeDiffInCliqCounts.m b/matlab/spatial/private/computeDiffInCliqCounts.m new file mode 100644 index 0000000..1a99b6c --- /dev/null +++ b/matlab/spatial/private/computeDiffInCliqCounts.m @@ -0,0 +1,19 @@ +function diffInCliqCounts = computeDiffInCliqCounts(cliques, inds) +% Laskee muutoksen CLIQCOUNTS:ssa (tai SEPCOUNTS:ssa, jos syötteen? +% separators) kun yksilöt inds siirretään. +% diffInCliqcounts on ncliq*1 taulu, joka on CLIQCOUNTS:n sarakkeesta josta +% yksilöt inds siirretään ja lisättäv?sarakkeeseen, johon yksilöt +% siirretään. + +% taken from spatial model of Jukka Siren's code +% Lu Cheng +% 15.12.2012 + +ncliq = size(cliques,1); +diffInCliqCounts = zeros(ncliq,1); +ninds = length(inds); +for i = 1:ninds + ind = inds(i); + rivit = sum((cliques == ind),2); + diffInCliqCounts = diffInCliqCounts + rivit; +end \ No newline at end of file diff --git a/matlab/spatial/private/computeDiffInCounts.m b/matlab/spatial/private/computeDiffInCounts.m new file mode 100644 index 0000000..df807fa --- /dev/null +++ b/matlab/spatial/private/computeDiffInCounts.m @@ -0,0 +1,7 @@ +function [counts sumcounts] = computeDiffInCounts(rows, data, nLetters) +% calculate the counts of the given rows of the data (ninds*nLoci) +% nLetters is the maximum number of different symbols over all loci +% Lu Cheng, 25.05.2011 + +counts = histc(data(rows,:),1:nLetters,1); +sumcounts = sum(counts,1)'; diff --git a/matlab/spatial/private/computeTotalLogml.m b/matlab/spatial/private/computeTotalLogml.m new file mode 100644 index 0000000..d12588e --- /dev/null +++ b/matlab/spatial/private/computeTotalLogml.m @@ -0,0 +1,42 @@ +function logml = computeTotalLogml +% compute the log marginal likelihood of the data +% Lu Cheng, 15.12.2012 + +global LOGML_TABLE; +global LOC_CQ_COUNTS; +global LOC_SP_COUNTS; + + +notEmpty = any(LOC_CQ_COUNTS,1); +npops = length(find(notEmpty == 1)); + +% the following codes added by Lu Cheng, 15.12.2012 +% some lines might all be zero if some sequence is deleted +tmpIndsCq = find(any(LOC_CQ_COUNTS,2)); +tmpIndsSp = find(any(LOC_SP_COUNTS,2)); + +locCqCounts = LOC_CQ_COUNTS(tmpIndsCq,notEmpty); +locSpCounts = LOC_SP_COUNTS(tmpIndsSp,notEmpty); + +sumcliq=sum(locCqCounts, 2); +sumsep=sum(locSpCounts, 2); + +ncliq = length(tmpIndsCq); +nsep = length(tmpIndsSp); +cliqsizes = sum(locCqCounts, 2)'; +sepsizes = sum(locSpCounts, 2)'; +cliqsizes = min([cliqsizes; npops*ones(1,ncliq)])'; +sepsizes = min([sepsizes; npops*ones(1,nsep)])'; + +klikkitn = sum(sum(gammaln(locCqCounts + repmat(1./cliqsizes, [1 npops])))) ... + - sum(npops*(gammaln(1./cliqsizes))) ... + - sum(gammaln(sumcliq + 1)); + +septn = sum(sum(gammaln(locSpCounts + repmat(1./sepsizes, [1 npops])))) ... + - sum(npops*(gammaln(1./sepsizes))) ... + - sum(gammaln(sumsep + 1)); + +spatialPrior = (klikkitn - septn); + + +logml = sum(LOGML_TABLE) + spatialPrior; \ No newline at end of file diff --git a/matlab/spatial/private/initialCounts2.m b/matlab/spatial/private/initialCounts2.m new file mode 100644 index 0000000..03d80cf --- /dev/null +++ b/matlab/spatial/private/initialCounts2.m @@ -0,0 +1,17 @@ +function [sumcounts, counts] = initialCounts2(partition, data, npops, nLetters) +% initialize counts and sumcounts for the initial partition +% npops: number of populations in the partition +% nLetters: the maximum number of different symbols over all loci +% Lu Cheng, 25.05.2011 + +[nSeq nLoci] = size(data); + +counts = zeros(nLetters,nLoci,npops); +sumcounts = zeros(nLoci,npops); + +for i=1:npops + inds = (partition==i); + counts(:,:,i) = histc(data(inds,:),1:nLetters,1); + sumcounts(:,i) = sum(counts(:,:,i),1); +end + diff --git a/matlab/spatial/private/model_search_parallel.m b/matlab/spatial/private/model_search_parallel.m new file mode 100644 index 0000000..14304a3 --- /dev/null +++ b/matlab/spatial/private/model_search_parallel.m @@ -0,0 +1,366 @@ +function [partition, logml, partitionSummary, logmldiff] = model_search_parallel(c, partition, orig_dist, roundTypes) +% This function clusters DNA alignment using "codon" model in Corander and Tang's +% paper: Bayesian analysis of population structure based on linked +% molecular information (2007), Mathematical Biosciences +% c: preprocessed data for the sequence alignment +% partition: initial partition of the individuals +% origdist: hamming distance between individuals, indexed by +% (1,2)(1,3)(14)...(2,3)(2,4).....(3,4)...(n-1,n) +% roundTypes: array of operation types + +% Lu Cheng +% 15.12.2012 + +interactive = false; + +global PARTITION; +global CQ_COUNTS;global SUM_CQ_COUNTS; +global SP_COUNTS;global SUM_SP_COUNTS; +global CQ_PRIOR; global SP_PRIOR; +global LOGML_TABLE; +global ADDITION_DIFFERENCE; +global REMOVAL_DIFFERENCE; +global JOIN_DIFFERENCE; + +global LOC_SP_COUNTS; +global LOC_CQ_COUNTS; + +clearGlobalVars; + +nPOPS = length(unique(partition)); + +% PRIOR VALUES: +CQ_PRIOR = c.cqPrior; +SP_PRIOR = c.spPrior; + +% Initialize PARTITION, **_COUNTS, SUM_**_COUNTS, alnMat +[sumCqCounts, cqCounts] = initialCounts2(partition, c.cqData, nPOPS, c.nMaxCqCodes); +[sumSpCounts, spCounts] = initialCounts2(partition, c.spData, nPOPS, c.nMaxSpCodes); + +CQ_COUNTS = cqCounts; SUM_CQ_COUNTS = sumCqCounts; +SP_COUNTS = spCounts; SUM_SP_COUNTS = sumSpCounts; + +PARTITION = partition; +[cliqcounts, sepcounts] = computeCounts(c.locCliques, c.locSeparators, nPOPS); + +LOC_CQ_COUNTS = cliqcounts; +LOC_SP_COUNTS = sepcounts; + +% alnMat = c.alnMat; +partitionSummary = -Inf*ones(30,2,nPOPS); % Tiedot 30 parhaasta partitiosta (npops ja logml) +partitionSummary(:,1,:) = zeros(30,1,nPOPS); +worstLogml = -Inf*ones(1, nPOPS); worstIndex = ones(1, nPOPS); + +clear partition cqCounts sumCqCounts spCounts sumSpCounts + +% Initialize LOGML_TABLE: +nINDS = c.nSeq; +LOGML_TABLE = zeros(nPOPS,1); +updateLogmlTable(1:nPOPS); + +REMOVAL_DIFFERENCE = zeros(nINDS,1); +REMOVAL_DIFFERENCE(:,:) = nan; +ADDITION_DIFFERENCE = zeros(nINDS,nPOPS); +ADDITION_DIFFERENCE(:,:) = nan; +JOIN_DIFFERENCE = zeros(nPOPS, nPOPS); +JOIN_DIFFERENCE(:,:) = nan; + +% ***********Doc:******************** +% REMOVAL_DIFFERENCE(ind) tells the change in logml if ind is removed from +% its cluster. nan, if the cluster has changed, since the value was last +% calculated. +% +% ADDITION_DIFFERENCE(ind, pop) tells the change in logml if ind is added +% to cluster pop. nan, if the cluster has changed since the value was last +% calculated. Always nan, if pop is ind's own cluster. +% +% JOIN_DIFFERENCE(pop1,pop2) = tells the change in logml if pop1 and pop2 +% are combined. nan, if either cluster has changed since the value was last +% calculated. +% ***********Doc end***************** + +logml = computeTotalLogml; + +disp('The beginning:'); +% disp(['Partition: ' num2str(PARTITION')]); +disp(['Nclusters: ' num2str(length(unique(PARTITION)))]); +disp(['Log(ml*prior): ' num2str(logml)]); +disp(' '); + + +nnotEmptyPops = length(unique(PARTITION)); +if logml>worstLogml(nnotEmptyPops); + [partitionSummary(:,:,nnotEmptyPops), added] = addToSummary(logml, ... + partitionSummary(:,:,nnotEmptyPops), worstIndex(nnotEmptyPops)); + if (added==1) + [worstLogml(nnotEmptyPops), worstIndex(nnotEmptyPops)] = ... + min(partitionSummary(:,2,nnotEmptyPops)); + end +end + +% START SEARCH OF THE BEST PARTITION: + +vipu = zeros(1,14); +if interactive + roundTypes = input('Input steps: '); + if ischar(roundTypes), roundTypes = str2num(roundTypes); end +end +ready = 0; + + +while ready ~= 1 + +% disp(['Performing steps: ' num2str(roundTypes)]); + + for n = 1:length(roundTypes) + round = roundTypes(n); + moveCounter = 0; + + if round==1 && vipu(1)==0 % move an individual to another population + +% inds = randperm(nINDS); + inds = getMoveInds(orig_dist,nINDS); % get inds to be moved + + for ind = inds(:)' + update_difference_tables(ind, c.cqData, c.nMaxCqCodes, ... + c.spData, c.nMaxSpCodes, c.locCliques, c.locSeparators, logml); + tmpDiff = REMOVAL_DIFFERENCE(ind) + ADDITION_DIFFERENCE(ind,:); + tmpDiff(PARTITION(ind)) = 0; + [maxChange, maxIndex] = max(tmpDiff); + if maxChange>1e-5 + updateGlobalVariables(ind, maxIndex, c.cqData, c.nMaxCqCodes, ... + c.spData, c.nMaxSpCodes, c.locCliques, c.locSeparators); +% fprintf('moving from %d to %d.\n',PARTITION(ind),maxIndex) + logml = computeTotalLogml(); + moveCounter = moveCounter+1; + vipu = zeros(1,14); + + nnotEmptyPops = length(unique(PARTITION)); + if logml>worstLogml(nnotEmptyPops); + [partitionSummary(:,:,nnotEmptyPops), added] = addToSummary(logml, ... + partitionSummary(:,:,nnotEmptyPops), worstIndex(nnotEmptyPops)); + if (added==1) + [worstLogml(nnotEmptyPops), worstIndex(nnotEmptyPops)] = ... + min(partitionSummary(:,2,nnotEmptyPops)); + end + end + end + end + if moveCounter==0, vipu(1)=1; end + disp(['Step 1: ' num2str(moveCounter) ' individuals were moved.']); + + elseif round==2 && vipu(2)==0 % join two populations + + update_join_difference(c.cqData, c.nMaxCqCodes, ... + c.spData, c.nMaxSpCodes, c.locCliques, c.locSeparators, logml); + [maxChange, aux] = max(JOIN_DIFFERENCE(:)); + [i1, i2] = ind2sub([nPOPS,nPOPS],aux); + + if maxChange>1e-5 + tmpInds = find(PARTITION==i1); + updateGlobalVariables(tmpInds, i2, c.cqData, c.nMaxCqCodes, ... + c.spData, c.nMaxSpCodes, c.locCliques, c.locSeparators); + logml = computeTotalLogml; + + disp(['Step 2: Clusters ' num2str(i1) ' and ' num2str(i2) ' combined.']); + vipu = zeros(1,14); + + nnotEmptyPops = length(unique(PARTITION)); + if logml>worstLogml(nnotEmptyPops); + [partitionSummary(:,:,nnotEmptyPops), added] = addToSummary(logml, ... + partitionSummary(:,:,nnotEmptyPops), worstIndex(nnotEmptyPops)); + if (added==1) + [worstLogml(nnotEmptyPops), worstIndex(nnotEmptyPops)] = ... + min(partitionSummary(:,2,nnotEmptyPops)); + end + end + else + disp('Step 2: no changes.'); + vipu(2)=1; + end + elseif ismember(round, 3:4) && vipu(round)==0 % Split a population, and move one subpopulation to another population + + pops = randperm(nPOPS); + + splitFlags = zeros(nPOPS,1); + for pop = pops(:)' + + maxChange = 0; + indsToBeMoved = []; + + inds2 = find(PARTITION==pop); + ninds2 = length(inds2); + if ninds2>4 + + if round==3 + dist3 = getDistance(inds2, orig_dist, nINDS); + npops2 = min(20, floor(ninds2 / 5)); %Moneenko osaan jaetaan + elseif round==4 + dist3 = getDistance(inds2, orig_dist, nINDS); + npops2 = 2; + end + + Z3 = linkage(dist3); + T3 = cluster_own(Z3, npops2); + + for i = 1:npops2 + indsX = inds2(T3==i); indsX = indsX'; + tmpChanges = calcLogmlChanges(indsX, c.cqData, c.nMaxCqCodes, ... + c.spData, c.nMaxSpCodes, c.locCliques, c.locSeparators, logml); + [tmpMaxChange, tmpMaxPop] = max(tmpChanges); + if tmpMaxChange>maxChange + maxChange = tmpMaxChange; + % i1 = pop; + i2 = tmpMaxPop; + indsToBeMoved = indsX; + end + end + if maxChange>1e-5 + updateGlobalVariables(indsToBeMoved, i2, c.cqData, c.nMaxCqCodes, ... + c.spData, c.nMaxSpCodes, c.locCliques, c.locSeparators); + logml = computeTotalLogml; + splitFlags(pop)=1; + + nnotEmptyPops = length(unique(PARTITION)); + if logml>worstLogml(nnotEmptyPops); + [partitionSummary(:,:,nnotEmptyPops), added] = addToSummary(logml, ... + partitionSummary(:,:,nnotEmptyPops), worstIndex(nnotEmptyPops)); + if (added==1) + [worstLogml(nnotEmptyPops), worstIndex(nnotEmptyPops)] = ... + min(partitionSummary(:,2,nnotEmptyPops)); + end + end + end + end + end + if any(splitFlags) + disp(['Step ' num2str(round) ': ' num2str(sum(splitFlags)) ' populations were split.']); + vipu = zeros(1,14); + else + disp(['Step ' num2str(round) ': no changes.']); + vipu(round)=1; + end + end + end + + if interactive + roundTypes = input('Input extra steps: '); + if ischar(roundTypes), roundTypes = str2num(roundTypes); end + else + roundTypes = []; + end + + if isempty(roundTypes) + ready = 1; + end +end + +%disp(' '); +disp('BEST PARTITION: '); +%disp(['Partition: ' num2str(PARTITION')]); +disp(['Nclusters: ' num2str(length(unique(PARTITION)))]); +disp(['Log(ml): ' num2str(logml)]); +disp(' '); + +nPOPS= rmEmptyPopulation(c.locCliques, c.locSeparators); +ADDITION_DIFFERENCE(:) = NaN; +REMOVAL_DIFFERENCE(:) = NaN; + +logmldiff = zeros(nINDS,nPOPS); % the change of logml if individual i is moved to group j +for i=1:nINDS + update_difference_tables(i, c.cqData, c.nMaxCqCodes, ... + c.spData, c.nMaxSpCodes, c.locCliques, c.locSeparators, logml); + logmldiff(i,:) = REMOVAL_DIFFERENCE(i)+ADDITION_DIFFERENCE(i,:); + if all(isnan(logmldiff(i,:))) + keyboard + end +end +logmldiff(isnan(logmldiff))=0; +partition = PARTITION; + + + + +%---------------------------------------------------------------------------- + + +function [dist2, dind1, dind2] = getDistance(inds2, dist_orig, ninds) +% pick out the distrances between samples in "inds2" from "dist_orig" +% dist_orig specifies the distances of (1,2),(1,3),(1,4)......(ninds-1,ninds) +% Lu Cheng, 22.06.2011 + +if ~issorted(inds2) + error('inds2 is not in ascending order!'); +end + +ninds2 = length(inds2); +apu = zeros(nchoosek(ninds2,2),2); +irow = 1; +for i=1:ninds2-1 + for j=i+1:ninds2 + apu(irow, 1) = inds2(i); + apu(irow, 2) = inds2(j); + irow = irow+1; + end +end + +dind1 = apu(:,1); +dind2 = apu(:,2); + +apu = (apu(:,1)-1).*ninds - apu(:,1) ./ 2 .* (apu(:,1)-1) + (apu(:,2)-apu(:,1)); +dist2 = dist_orig(apu); + + +%--------------------------------------------------------------- + + +function inds = getMoveInds(dist_orig, ninds) +% get individual indexs to be moved to another cluster +% we always take the 30% individuals of each cluster which are most distant +% to each other +% Lu Cheng, 25.05.2011 + +global PARTITION; + +pops = unique(PARTITION); +inds = []; + +for tmpPop = pops(:)' + tmpInds = find(PARTITION == tmpPop)'; + + if(length(tmpInds)<20) + inds = [inds tmpInds(:)']; %#ok + continue; + end + + [tmpDist, dind1, dind2] = getDistance(tmpInds,dist_orig,ninds); + tmpSDist = sort(tmpDist,'Descend'); + tmpInds2 = find(tmpDist>tmpSDist(round(length(tmpSDist)*0.3))); + tmpInds3 = union(unique(dind1(tmpInds2)), unique(dind2(tmpInds2))); + inds = [inds tmpInds3(:)']; %#ok +end + + + + + +% ------------------------------------------------------------ + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex) +% Tiedetään, ett?annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ss?ei viel?ole +% annettua logml arvoa, niin lisätään worstIndex:in kohtaan uusi logml ja +% nykyist?partitiota vastaava nclusters:in arvo. Muutoin ei tehd?mitään. +global PARTITION; + +apu = find(abs(partitionSummary(:,2)-logml)<1e-5); +if isempty(apu) + % Nyt löydetty partitio ei ole viel?kirjattuna summaryyn. + + npops = length(unique(PARTITION)); + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end \ No newline at end of file diff --git a/matlab/spatial/private/model_search_pregroup.m b/matlab/spatial/private/model_search_pregroup.m new file mode 100644 index 0000000..2c57d1a --- /dev/null +++ b/matlab/spatial/private/model_search_pregroup.m @@ -0,0 +1,385 @@ +function [partition, logml, partitionSummary, logmldiff] = model_search_pregroup(c, pgPart, pgDist, roundTypes, nMaxPops) +% This function clusters DNA alignment using "codon" model in Corander and Tang's +% paper: Bayesian analysis of population structure based on linked +% molecular information (2007), Mathematical Biosciences +% c: preprocessed data for the sequence alignment +% pgPart: partition which assign sequences to pregroups +% pgDist: distances between the pregroups +% (1,2)(1,3)(1,4)...(2,3)(2,4).....(3,4)...(n-1,n) +% roundTypes: array of operation types + +% Lu Cheng +% 21.03.2012 + +interactive = false; + +global PARTITION; +global CQ_COUNTS;global SUM_CQ_COUNTS; +global SP_COUNTS;global SUM_SP_COUNTS; +global CQ_PRIOR; global SP_PRIOR; +global LOGML_TABLE; +global ADDITION_DIFFERENCE; +global REMOVAL_DIFFERENCE; +global JOIN_DIFFERENCE; + +global LOC_SP_COUNTS; +global LOC_CQ_COUNTS; + +clearGlobalVars; + +nINDS = c.nSeq; +nPOPS = nMaxPops; + +% load pregroup information +nPregroup = length(unique(pgPart)); +if nPregroupworstLogml(nnotEmptyPops); + [partitionSummary(:,:,nnotEmptyPops), added] = addToSummary(logml, ... + partitionSummary(:,:,nnotEmptyPops), worstIndex(nnotEmptyPops)); + if (added==1) + [worstLogml(nnotEmptyPops), worstIndex(nnotEmptyPops)] = ... + min(partitionSummary(:,2,nnotEmptyPops)); + end +end + +% START SEARCH OF THE BEST PARTITION: + +vipu = zeros(1,14); +if interactive + roundTypes = input('Input steps: '); + if ischar(roundTypes), roundTypes = str2num(roundTypes); end +end +ready = 0; + + +while ready ~= 1 + +% disp(['Performing steps: ' num2str(roundTypes)]); + + for n = 1:length(roundTypes) + round = roundTypes(n); + moveCounter = 0; + + if round==1 && vipu(1)==0 % move an individual to another population + + pgInds = getMoveInds(pgPart,pgDist,nPregroup); % get pregroup inds to be moved + + for pgind = pgInds(:)' +% inds = cell2mat(pregroups(pgInds)); + tmpInds = pregroups{pgind}; + tmpChanges = calcLogmlChanges(tmpInds, c.cqData, c.nMaxCqCodes, ... + c.spData, c.nMaxSpCodes, c.locCliques, c.locSeparators, logml); + + [maxChange, maxIndex] = max(tmpChanges); + if maxChange>1e-5 + updateGlobalVariables(tmpInds, maxIndex, c.cqData, c.nMaxCqCodes, ... + c.spData, c.nMaxSpCodes,c.locCliques, c.locSeparators); +% fprintf('moving from %d to %d.\n',PARTITION(ind),maxIndex) + logml = computeTotalLogml(); + moveCounter = moveCounter+length(pgInds); + vipu = zeros(1,14); + + nnotEmptyPops = length(unique(PARTITION)); + if logml>worstLogml(nnotEmptyPops); + [partitionSummary(:,:,nnotEmptyPops), added] = addToSummary(logml, ... + partitionSummary(:,:,nnotEmptyPops), worstIndex(nnotEmptyPops)); + if (added==1) + [worstLogml(nnotEmptyPops), worstIndex(nnotEmptyPops)] = ... + min(partitionSummary(:,2,nnotEmptyPops)); + end + end + end + end + if moveCounter==0, vipu(1)=1; end + disp(['Step 1: ' num2str(moveCounter) ' pregroups were moved.']); + + elseif round==2 && vipu(2)==0 % join two populations + + update_join_difference(c.cqData, c.nMaxCqCodes, ... + c.spData, c.nMaxSpCodes, c.locCliques, c.locSeparators, logml); + [maxChange, aux] = max(JOIN_DIFFERENCE(:)); + [i1, i2] = ind2sub([nPOPS,nPOPS],aux); + + if maxChange>1e-5 + tmpInds = find(PARTITION==i1); + updateGlobalVariables(tmpInds, i2, c.cqData, c.nMaxCqCodes, ... + c.spData, c.nMaxSpCodes, c.locCliques, c.locSeparators); + logml = computeTotalLogml; + + disp(['Step 2: Clusters ' num2str(i1) ' and ' num2str(i2) ' combined.']); + vipu = zeros(1,14); + + nnotEmptyPops = length(unique(PARTITION)); + if logml>worstLogml(nnotEmptyPops); + [partitionSummary(:,:,nnotEmptyPops), added] = addToSummary(logml, ... + partitionSummary(:,:,nnotEmptyPops), worstIndex(nnotEmptyPops)); + if (added==1) + [worstLogml(nnotEmptyPops), worstIndex(nnotEmptyPops)] = ... + min(partitionSummary(:,2,nnotEmptyPops)); + end + end + else + disp('Step 2: no changes.'); + vipu(2)=1; + end + elseif ismember(round, 3:4) && vipu(round)==0 % Split a population, and move one subpopulation to another population + + pops = randperm(nPOPS); + + splitFlags = zeros(nPOPS,1); + for pop = pops(:)' + + maxChange = 0; + indsToBeMoved = []; + + inds2 = find(PARTITION==pop); + pgInds2 = unique(pgPart(inds2)); + nPgInds2 = length(unique(pgPart(inds2))); + if nPgInds2>4 + + if round==3 + dist3 = getDistance(pgInds2,pgDist,nPregroup); + npops2 = min(20, floor(nPgInds2 / 5)); + elseif round==4 + dist3 = getDistance(pgInds2,pgDist,nPregroup); + npops2 = 2; + end + + Z3 = linkage(dist3(:)','complete'); + T3 = cluster(Z3, 'maxclust', npops2); + + for i = 1:npops2 + indsX = pgInds2(T3==i); + indsX = cell2mat(pregroups(indsX)); + tmpChanges = calcLogmlChanges(indsX, c.cqData, c.nMaxCqCodes, ... + c.spData, c.nMaxSpCodes, c.locCliques, c.locSeparators, logml); + [tmpMaxChange, tmpMaxPop] = max(tmpChanges); + if tmpMaxChange>maxChange + maxChange = tmpMaxChange; + % i1 = pop; + i2 = tmpMaxPop; + indsToBeMoved = indsX; + end + end + if maxChange>1e-5 + updateGlobalVariables(indsToBeMoved, i2, c.cqData, c.nMaxCqCodes, ... + c.spData, c.nMaxSpCodes, c.locCliques, c.locSeparators); + logml = computeTotalLogml; + splitFlags(pop)=1; + + nnotEmptyPops = length(unique(PARTITION)); + if logml>worstLogml(nnotEmptyPops); + [partitionSummary(:,:,nnotEmptyPops), added] = addToSummary(logml, ... + partitionSummary(:,:,nnotEmptyPops), worstIndex(nnotEmptyPops)); + if (added==1) + [worstLogml(nnotEmptyPops), worstIndex(nnotEmptyPops)] = ... + min(partitionSummary(:,2,nnotEmptyPops)); + end + end + end + end + end + if any(splitFlags) + disp(['Step ' num2str(round) ': ' num2str(sum(splitFlags)) ' populations were split.']); + vipu = zeros(1,14); + else + disp(['Step ' num2str(round) ': no changes.']); + vipu(round)=1; + end + end + end + + if interactive + roundTypes = input('Input extra steps: '); + if ischar(roundTypes), roundTypes = str2num(roundTypes); end + else + roundTypes = []; + end + + if isempty(roundTypes) + ready = 1; + end +end + +% disp(' '); +disp('BEST PARTITION: '); +% disp(['Partition: ' num2str(PARTITION')]); +disp(['Nclusters: ' num2str(length(unique(PARTITION)))]); +disp(['Log(ml): ' num2str(logml)]); +disp(' '); + +nPOPS = rmEmptyPopulation(c.locCliques, c.locSeparators); + +logmldiff = zeros(nPregroup,nPOPS); % the change of logml if pregroup i is moved to group j +for i=1:nPregroup + tmpInds = pregroups{i}; + tmpChanges = calcLogmlChanges(tmpInds, c.cqData, c.nMaxCqCodes, ... + c.spData, c.nMaxSpCodes, c.locCliques, c.locSeparators, logml); + logmldiff(i,:) = tmpChanges'; +end +logmldiff(isnan(logmldiff))=0; + +partition = zeros(nPregroup,1); +for i=1:nPregroup + partition(i)=unique(PARTITION(pgPart==i)); +end + +%---------------------------------------------------------------------------- + + +function [dist2, dind1, dind2] = getDistance(inds2, origDist, ninds) +% pick out the distrances between samples in "inds2" from "origDist" +% origDist specifies the distances of (1,2),(1,3),(1,4)......(ninds-1,ninds) +% Lu Cheng, 22.06.2011 + +if ~issorted(inds2) + error('inds2 is not in ascending order!'); +end + +ninds2 = length(inds2); +apu = zeros(nchoosek(ninds2,2),2); +irow = 1; +for i=1:ninds2-1 + for j=i+1:ninds2 + apu(irow, 1) = inds2(i); + apu(irow, 2) = inds2(j); + irow = irow+1; + end +end + +dind1 = apu(:,1); +dind2 = apu(:,2); + +apu = (apu(:,1)-1).*ninds - apu(:,1) ./ 2 .* (apu(:,1)-1) + (apu(:,2)-apu(:,1)); +dist2 = origDist(apu); + +%--------------------------------------------------------------- + + +function inds = getMoveInds(pgPart, pgDist, nPregroup) +% get pregroup indexs to be moved to another cluster +% we always take the 35% pregroups of each cluster which are most distant +% to each other +% Lu Cheng, 22.06.2011 + +global PARTITION; + +pops = unique(PARTITION); +inds = []; + +for tmpPop = pops(:)' + tmpInds = unique(pgPart(PARTITION==tmpPop)); + + if(length(tmpInds)<20) + inds = [inds tmpInds(:)']; %#ok + continue; + end + + [tmpDist, dind1, dind2] = getDistance(tmpInds,pgDist,nPregroup); + tmpVal = quantile(tmpDist,0.65); + tmpInds2 = find(tmpDist>tmpVal); + tmpInds3 = union(unique(dind1(tmpInds2)), unique(dind2(tmpInds2))); + inds = [inds tmpInds3(:)']; %#ok +end + + +% ------------------------------------------------------------ + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex) +% Tiedetään, ett?annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ss?ei viel?ole +% annettua logml arvoa, niin lisätään worstIndex:in kohtaan uusi logml ja +% nykyist?partitiota vastaava nclusters:in arvo. Muutoin ei tehd?mitään. +global PARTITION; + +apu = find(abs(partitionSummary(:,2)-logml)<1e-5); +if isempty(apu) + % Nyt löydetty partitio ei ole viel?kirjattuna summaryyn. + + npops = length(unique(PARTITION)); + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end + + + + diff --git a/matlab/spatial/private/preprocAln.m b/matlab/spatial/private/preprocAln.m new file mode 100644 index 0000000..517791b --- /dev/null +++ b/matlab/spatial/private/preprocAln.m @@ -0,0 +1,97 @@ +function c = preprocAln(alnMat) +% This function preprocess the alignment matrix to cliques and separators +% Lu Cheng, 24.05.2011 + +[nSeq nLoci] = size(alnMat); + +alnCell = mat2cell(alnMat,nSeq,ones(1,nLoci)); + +arrUniqBase = cellfun(@unique,alnCell,'UniformOutput',false); % unique base at each loci +arrUniqBaseNum = cellfun(@length,arrUniqBase); + +arrCqNum = arrUniqBaseNum(1:end-2).*arrUniqBaseNum(2:end-1).*arrUniqBaseNum(3:end); +arrSpNum = arrUniqBaseNum(2:end-2).*arrUniqBaseNum(3:end-1); + +nMaxCqCodes = max(arrCqNum); +nMaxSpCodes = max(arrSpNum); + +cqCodes = cellfun(@myProd,arrUniqBase(1:end-2),arrUniqBase(2:end-1),arrUniqBase(3:end), ... + 'UniformOutput',false); +spCodes = cellfun(@myProd,arrUniqBase(2:end-2),arrUniqBase(3:end-1), ... + 'UniformOutput',false); + +cqData = zeros(nSeq,length(cqCodes)); +spData = zeros(nSeq,length(spCodes)); + +cqCounts = zeros(nMaxCqCodes,length(cqCodes)); +spCounts = zeros(nMaxSpCodes,length(spCodes)); + +cqPrior = ones(nMaxCqCodes,length(cqCodes)); +spPrior = ones(nMaxSpCodes,length(spCodes)); + +for i=1:nLoci-2 + + nCodeTmp = size(cqCodes{i},1); + for j=1:nCodeTmp + tmpInds = ismember(alnMat(:,i:i+2),cqCodes{i}(j,:),'rows'); + cqData(tmpInds,i) = j; + cqCounts(j,i) = sum(tmpInds); + end + + cqPrior(1:nCodeTmp,i) = 1/nCodeTmp; + + if i==1 + continue; + end + + k=i-1; + nCodeTmp = size(spCodes{k},1); + for j=1:nCodeTmp + tmpInds = ismember(alnMat(:,i:i+1),spCodes{k}(j,:),'rows'); + spData(tmpInds,k) = j; + spCounts(j,k) = sum(tmpInds); + end + + spPrior(1:nCodeTmp,k) = 1/nCodeTmp; +end + +c.nSeq = nSeq; +% c.alnMat = alnMat; + +c.arrUniqBase = arrUniqBase; +c.arrUniqBaseNum = arrUniqBaseNum; + +c.nMaxCqCodes = nMaxCqCodes; +c.nMaxSpCodes = nMaxSpCodes; + +c.cqCodes = cqCodes; +c.spCodes = spCodes; + +c.cqData = cqData; +c.spData = spData; + +c.cqCounts = cqCounts; +c.spCounts = spCounts; + +c.cqPrior = cqPrior; +c.spPrior = spPrior; + + +function y = myProd(varargin) +% calculate the cartesian product for the input +% Lu Cheng, 24.05.2011 + +if nargin==2 + set1 = varargin{1}; + set2 = varargin{2}; + [t1 t2] = meshgrid(set1,set2); + y = [t1(:) t2(:)]; +elseif nargin==3 + set1 = varargin{1}; + set2 = varargin{2}; + set3 = varargin{3}; + [t1 t2 t3] = meshgrid(set1,set2,set3); + y = [t1(:) t2(:) t3(:)]; +else + y = []; +end \ No newline at end of file diff --git a/matlab/spatial/private/rmEmptyPopulation.m b/matlab/spatial/private/rmEmptyPopulation.m new file mode 100644 index 0000000..93f3b95 --- /dev/null +++ b/matlab/spatial/private/rmEmptyPopulation.m @@ -0,0 +1,41 @@ +function [npops notEmpty] = rmEmptyPopulation(locCliques,locSeparators) +% remove empty populations from CQ_COUNTS and SUM_CQ_COUNTS, SP_COUNTS, +% SUM_SP_COUNTS +% update PARTITION +% Lu Cheng, 15.12.2012 + +global CQ_COUNTS; global SUM_CQ_COUNTS; +global SP_COUNTS; global SUM_SP_COUNTS; +global PARTITION; + +global LOGML_TABLE; +global ADDITION_DIFFERENCE; +global JOIN_DIFFERENCE; + +global LOC_CQ_COUNTS; +global LOC_SP_COUNTS; + +notEmpty = find(any(SUM_CQ_COUNTS,1) & any(SUM_SP_COUNTS,1)); + +CQ_COUNTS = CQ_COUNTS(:,:,notEmpty); +SP_COUNTS = SP_COUNTS(:,:,notEmpty); + +SUM_CQ_COUNTS = SUM_CQ_COUNTS(:,notEmpty); +SUM_SP_COUNTS = SUM_SP_COUNTS(:,notEmpty); + +LOGML_TABLE = LOGML_TABLE(notEmpty); +ADDITION_DIFFERENCE = ADDITION_DIFFERENCE(:,notEmpty); +JOIN_DIFFERENCE = JOIN_DIFFERENCE(notEmpty,notEmpty); + +for i=1:length(notEmpty) + apu = (PARTITION==notEmpty(i)); + PARTITION(apu)=i; +end + +npops = length(notEmpty); + +[cliqcounts, sepcounts] = computeCounts(locCliques, locSeparators, npops); + +LOC_CQ_COUNTS = cliqcounts; +LOC_SP_COUNTS = sepcounts; + diff --git a/matlab/spatial/private/updateGlobalVariables.m b/matlab/spatial/private/updateGlobalVariables.m new file mode 100644 index 0000000..154adf7 --- /dev/null +++ b/matlab/spatial/private/updateGlobalVariables.m @@ -0,0 +1,61 @@ +function updateGlobalVariables(inds, i2, cqData, nCqCodes, spData, nSpCodes, locCliques, locSeparators) +% this function moves the samples specified by "inds" to cluser i2 +% then update all the global variables, "inds" are supposed to come from the +% same cluster +% Lu Cheng, 15.12.2012 + +global PARTITION; +global CQ_COUNTS; global SUM_CQ_COUNTS; +global SP_COUNTS; global SUM_SP_COUNTS; +global ADDITION_DIFFERENCE; +global REMOVAL_DIFFERENCE; +global JOIN_DIFFERENCE; + +global LOC_SP_COUNTS; +global LOC_CQ_COUNTS; + +i1 = PARTITION(inds(1)); +PARTITION(inds)=i2; + +[diffCqCounts diffCqSumCounts]= computeDiffInCounts(inds, cqData, nCqCodes); +[diffSpCounts diffSpSumCounts]= computeDiffInCounts(inds, spData, nSpCodes); + +diffLocCqCounts = computeDiffInCliqCounts(locCliques, inds); +diffLocSpCounts = computeDiffInCliqCounts(locSeparators, inds); + +CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1) - diffCqCounts; +SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1) - diffSpCounts; + +SUM_CQ_COUNTS(:,i1) = SUM_CQ_COUNTS(:,i1) - diffCqSumCounts; +SUM_SP_COUNTS(:,i1) = SUM_SP_COUNTS(:,i1) - diffSpSumCounts; + +LOC_CQ_COUNTS(:,i1) = LOC_CQ_COUNTS(:,i1) - diffLocCqCounts; +LOC_SP_COUNTS(:,i1) = LOC_SP_COUNTS(:,i1) - diffLocSpCounts; + +CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2) + diffCqCounts; +SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2) + diffSpCounts; + +SUM_CQ_COUNTS(:,i2) = SUM_CQ_COUNTS(:,i2) + diffCqSumCounts; +SUM_SP_COUNTS(:,i2) = SUM_SP_COUNTS(:,i2) + diffSpSumCounts; + +LOC_CQ_COUNTS(:,i2) = LOC_CQ_COUNTS(:,i2) + diffLocCqCounts; +LOC_SP_COUNTS(:,i2) = LOC_SP_COUNTS(:,i2) + diffLocSpCounts; + +updateLogmlTable([i1 i2]); + +REMOVAL_DIFFERENCE(PARTITION==i1) = nan; +REMOVAL_DIFFERENCE(PARTITION==i2) = nan; +ADDITION_DIFFERENCE(:,[i1 i2]) = nan; + +JOIN_DIFFERENCE(:,i2) = nan; +JOIN_DIFFERENCE(i2,:) = nan; + +if ~any(PARTITION==i1) + % i1 became empty + JOIN_DIFFERENCE(:,i1) = 0; + JOIN_DIFFERENCE(i1,:) = 0; + JOIN_DIFFERENCE(i1,i1) = nan; +else + JOIN_DIFFERENCE(:,i1) = nan; + JOIN_DIFFERENCE(i1,:) = nan; +end \ No newline at end of file diff --git a/matlab/spatial/private/updateLogmlTable.m b/matlab/spatial/private/updateLogmlTable.m new file mode 100644 index 0000000..65d7fd5 --- /dev/null +++ b/matlab/spatial/private/updateLogmlTable.m @@ -0,0 +1,37 @@ +function updateLogmlTable(pops) +% Updates global variables LOGML_TABLE, npops*1 array, logml values for +% each population given in "pops" +% After the updates, the values are based on the current values of the +% global variables CQ_COUNTS, SUM_CQ_COUNTS, SP_COUNTS, SUM_SP_COUNTS +% Lu Cheng, 25.05.2011 + +global CQ_COUNTS; global SUM_CQ_COUNTS; +global SP_COUNTS; global SUM_SP_COUNTS; +global CQ_PRIOR; global SP_PRIOR; + +global LOGML_TABLE; + +tmpN = length(pops); +tmpCqPrior = repmat(CQ_PRIOR,[1 1 tmpN]); +tmpSpPrior = repmat(SP_PRIOR,[1 1 tmpN]); + +term1 = 0-gammaln(1+SUM_CQ_COUNTS(:,pops)); +term2 = sum(gammaln(tmpCqPrior+CQ_COUNTS(:,:,pops))-gammaln(tmpCqPrior) , 1); +if length(pops) > 1 + term2 = squeeze(term2); +else + term2 = term2'; +end + +term3 = 0-gammaln(1+SUM_SP_COUNTS(:,pops)); +term4 = sum(gammaln(tmpSpPrior+SP_COUNTS(:,:,pops))-gammaln(tmpSpPrior) , 1); + +if length(pops) > 1 + term4 = squeeze(term4); +else + term4 = term4'; +end + +LOGML_TABLE(pops) = sum(term1+term2) - sum(term3+term4); + +%---------------------------------------------------------------------- \ No newline at end of file diff --git a/matlab/spatial/private/update_difference_tables.m b/matlab/spatial/private/update_difference_tables.m new file mode 100644 index 0000000..2094684 --- /dev/null +++ b/matlab/spatial/private/update_difference_tables.m @@ -0,0 +1,92 @@ +function update_difference_tables(ind, cqData, nCqLetter, ... + spData, nSpLetter, locCliques, locSeparators,logml) +% update ADDITION_DIFFERENCE and REMOVAL_DIFFERENCE +% Lu Cheng, 15.12.2012 + +global CQ_COUNTS; global SUM_CQ_COUNTS; +global SP_COUNTS; global SUM_SP_COUNTS; +global PARTITION; +global ADDITION_DIFFERENCE; +global REMOVAL_DIFFERENCE; + +global LOC_CQ_COUNTS; +global LOC_SP_COUNTS; + +rem_old = REMOVAL_DIFFERENCE; +add_old = ADDITION_DIFFERENCE; + +[diffCqCounts diffCqSumCounts] = computeDiffInCounts(ind, cqData, nCqLetter); +[diffSpCounts diffSpSumCounts] = computeDiffInCounts(ind, spData, nSpLetter); +diffLocCqCounts = computeDiffInCliqCounts(locCliques, ind); +diffLocSpCounts = computeDiffInCliqCounts(locSeparators, ind); + +i1 = PARTITION(ind); + +if isnan(rem_old(ind)) + % Update removal difference for the individual: + % note that we did NOT add the removed item to other clusters + CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1) - diffCqCounts; + SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1) - diffSpCounts; + + SUM_CQ_COUNTS(:,i1) = SUM_CQ_COUNTS(:,i1) - diffCqSumCounts; + SUM_SP_COUNTS(:,i1) = SUM_SP_COUNTS(:,i1) - diffSpSumCounts; + + LOC_CQ_COUNTS(:,i1) = LOC_CQ_COUNTS(:,i1) - diffLocCqCounts; + LOC_SP_COUNTS(:,i1) = LOC_SP_COUNTS(:,i1) - diffLocSpCounts; + +% PARTITION(ind) = -1; + updateLogmlTable(i1); + logml_new = computeTotalLogml(); + rem_old(ind) = logml_new-logml; + + CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1) + diffCqCounts; + SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1) + diffSpCounts; + + SUM_CQ_COUNTS(:,i1) = SUM_CQ_COUNTS(:,i1) + diffCqSumCounts; + SUM_SP_COUNTS(:,i1) = SUM_SP_COUNTS(:,i1) + diffSpSumCounts; + + LOC_CQ_COUNTS(:,i1) = LOC_CQ_COUNTS(:,i1) + diffLocCqCounts; + LOC_SP_COUNTS(:,i1) = LOC_SP_COUNTS(:,i1) + diffLocSpCounts; + +% PARTITION(ind) = i1; + updateLogmlTable(i1); +end + +new_pops = isnan(add_old(ind,:)); +new_pops(i1) = 0; % Own cluster needs never be calculated. +new_pops = find(new_pops); + +for i2 = new_pops(:)' + % Update addition differences for the individual: + % note that we did NOT remove the item + CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2) + diffCqCounts; + SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2) + diffSpCounts; + + SUM_CQ_COUNTS(:,i2) = SUM_CQ_COUNTS(:,i2) + diffCqSumCounts; + SUM_SP_COUNTS(:,i2) = SUM_SP_COUNTS(:,i2) + diffSpSumCounts; + + LOC_CQ_COUNTS(:,i2) = LOC_CQ_COUNTS(:,i2) + diffLocCqCounts; + LOC_SP_COUNTS(:,i2) = LOC_SP_COUNTS(:,i2) + diffLocSpCounts; + +% PARTITION(ind) = i2; + updateLogmlTable(i2); + logml_new = computeTotalLogml(); + add_old(ind,i2) = logml_new - logml; + + CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2) - diffCqCounts; + SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2) - diffSpCounts; + + SUM_CQ_COUNTS(:,i2) = SUM_CQ_COUNTS(:,i2) - diffCqSumCounts; + SUM_SP_COUNTS(:,i2) = SUM_SP_COUNTS(:,i2) - diffSpSumCounts; + + LOC_CQ_COUNTS(:,i2) = LOC_CQ_COUNTS(:,i2) - diffLocCqCounts; + LOC_SP_COUNTS(:,i2) = LOC_SP_COUNTS(:,i2) - diffLocSpCounts; + +% PARTITION(ind) = i1; + updateLogmlTable(i2); +end + +REMOVAL_DIFFERENCE = rem_old; +ADDITION_DIFFERENCE = add_old; + +%--------------------------------------------------------------------- \ No newline at end of file diff --git a/matlab/spatial/private/update_join_difference.m b/matlab/spatial/private/update_join_difference.m new file mode 100644 index 0000000..41e8449 --- /dev/null +++ b/matlab/spatial/private/update_join_difference.m @@ -0,0 +1,83 @@ +function update_join_difference(cqData, nCqCodes, spData, nSpCodes, locCliques, locSeparators, logml) +% update JOIN_DIFFERENCE +% Lu Cheng, 15.12.2012 + +global CQ_COUNTS; global SUM_CQ_COUNTS; +global SP_COUNTS; global SUM_SP_COUNTS; +global PARTITION; +global JOIN_DIFFERENCE; + +global LOC_CQ_COUNTS; +global LOC_SP_COUNTS; + +npops = size(CQ_COUNTS,3); + +for i1 = 1:npops-1 + indsToBeMoved = find(PARTITION==i1); + if isempty(indsToBeMoved) + % Cluster i1 is empty + JOIN_DIFFERENCE(i1,(i1+1):npops) = 0; + JOIN_DIFFERENCE((i1+1):npops,i1) = 0; + else + [diffCqCounts diffCqSumCounts] = computeDiffInCounts(indsToBeMoved, cqData, nCqCodes); + [diffSpCounts diffSpSumCounts] = computeDiffInCounts(indsToBeMoved, spData, nSpCodes); + diffLocCqCounts = computeDiffInCliqCounts(locCliques, indsToBeMoved); + diffLocSpCounts = computeDiffInCliqCounts(locSeparators, indsToBeMoved); + + unknown_pops = find(isnan(JOIN_DIFFERENCE(i1,(i1+1):end))); + unknown_pops = unknown_pops+i1; + + CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1) - diffCqCounts; + SUM_CQ_COUNTS(:,i1) = SUM_CQ_COUNTS(:,i1) - diffCqSumCounts; + + SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1) - diffSpCounts; + SUM_SP_COUNTS(:,i1) = SUM_SP_COUNTS(:,i1) - diffSpSumCounts; + + LOC_CQ_COUNTS(:,i1) = LOC_CQ_COUNTS(:,i1) - diffLocCqCounts; + LOC_SP_COUNTS(:,i1) = LOC_SP_COUNTS(:,i1) - diffLocSpCounts; + +% PARTITION(indsToBeMoved) = -1; + updateLogmlTable(i1); + + for i2 = unknown_pops + CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2) + diffCqCounts; + SUM_CQ_COUNTS(:,i2) = SUM_CQ_COUNTS(:,i2) + diffCqSumCounts; + + SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2) + diffSpCounts; + SUM_SP_COUNTS(:,i2) = SUM_SP_COUNTS(:,i2) + diffSpSumCounts; + + LOC_CQ_COUNTS(:,i2) = LOC_CQ_COUNTS(:,i2) + diffLocCqCounts; + LOC_SP_COUNTS(:,i2) = LOC_SP_COUNTS(:,i2) + diffLocSpCounts; + +% PARTITION(indsToBeMoved) = i2; + updateLogmlTable(i2); + logml_new = computeTotalLogml(); + JOIN_DIFFERENCE(i1,i2) = logml_new-logml; + JOIN_DIFFERENCE(i2,i1) = logml_new-logml; + + CQ_COUNTS(:,:,i2) = CQ_COUNTS(:,:,i2) - diffCqCounts; + SUM_CQ_COUNTS(:,i2) = SUM_CQ_COUNTS(:,i2) - diffCqSumCounts; + + SP_COUNTS(:,:,i2) = SP_COUNTS(:,:,i2) - diffSpCounts; + SUM_SP_COUNTS(:,i2) = SUM_SP_COUNTS(:,i2) - diffSpSumCounts; + + LOC_CQ_COUNTS(:,i2) = LOC_CQ_COUNTS(:,i2) - diffLocCqCounts; + LOC_SP_COUNTS(:,i2) = LOC_SP_COUNTS(:,i2) - diffLocSpCounts; + +% PARTITION(indsToBeMoved) = -1; + updateLogmlTable(i2); + end + + CQ_COUNTS(:,:,i1) = CQ_COUNTS(:,:,i1) + diffCqCounts; + SUM_CQ_COUNTS(:,i1) = SUM_CQ_COUNTS(:,i1) + diffCqSumCounts; + + SP_COUNTS(:,:,i1) = SP_COUNTS(:,:,i1) + diffSpCounts; + SUM_SP_COUNTS(:,i1) = SUM_SP_COUNTS(:,i1) + diffSpSumCounts; + + LOC_CQ_COUNTS(:,i1) = LOC_CQ_COUNTS(:,i1) + diffLocCqCounts; + LOC_SP_COUNTS(:,i1) = LOC_SP_COUNTS(:,i1) + diffLocSpCounts; + +% PARTITION(indsToBeMoved) = i1; + updateLogmlTable(i1); + end +end \ No newline at end of file diff --git a/matlab/spatial/spatialMix.m b/matlab/spatial/spatialMix.m new file mode 100644 index 0000000..fec57eb --- /dev/null +++ b/matlab/spatial/spatialMix.m @@ -0,0 +1,1512 @@ +function [logml, npops, partitionSummary]=spatialMix(c,npopsTaulu) +% Greedy search algorithm with unknown number of classes for spatial +% clustering. + +logml = 1; + +global PARTITION; global COUNTS; +global SUMCOUNTS; +global SEPCOUNTS; global CLIQCOUNTS; +global LOGDIFF; +clearGlobalVars; + +data = c.data; rowsFromInd = c.rowsFromInd; +noalle = c.noalle; adjprior = c.adjprior; priorTerm = c.priorTerm; +cliques = c.cliques; separators = c.separators; rows = c.rows; + +if isfield(c, 'dist') + dist = c.dist; Z = c.Z; +end + +clear c; + + +if nargin < 2; + npopstext = []; + ready = false; + teksti = 'Input upper bound to the number of populations (possibly multiple values): '; + while ready == false + npopstextExtra = inputdlg(teksti ,... + 'Input maximum number of populations',1,{'20'}); + drawnow + if isempty(npopstextExtra) % Painettu Cancel:ia + return + end + npopstextExtra = npopstextExtra{1}; + if length(npopstextExtra)>=255 + npopstextExtra = npopstextExtra(1:255); + npopstext = [npopstext ' ' npopstextExtra]; + teksti = 'The input field length limit (255 characters) was reached. Input more values: '; + else + npopstext = [npopstext ' ' npopstextExtra]; + ready = true; + end + end + clear ready; clear teksti; + if isempty(npopstext) | length(npopstext)==1 + return + else + npopsTaulu = str2num(npopstext); + ykkoset = find(npopsTaulu==1); + npopsTaulu(ykkoset) = []; % Mikäli ykkösiä annettu ylärajaksi, ne poistetaan. + if isempty(npopsTaulu) + return + end + clear ykkoset; + end +end + +nruns = length(npopsTaulu); + +maxnpops = max(npopsTaulu); +logmlBest = -1e50; +partitionSummary = -1e50*ones(30,2,maxnpops); % Tiedot 30 parhaasta partitiosta (npops ja logml) +partitionSummary(:,1,:) = zeros(30,1,maxnpops); +worstLogml = -1e50*ones(1, maxnpops); worstIndex = ones(1, maxnpops); + + +initData = data; +data = initData(:, 1:end-1); + +for run = 1:nruns + npops = npopsTaulu(run); + dispLine; + disp(['Run ' num2str(run) '/' num2str(nruns) ... + ', maximum number of populations ' num2str(npops) '.']); + disp(' '); + disp('Computing initial partition'); + + [initialPartition, counts, sumcounts] = initSpatialMultiMixture(initData, ... + npops, Z, rows, noalle, dist, adjprior, priorTerm,0); + + PARTITION = initialPartition; + [cliqcounts, sepcounts] = computeCounts(cliques, separators, npops); + + COUNTS = zeros(max(noalle), size(data,2),npops); SUMCOUNTS = zeros(npops,size(data,2)); + COUNTS(:,:,1:size(counts,3)) = counts; SUMCOUNTS(1:size(sumcounts,1),:) = sumcounts; + CLIQCOUNTS = cliqcounts; SEPCOUNTS = sepcounts; + + ninds = length(PARTITION); + %maxsize=max([max(noalle) npops]); + %initializeGammaln(ninds, rowsFromInd, maxsize); + logml = computeLogml(adjprior, priorTerm); + + LOGDIFF = repmat(-Inf,ninds,npops); + + nnotEmptyPops = length(unique(PARTITION)); + + if logml>worstLogml(nnotEmptyPops); + [partitionSummary(:,:,nnotEmptyPops), added] = addToSummary(logml, ... + partitionSummary(:,:,nnotEmptyPops), worstIndex(nnotEmptyPops)); + if (added==1) + [worstLogml(nnotEmptyPops), worstIndex(nnotEmptyPops)] = ... + min(partitionSummary(:,2,nnotEmptyPops)); + end + end + + clear initialPartition; clear counts; clear sumcounts; + clear cliqcounts; clear sepcounts; + + % PARHAAN MIXTURE-PARTITION ETSIMINEN + roundTypes = [1]; %Ykkösvaiheen sykli kahteen kertaan. + nRoundTypes = 7; + kokeiltu = zeros(nRoundTypes, 1); + ready = 0; vaihe = 1; + + + disp(['Mixture analysis started with initial ' num2str(nnotEmptyPops) ' populations.']); + + while ready ~= 1 + muutoksia = 0; + + disp(['Performing steps: ' num2str(roundTypes)]); + + for n = 1:length(roundTypes) + + round = roundTypes(n); + + if kokeiltu(round) == 1 %Askelta kokeiltu viime muutoksen jälkeen + + elseif round==0 | round==1 %Yksilön siirtäminen toiseen populaatioon. + + inds = randperm(ninds); + muutosNyt = 0; + for ind = inds + i1 = PARTITION(ind); + [muutokset, diffInCounts] = laskeMuutokset(ind, rows, ... + data, adjprior, priorTerm, logml, cliques, separators); + + if round==1, [maxMuutos, i2] = max(muutokset); + elseif round==0, [maxMuutos, i2] = arvoSeuraavaTila(muutokset, logml); + end + + if (i1~=i2 & maxMuutos>1e-5) + % Tapahtui muutos + muutoksia = 1; + if muutosNyt == 0 + disp('action 1'); + muutosNyt = 1; + kokeiltu = zeros(nRoundTypes,1); + end + updateGlobalVariables(ind, i2, diffInCounts,... + cliques, separators, adjprior, priorTerm); + logml = logml+maxMuutos; + + nnotEmptyPops = length(unique(PARTITION)); + + if logml>worstLogml(nnotEmptyPops); + [partitionSummary(:,:,nnotEmptyPops), added] = addToSummary(logml, ... + partitionSummary(:,:,nnotEmptyPops), worstIndex(nnotEmptyPops)); + if (added==1) + [worstLogml(nnotEmptyPops), worstIndex(nnotEmptyPops)] = ... + min(partitionSummary(:,2,nnotEmptyPops)); + end + end + end + end + if muutosNyt == 0 + kokeiltu(round) = 1; + end + + + elseif round==2 %Populaation yhdistäminen toiseen. + maxMuutos = 0; + + for pop = 1:npops + [muutokset, diffInCounts] = laskeMuutokset2(pop, rows, ... + data, adjprior, priorTerm, logml, cliques, separators); + [isoin, indeksi] = max(muutokset); + if isoin>maxMuutos + maxMuutos = isoin; + i1 = pop; + i2 = indeksi; + diffInCountsBest = diffInCounts; + end + end + + + + if maxMuutos > 1e-5 + disp('action 2'); + muutoksia = 1; + kokeiltu = zeros(nRoundTypes,1); + updateGlobalVariables2(i1,i2, diffInCountsBest, ... + cliques, separators, adjprior, priorTerm); + logml=logml + maxMuutos; + + nnotEmptyPops = length(unique(PARTITION)); + + if logml>worstLogml(nnotEmptyPops); + [partitionSummary(:,:,nnotEmptyPops), added] = addToSummary(logml, ... + partitionSummary(:,:,nnotEmptyPops), worstIndex(nnotEmptyPops)); + if (added==1) + [worstLogml(nnotEmptyPops), worstIndex(nnotEmptyPops)] = ... + min(partitionSummary(:,2,nnotEmptyPops)); + end + end + else + kokeiltu(round) = 1; + end + + + elseif round==3 | round==4 %Populaation jakaminen osiin. + maxMuutos = 0; + for pop = 1:npops + inds2 = find(PARTITION==pop); + ninds2 = length(inds2); + if ninds2>5 + dist2 = laskeOsaDist(inds2, dist, ninds); + Z2 = linkage(dist2'); + if round==3 + npops2 = min(20, floor(ninds2 / 5)); %Moneenko osaan jaetaan + elseif round==4 + npops2 = 2; + end + T2 = cluster_own(Z2, npops2); + + muutokset = laskeMuutokset3(T2, inds2, rows, data, ... + adjprior, priorTerm, pop, logml, cliques, separators); + [isoin, indeksi] = max(muutokset(1:end)); + if isoin>maxMuutos + maxMuutos = isoin; + muuttuvaPop2 = rem(indeksi,npops2); + if muuttuvaPop2==0, muuttuvaPop2 = npops2; end + muuttuvat = inds2(find(T2==muuttuvaPop2)); + i2 = ceil(indeksi/npops2); + end + end + end + if maxMuutos>1e-5 + muutoksia = 1; + if round==3 + disp('action 3'); + else + disp('action 4'); + end + kokeiltu = zeros(nRoundTypes,1); + rivit = []; + for i = 1:length(muuttuvat) + ind = muuttuvat(i); + lisa = rows(ind,1):rows(ind,2); + rivit = [rivit lisa]; + end + diffInCounts = computeDiffInCounts(rivit, size(COUNTS,1), ... + size(COUNTS,2), data); + i1 = PARTITION(muuttuvat(1)); + updateGlobalVariables3(muuttuvat, diffInCounts, ... + adjprior, priorTerm, i2, cliques, separators); + logml = logml + maxMuutos; + + nnotEmptyPops = length(unique(PARTITION)); + + if logml>worstLogml(nnotEmptyPops); + [partitionSummary(:,:,nnotEmptyPops), added] = addToSummary(logml, ... + partitionSummary(:,:,nnotEmptyPops), worstIndex(nnotEmptyPops)); + if (added==1) + [worstLogml(nnotEmptyPops), worstIndex(nnotEmptyPops)] = ... + min(partitionSummary(:,2,nnotEmptyPops)); end + end + else + kokeiltu(round)=1; + end + + elseif round == 5 | round == 6 + j=0; + muutettu = 0; + %poplogml = POP_LOGML; + partition = PARTITION; + counts = COUNTS; + sumcounts = SUMCOUNTS; + cliqcounts = CLIQCOUNTS; + sepcounts = SEPCOUNTS; + logdiff = LOGDIFF; + + pops = randperm(npops); + while (j < npops & muutettu == 0) + j = j+1; + pop = pops(j); + totalMuutos = 0; + inds = find(PARTITION==pop); + if round == 5 + aputaulu = [inds rand(length(inds),1)]; + aputaulu = sortrows(aputaulu,2); + inds = aputaulu(:,1)'; + elseif round == 6 + inds = returnInOrder(inds, pop, ... + rows, data, adjprior, priorTerm); + end + + i=0; + + while (length(inds) > 0 & i < length(inds)) + i = i+1; + ind =inds(i); + + [muutokset, diffInCounts] = laskeMuutokset(ind, rows, ... + data, adjprior, priorTerm, logml, cliques, separators); + muutokset(pop) = -1e50; % Varmasti ei suurin!!! + [maxMuutos, i2] = max(muutokset); + + updateGlobalVariables(ind, i2, diffInCounts, ... + cliques, separators, adjprior, priorTerm); + + totalMuutos = totalMuutos+maxMuutos; + logml = logml+maxMuutos; + if round == 6 + % Lopetetaan heti kun muutos on positiivinen. + if totalMuutos > 1e-5 + i=length(inds); + end + end + end + + if totalMuutos>1e-5 + if round == 5 + disp('action 5'); + elseif round == 6 + disp('action 6'); + end + kokeiltu = zeros(nRoundTypes,1); + muutettu=1; + muutoksia = 1; % Ulompi kirjanpito. + nnotEmptyPops = length(unique(PARTITION)); + + if logml>worstLogml(nnotEmptyPops); + [partitionSummary(:,:,nnotEmptyPops), added] = addToSummary(logml, ... + partitionSummary(:,:,nnotEmptyPops), worstIndex(nnotEmptyPops)); + if (added==1) + [worstLogml(nnotEmptyPops), worstIndex(nnotEmptyPops)] = ... + min(partitionSummary(:,2,nnotEmptyPops)); + end + end + else + % Missään vaiheessa tila ei parantunut. + % Perutaan kaikki muutokset. + PARTITION = partition; + SUMCOUNTS = sumcounts; + %POP_LOGML = poplogml; + COUNTS = counts; + logml = logml - totalMuutos; + CLIQCOUNTS = cliqcounts; + SEPCOUNTS = sepcounts; + LOGDIFF = logdiff; + kokeiltu(round)=1; + end + end + clear partition; clear sumcounts; clear counts; + clear cliqcounts; clear sepcounts; + + elseif round == 7 + emptyPop = findEmptyPop(npops); + j = 0; + pops = randperm(npops); + muutoksiaNyt = 0; + if emptyPop == -1 + j = npops; + end + while (j < npops) + j = j +1; + pop = pops(j); + inds2 = find(PARTITION==pop); + ninds2 = length(inds2); + if ninds2 > 5 + partition = PARTITION; + sumcounts = SUMCOUNTS; + counts = COUNTS; + cliqcounts = CLIQCOUNTS; + sepcounts = SEPCOUNTS; + oldLogml = logml; + logdiff = LOGDIFF; + + dist2 = laskeOsaDist(inds2, dist, ninds); + Z2 = linkage(dist2'); + T2 = cluster_own(Z2, 2); + muuttuvat = inds2(find(T2==1)); + + rivit = []; + for i = 1:length(muuttuvat) + ind = muuttuvat(i); + lisa = rows(ind,1):rows(ind,2); + rivit = [rivit lisa]; + end + diffInCounts = computeDiffInCounts(rivit, size(COUNTS,1), ... + size(COUNTS,2), data); + updateGlobalVariables3(muuttuvat, diffInCounts, ... + adjprior, priorTerm, emptyPop, cliques, separators); + + logml = computeLogml(adjprior, priorTerm); + + muutettu = 1; + while (muutettu == 1) + muutettu = 0; + % Siirretään yksilöitä populaatioiden välillä + muutokset = laskeMuutokset5(inds2, rows, data, ... + adjprior, priorTerm, logml, cliques, separators, pop, emptyPop); + + [maxMuutos, indeksi] = max(muutokset); + muuttuva = inds2(indeksi); + + if (PARTITION(muuttuva) == pop) + i2 = emptyPop; + else + i2 = pop; + end + + if maxMuutos > 1e-5 + rivit = rows(muuttuva,1):rows(muuttuva,2); + diffInCounts = computeDiffInCounts(rivit, size(COUNTS,1), ... + size(COUNTS,2), data); + updateGlobalVariables3(muuttuva, diffInCounts, ... + adjprior, priorTerm, i2, cliques, separators); + muutettu = 1; + logml = logml + maxMuutos; + end + + end + + if logml > oldLogml + 1e-5 + muutoksia = 1; + nnotEmptyPops = length(unique(PARTITION)); + + if logml>worstLogml(nnotEmptyPops); + [partitionSummary(:,:,nnotEmptyPops), added] = addToSummary(logml, ... + partitionSummary(:,:,nnotEmptyPops), worstIndex(nnotEmptyPops)); + if (added==1) + [worstLogml(nnotEmptyPops), worstIndex(nnotEmptyPops)] = ... + min(partitionSummary(:,2,nnotEmptyPops)); end + end + if muutoksiaNyt == 0 + disp('action 7'); + muutoksiaNyt = 1; + end + kokeiltu = zeros(nRoundTypes,1); + j = npops; % Lopetetaan, koska muutos syntyi + else + %palutetaan vanhat arvot + PARTITION = partition; + SUMCOUNTS = sumcounts; + COUNTS = counts; + CLIQCOUNTS = cliqcounts; + SEPCOUNTS = sepcounts; + LOGDIFF = logdiff; + logml = oldLogml; + end + + end + + end + + if muutoksiaNyt == 0 + kokeiltu(round)=1; + end + + end + end + + + if muutoksia == 0 + if vaihe==1 + vaihe = 2; + elseif vaihe==2 + vaihe = 3; + elseif vaihe==3 + vaihe = 4; + elseif vaihe==4; + vaihe = 5; + elseif vaihe==5 + ready = 1; + end + else + muutoksia = 0; + end + + if ready==0 + if vaihe==1 + roundTypes=[1]; + elseif vaihe==2 + roundTypes=[2 1]; + elseif vaihe==3 + roundTypes=[5 5 7]; + elseif vaihe==4 + roundTypes=[4 3 1]; + elseif vaihe==5 + roundTypes=[6 2 7 3 4 1]; + end + end + end + + % TALLENNETAAN + + npops = poistaTyhjatPopulaatiot(npops); + %POP_LOGML = computePopulationLogml(1:npops, adjprior, priorTerm); + disp(['Found partition with ' num2str(npops) ' populations.']); + disp(['Log(ml) = ' num2str(logml)]); + disp(' '); + + if logml>logmlBest + % Päivitetään parasta löydettyä partitiota. + logmlBest = logml; + npopsBest = npops; + partitionBest = PARTITION; + countsBest = COUNTS; + sumCountsBest = SUMCOUNTS; + %pop_logmlBest = POP_LOGML; + cliqCountsBest = CLIQCOUNTS; + sepCountsBest = SEPCOUNTS; + logdiffbest = LOGDIFF; + end +end + +logml = logmlBest; +npops = npopsBest; +PARTITION = partitionBest; +COUNTS = countsBest; +SUMCOUNTS = sumCountsBest; +%POP_LOGML = pop_logmlBest; +CLIQCOUNTS = cliqCountsBest; +SEPCOUNTS = sepCountsBest; +LOGDIFF = logdiffbest; + +%------------------------------------------------------------------------------------- + +function clearGlobalVars + +global COUNTS; COUNTS = []; +global SUMCOUNTS; SUMCOUNTS = []; +global PARTITION; PARTITION = []; +%global POP_LOGML; POP_LOGML = []; +global SEPCOUNTS; SEPCOUNTS = []; +global CLIQCOUNTS; CLIQCOUNTS = []; +global LOGDIFF; LOGDIFF = []; + +%-------------------------------------------------------------------------- + + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex) +% Tiedetään, että annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ssä ei vielä ole +% annettua logml arvoa, niin lisätään worstIndex:in kohtaan uusi logml ja +% nykyistä partitiota vastaava nclusters:in arvo. Muutoin ei tehdä mitään. + +apu = find(abs(partitionSummary(:,2)-logml)<1e-5); +if isempty(apu) + % Nyt löydetty partitio ei ole vielä kirjattuna summaryyn. + global PARTITION; + npops = length(unique(PARTITION)); + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end + + +%-------------------------------------------------------------------------- + + +function [suurin, i2] = arvoSeuraavaTila(muutokset, logml) +% Suorittaa yksilön seuraavan tilan arvonnan + +y = logml + muutokset; % siirron jälkeiset logml:t +y = y - max(y); +y = exp(y); +summa = sum(y); +y = y/summa; +y = cumsum(y); + +i2 = rand_disc(y); % uusi kori +suurin = muutokset(i2); + + +%-------------------------------------------------------------------------------------- + + +function svar=rand_disc(CDF) +%returns an index of a value from a discrete distribution using inversion method +slump=rand; +har=find(CDF>slump); +svar=har(1); + + +%------------------------------------------------------------------------------------- + + +function updateGlobalVariables(ind, i2, diffInCounts, ... + cliques, separators, adjprior, priorTerm) +% Suorittaa globaalien muuttujien muutokset, kun yksilö ind +% siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global CLIQCOUNTS; +global SEPCOUNTS; +global LOGDIFF; + +i1 = PARTITION(ind); +PARTITION(ind)=i2; + +diffInCliqCounts = computeDiffInCliqCounts(cliques, ind); +diffInSepCounts = computeDiffInCliqCounts(separators, ind); + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; +CLIQCOUNTS(:,i2) = CLIQCOUNTS(:,i2) + diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; +SEPCOUNTS(:,i2) = SEPCOUNTS(:,i2) + diffInSepCounts; + +LOGDIFF(:,[i1 i2]) = -Inf; +inx = [find(PARTITION==i1); find(PARTITION==i2)]; +LOGDIFF(inx,:) = -Inf; + +%POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + + +%--------------------------------------------------------------------------------- + + +function updateGlobalVariables2(i1, i2, diffInCounts, ... + cliques, separators, adjprior, priorTerm); +% Suorittaa globaalien muuttujien muutokset, kun kaikki +% korissa i1 olevat yksilöt siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +%global POP_LOGML; +global CLIQCOUNTS; +global SEPCOUNTS; +global LOGDIFF; + +inds = find(PARTITION==i1); +PARTITION(inds) = i2; + +diffInCliqCounts = CLIQCOUNTS(:,i1); +diffInSepCounts = SEPCOUNTS(:,i1); + + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +CLIQCOUNTS(:,i1) = 0; +CLIQCOUNTS(:,i2) = CLIQCOUNTS(:,i2) + diffInCliqCounts; +SEPCOUNTS(:,i1) = 0; +SEPCOUNTS(:,i2) = SEPCOUNTS(:,i2) + diffInSepCounts; + +LOGDIFF(:,[i1 i2]) = -Inf; +inx = [find(PARTITION==i1); find(PARTITION==i2)]; +LOGDIFF(inx,:) = -Inf; + + +%------------------------------------------------------------------------------------ + + +function updateGlobalVariables3(muuttuvat, diffInCounts, ... + adjprior, priorTerm, i2, cliques, separators); +% Suorittaa globaalien muuttujien päivitykset, kun yksilöt 'muuttuvat' +% siirretään koriin i2. Ennen siirtoa yksilöiden on kuuluttava samaan +% koriin. + +global PARTITION; +global COUNTS; global CLIQCOUNTS; +global SUMCOUNTS; global SEPCOUNTS; +global LOGDIFF; +%global POP_LOGML; + +i1 = PARTITION(muuttuvat(1)); +PARTITION(muuttuvat) = i2; + +diffInCliqCounts = computeDiffInCliqCounts(cliques, muuttuvat); +diffInSepCounts = computeDiffInCliqCounts(separators, muuttuvat); + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; +CLIQCOUNTS(:,i2) = CLIQCOUNTS(:,i2) + diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; +SEPCOUNTS(:,i2) = SEPCOUNTS(:,i2) + diffInSepCounts; + +LOGDIFF(:,[i1 i2]) = -Inf; +inx = [find(PARTITION==i1); find(PARTITION==i2)]; +LOGDIFF(inx,:) = -Inf; + +%POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + + +%---------------------------------------------------------------------- + + +function inds = returnInOrder(inds, pop, globalRows, data, ... + adjprior, priorTerm) +% Palauttaa yksilöt järjestyksessä siten, että ensimmäisenä on +% se, jonka poistaminen populaatiosta pop nostaisi logml:n +% arvoa eniten. + +global COUNTS; global SUMCOUNTS; +ninds = length(inds); +apuTaulu = [inds, zeros(ninds,1)]; + +for i=1:ninds + ind =inds(i); + rows = globalRows(i,1):globalRows(i,2); + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,pop) = COUNTS(:,:,pop)-diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)-diffInSumCounts; + apuTaulu(i, 2) = computePopulationLogml(pop, adjprior, priorTerm); + COUNTS(:,:,pop) = COUNTS(:,:,pop)+diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)+diffInSumCounts; +end +apuTaulu = sortrows(apuTaulu,2); +inds = apuTaulu(ninds:-1:1,1); + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = laskeMuutokset(ind, globalRows, ... + data, adjprior, priorTerm, logml, cliques, separators) +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mikä olisi +% muutos logml:ssä, mikäli yksilöt inds siirretään koriin i. +% diffInCounts on poistettava COUNTS:in siivusta i1 ja lisättävä +% COUNTS:in siivuun i2, mikäli muutos toteutetaan. + +global COUNTS; global SUMCOUNTS; +global PARTITION; %global POP_LOGML; +global CLIQCOUNTS; global SEPCOUNTS; +global LOGDIFF; + +npops = size(COUNTS,3); +muutokset = LOGDIFF(ind,:); + +counts = COUNTS; +sumcounts = SUMCOUNTS; + +[emptyPop, pops] = findEmptyPop(npops); + +i1 = PARTITION(ind); +muutokset(i1) = 0; + +i2 = [pops(find(pops~=i1))]; +if emptyPop > 0 + i2 =[i2 emptyPop]; +end + +i2 = sort(i2); +laskematta = find(muutokset==-Inf); +i2 = intersect(i2,laskematta); + +rows = globalRows(ind,1):globalRows(ind,2); +diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); + +diffInCliqCounts = computeDiffInCliqCounts(cliques, ind); +diffInSepCounts = computeDiffInCliqCounts(separators, ind); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; + +for i=i2 + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) + diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) + diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) + diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) + diffInSumCounts; + + muutokset(i) = computeLogml(adjprior, priorTerm) - logml; + + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) - diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) - diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) - diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) - diffInSumCounts; +end + +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) + diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) + diffInSepCounts; + +% Asetetaan muillekin tyhjille populaatioille sama muutos, kuin +% emptyPop:lle + +if emptyPop > 0 + empties = mysetdiff((1:npops), [i2 i1]); + muutokset(empties) = muutokset(emptyPop); +end + +COUNTS = counts; +SUMCOUNTS = sumcounts; +LOGDIFF(ind,:) = muutokset; + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = laskeMuutokset2(i1, globalRows, ... + data, adjprior, priorTerm, logml, cliques, separators); +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mikä olisi +% muutos logml:ssä, mikäli korin i1 kaikki yksilöt siirretään +% koriin i. +% Laskee muutokset vain yhdelle tyhjälle populaatiolle, muille tulee +% muutokseksi 0. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +global CLIQCOUNTS; global SEPCOUNTS; + +npops = size(COUNTS,3); +muutokset = zeros(npops,1); + +[emptyPop, pops] = findEmptyPop(npops); + +i2 = [pops(find(pops~=i1))]; +if emptyPop > 0 + i2 =[i2 emptyPop]; +end + +inds = find(PARTITION == i1); +ninds = length(inds); + +rows = []; +for i = 1:ninds + rows = [rows globalRows(inds(i),1):globalRows(inds(i),2)]; +end +diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); +diffInCliqCounts = computeDiffInCliqCounts(cliques, inds); +diffInSepCounts = computeDiffInCliqCounts(separators, inds); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +CLIQCOUNTS(:,i1) = 0; +SEPCOUNTS(:,i1) = 0; + +for i=i2 + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) + diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) + diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) + diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) + diffInSumCounts; + + muutokset(i) = computeLogml(adjprior, priorTerm) - logml; + + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) - diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) - diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) - diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) - diffInSumCounts; +end + +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; +CLIQCOUNTS(:,i1) = diffInCliqCounts; +SEPCOUNTS(:,i1) = diffInSepCounts; + + + +%------------------------------------------------------------------------------------ + +function muutokset = laskeMuutokset3(T2, inds2, globalRows, ... + data, adjprior, priorTerm, i1, logml, cliques, separators) +% Palauttaa length(unique(T2))*npops taulun, jossa (i,j):s alkio +% kertoo, mikä olisi muutos logml:ssä, jos populaation i1 osapopulaatio +% inds2(find(T2==i)) siirretään koriin j. +% Laskee vain yhden tyhjän populaation, muita kohden muutokseksi jää 0. + + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +global CLIQCOUNTS; global SEPCOUNTS; + +npops = size(COUNTS,3); +npops2 = length(unique(T2)); +muutokset = zeros(npops2, npops); + +for pop2 = 1:npops2 + inds = inds2(find(T2==pop2)); + ninds = length(inds); + if ninds>0 + rows = []; + for i = 1:ninds + ind = inds(i); + rows = [rows; (globalRows(ind,1):globalRows(ind,2))']; + end + diffInCounts = computeDiffInCounts(rows', size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + diffInCliqCounts = computeDiffInCliqCounts(cliques, inds); + diffInSepCounts = computeDiffInCliqCounts(separators, inds); + + COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; + CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; + SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; + + [emptyPop, pops] = findEmptyPop(npops); + i2 = [pops(find(pops~=i1))]; + if emptyPop > 0 + i2 =[i2 emptyPop]; + end + + for i = i2 + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) + diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) + diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) + diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) + diffInSumCounts; + + muutokset(pop2,i) = computeLogml(adjprior, priorTerm) - logml; + + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) - diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) - diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) - diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) - diffInSumCounts; + end + + COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) + diffInCliqCounts; + SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) + diffInSepCounts; + end +end + +%-------------------------------------------------------------------------- +function muutokset = laskeMuutokset5(inds, globalRows, data, ... + adjprior, priorTerm, logml, cliques, separators, i1, i2) + +% Palauttaa length(inds)*1 taulun, jossa i:s alkio kertoo, mikä olisi +% muutos logml:ssä, mikäli yksilö i vaihtaisi koria i1:n ja i2:n välillä. + +global COUNTS; global SUMCOUNTS; +global PARTITION; +global CLIQCOUNTS; global SEPCOUNTS; + +ninds = length(inds); +muutokset = zeros(ninds,1); + +for i = 1:ninds + ind = inds(i); + + rows = globalRows(ind,1):globalRows(ind,2); + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + if PARTITION(ind)==i1 + pop1 = i1; %mistä + pop2 = i2; %mihin + else + pop1 = i2; + pop2 = i1; + end + + diffInCliqCounts = computeDiffInCliqCounts(cliques, ind); + diffInSepCounts = computeDiffInCliqCounts(separators, ind); + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)-diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)-diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)+diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)+diffInSumCounts; + + CLIQCOUNTS(:,pop1) = CLIQCOUNTS(:,pop1) - diffInCliqCounts; + CLIQCOUNTS(:,pop2) = CLIQCOUNTS(:,pop2) + diffInCliqCounts; + SEPCOUNTS(:,pop1) = SEPCOUNTS(:,pop1) - diffInSepCounts; + SEPCOUNTS(:,pop2) = SEPCOUNTS(:,pop2) + diffInSepCounts; + + muutokset(i) = computeLogml(adjprior, priorTerm) - logml; + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)+diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)+diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)-diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)-diffInSumCounts; + + CLIQCOUNTS(:,pop1) = CLIQCOUNTS(:,pop1) + diffInCliqCounts; + CLIQCOUNTS(:,pop2) = CLIQCOUNTS(:,pop2) - diffInCliqCounts; + SEPCOUNTS(:,pop1) = SEPCOUNTS(:,pop1) + diffInSepCounts; + SEPCOUNTS(:,pop2) = SEPCOUNTS(:,pop2) - diffInSepCounts; +end + +%-------------------------------------------------------------------------- + +function diffInCounts = computeDiffInCounts(rows, max_noalle, nloci, data) +% Muodostaa max_noalle*nloci taulukon, jossa on niiden alleelien +% lukumäärät (vastaavasti kuin COUNTS:issa), jotka ovat data:n +% riveillä rows. + +diffInCounts = zeros(max_noalle, nloci); +for i=rows + row = data(i,:); + notEmpty = find(row>=0); + + if length(notEmpty)>0 + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) = ... + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) + 1; + end +end + + + +%------------------------------------------------------------------------------------ + + +function popLogml = computePopulationLogml(pops, adjprior, priorTerm) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + +global COUNTS; +global SUMCOUNTS; +x = size(COUNTS,1); +y = size(COUNTS,2); +z = length(pops); + +popLogml = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior,[1 1 length(pops)]) + COUNTS(:,:,pops)) ... + ,[x y z]),1),2)) - sum(gammaln(1+SUMCOUNTS(pops,:)),2) - priorTerm; + +%------------------------------------------------------------------------------------ + +function npops = poistaTyhjatPopulaatiot(npops) +% Poistaa tyhjentyneet populaatiot COUNTS:ista ja +% SUMCOUNTS:ista. Päivittää npops:in ja PARTITION:in. + +global COUNTS; +global SUMCOUNTS; +global PARTITION; +global CLIQCOUNTS; +global SEPCOUNTS; +global LOGDIFF; + +notEmpty = find(any(SUMCOUNTS,2)); +COUNTS = COUNTS(:,:,notEmpty); +SUMCOUNTS = SUMCOUNTS(notEmpty,:); +CLIQCOUNTS = CLIQCOUNTS(:,notEmpty); +SEPCOUNTS = SEPCOUNTS(:,notEmpty); +LOGDIFF = LOGDIFF(:,notEmpty); + +for n=1:length(notEmpty) + apu = find(PARTITION==notEmpty(n)); + PARTITION(apu)=n; +end +npops = length(notEmpty); + + +%---------------------------------------------------------------------------------- +%Seuraavat kolme funktiota liittyvat alkupartition muodostamiseen. + +function initial_partition=admixture_initialization(data_matrix,nclusters,Z) +size_data=size(data_matrix); +nloci=size_data(2)-1; +n=max(data_matrix(:,end)); +T=cluster_own(Z,nclusters); +initial_partition=zeros(size_data(1),1); +for i=1:n + kori=T(i); + here=find(data_matrix(:,end)==i); + for j=1:length(here) + initial_partition(here(j),1)=kori; + end +end + +function T = cluster_own(Z,nclust) +true=logical(1); +false=logical(0); +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); + % maximum number of clusters based on inconsistency + if m <= maxclust + T = (1:m)'; + elseif maxclust==1 + T = ones(m,1); + else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end + end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + + +function Z = linkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 | m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end + + + +%----------------------------------------------------------------------------------- +% Laskee arvot cliqcounts:lle ja sepcounts:lle + +function [cliqcounts, sepcounts] = computeCounts(cliques, separators, npops) + +global PARTITION; +ncliq = size(cliques,1); +nsep = size(separators,1); + +cliqPartition = zeros(ncliq, size(cliques,2)); +sepPartition = zeros(nsep, size(separators, 2)); + +apuCliq = find(cliques > 0); +apuSep = find(separators > 0); + +cliqPartition(apuCliq) = PARTITION(cliques(apuCliq)); +sepPartition(apuSep) = PARTITION(separators(apuSep)); + + +cliqcounts = zeros(ncliq, npops); +for i = 1:npops + cliqcounts(:,i) = sum(cliqPartition == i, 2); +end + + +sepcounts = zeros(nsep, npops); +for i = 1:npops + sepcounts(:,i) = sum(sepPartition == i, 2); +end + +%------------------------------------------------------------------------- + +function diffInCliqCounts = computeDiffInCliqCounts(cliques, inds) +% Laskee muutoksen CLIQCOUNTS:ssa (tai SEPCOUNTS:ssa, jos syötteenä +% separators) kun yksilöt inds siirretään. +% diffInCliqcounts on ncliq*1 taulu, joka on CLIQCOUNTS:n sarakkeesta josta +% yksilöt inds siirretään ja lisättävä sarakkeeseen, johon yksilöt +% siirretään. + +ncliq = size(cliques,1); +diffInCliqCounts = zeros(ncliq,1); +ninds = length(inds); +for i = 1:ninds + ind = inds(i); + rivit = sum((cliques == ind),2); + diffInCliqCounts = diffInCliqCounts + rivit; +end + +%----------------------------------------------------------------------- + +function [logml, spatialPrior] = computeLogml(adjprior,priorTerm) + +%global GAMMA_LN; +global CLIQCOUNTS; +global SEPCOUNTS; +global PARTITION; + +notEmpty = any(CLIQCOUNTS); +npops = length(find(notEmpty == 1)); +sumcliq=sum(CLIQCOUNTS, 2); +sumsep=sum(SEPCOUNTS, 2); +ncliq = size(CLIQCOUNTS, 1); +nsep = size(SEPCOUNTS, 1); + +cliqsizes = sum(CLIQCOUNTS, 2)'; +sepsizes = sum(SEPCOUNTS, 2)'; +cliqsizes = min([cliqsizes; npops*ones(1,ncliq)])'; +sepsizes = min([sepsizes; npops*ones(1,nsep)])'; + +klikkitn = sum(sum(gammaln(CLIQCOUNTS(:,notEmpty) + repmat(1./cliqsizes, [1 npops])))) ... + - sum(npops*(gammaln(1./cliqsizes))) ... + - sum(gammaln(sumcliq + 1)); + +septn = sum(sum(gammaln(SEPCOUNTS(:,notEmpty) + repmat(1./sepsizes, [1 npops])))) ... + - sum(npops*(gammaln(1./sepsizes))) ... + - sum(gammaln(sumsep + 1)); + + +%klikkitn = sum(sum(gammaln(CLIQCOUNTS + 1/npops))) ... +% - ncliq*npops*(gammaln(1/npops)) ... +% - sum(gammaln(sumcliq + 1)); +%septn = sum(sum(gammaln(SEPCOUNTS + 1/npops))) ... +% - nsep*npops*(gammaln(1/npops)) ... +% - sum(gammaln(sumsep + 1)); + +spatialPrior = (klikkitn - septn); + +%if spatialPrior > 0 +% keyboard +%end + + +global COUNTS; +global SUMCOUNTS; +x = size(COUNTS,1); +y = size(COUNTS,2); +z = size(COUNTS,3); + +popLogml = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior,[1 1 z]) + COUNTS) ... + ,[x y z]),1),2)) - sum(gammaln(1+SUMCOUNTS),2) - priorTerm; + +logml = sum(popLogml) + spatialPrior; + +%-------------------------------------------------------------------------- + + +function initializeGammaln(ninds, rowsFromInd, maxSize) +%Alustaa GAMMALN muuttujan s.e. GAMMALN(i,j)=gammaln((i-1) + 1/j) +global GAMMA_LN; +GAMMA_LN = zeros((1+ninds)*rowsFromInd, maxSize); +for i=1:(ninds+1)*rowsFromInd + for j=1:maxSize + GAMMA_LN(i,j)=gammaln((i-1) + 1/j); + end +end + + +%---------------------------------------------------------------------------- + + +function dist2 = laskeOsaDist(inds2, dist, ninds) +% Muodostaa dist vektorista osavektorin, joka sisältää yksilöiden inds2 +% väliset etäisyydet. ninds=kaikkien yksilöiden lukumäärä. + +ninds2 = length(inds2); +apu = zeros(nchoosek(ninds2,2),2); +rivi = 1; +for i=1:ninds2-1 + for j=i+1:ninds2 + apu(rivi, 1) = inds2(i); + apu(rivi, 2) = inds2(j); + rivi = rivi+1; + end +end +apu = (apu(:,1)-1).*ninds - apu(:,1) ./ 2 .* (apu(:,1)-1) + (apu(:,2)-apu(:,1)); +dist2 = dist(apu); + +%-------------------------------------------------------------------- + + +function newline = takeLine(description,width) +%Returns one line from the description: line ends to the first +%space after width:th mark. +newLine = description(1:width); +n = width+1; +while ~isspace(description(n)) & n= 1 + suurinYks = suurinYks+1; + end + if suurinYks<10 + mjono(7) = num2str(suurinYks); + mjono(6) = 'e'; + mjono(5) = palautaYks(abs(logml),suurinYks-1); + mjono(4) = '.'; + mjono(3) = palautaYks(abs(logml),suurinYks); + if logml<0 + mjono(2) = '-'; + end + elseif suurinYks>=10 + mjono(6:7) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(logml),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(logml),suurinYks); + if logml<0 + mjono(1) = '-'; + end + end +end + +function digit = palautaYks(num,yks) +% palauttaa luvun num 10^yks termin kertoimen +% string:inä +% yks täytyy olla kokonaisluku, joka on +% vähintään -1:n suuruinen. Pienemmillä +% luvuilla tapahtuu jokin pyöristysvirhe. + +if yks>=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); + + +function mjono = kldiv2str(div) +mjono = ' '; +if abs(div)<100 + %Ei tarvita e-muotoa + mjono(6) = num2str(rem(floor(div*1000),10)); + mjono(5) = num2str(rem(floor(div*100),10)); + mjono(4) = num2str(rem(floor(div*10),10)); + mjono(3) = '.'; + mjono(2) = num2str(rem(floor(div),10)); + arvo = rem(floor(div/10),10); + if arvo>0 + mjono(1) = num2str(arvo); + end + +else + suurinYks = floor(log10(div)); + mjono(6) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(div),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(div),suurinYks); +end + +%-------------------------------------------------------------------------- + +function C = mysetdiff(A,B) +% MYSETDIFF Set difference of two sets of positive integers (much faster than built-in setdiff) +% C = mysetdiff(A,B) +% C = A \ B = { things in A that are not in B } +% +% Original by Kevin Murphy, modified by Leon Peshkin + +if isempty(A) + C = []; + return; +elseif isempty(B) + C = A; + return; +else % both non-empty + bits = zeros(1, max(max(A), max(B))); + bits(A) = 1; + bits(B) = 0; + C = A(logical(bits(A))); +end + + +%-------------------------------------------------------------------------- + +function logml = checkLogml(priorTerm, adjprior, cliques, separators) +% tarkistaa logml:n + +global CLIQCOUNTS; +global SEPCOUNTS; +global PARTITION; + +npops = length(unique(PARTITION)); +[cliqcounts, sepcounts] = computeCounts(cliques, separators, npops); + +CLIQCOUNTS = cliqcounts; +SEPCOUNTS = sepcounts; + + +[logml, spatialPrior] = computeLogml(adjprior, priorTerm); + +disp(['logml: ' logml2String(logml) ', spatial prior: ' logml2String(spatialPrior)]); + +%-------------------------------------------------------------------------- + +function [emptyPop, pops] = findEmptyPop(npops) +% Palauttaa ensimmäisen tyhjän populaation indeksin. Jos tyhjiä +% populaatioita ei ole, palauttaa -1:n. + +global PARTITION; + +pops = unique(PARTITION)'; +if (length(pops) ==npops) + emptyPop = -1; +else + popDiff = diff([0 pops npops+1]); + emptyPop = min(find(popDiff > 1)); +end + + +%-------------------------------------------------------------------------- + +function [sumcounts, counts, logml] = ... + initialCounts(partition, data, npops, rowsFromInd, noalle) + +nloci=size(data,2); +ninds = size(data,1)/rowsFromInd; + +counts = zeros(max(noalle),nloci,npops); +sumcounts = zeros(npops,nloci); +for i=1:npops + for j=1:nloci + havainnotLokuksessa = find(partition==i & data(:,j)>=0); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(havainnotLokuksessa,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + +%--------------------------------------------------------------- + + +function dispLine; +disp('---------------------------------------------------'); diff --git a/matlab/spatial/spatialMix_fixK.m b/matlab/spatial/spatialMix_fixK.m new file mode 100644 index 0000000..2fb1276 --- /dev/null +++ b/matlab/spatial/spatialMix_fixK.m @@ -0,0 +1,1621 @@ +function [logml, npops, partitionSummary]=spatialMix(c,npops,nruns) +% Greedy search algorithm with fixed number of classes for spatial +% clustering. + +logml = 1; npops=1; partitionSummary=1; + +global PARTITION; global COUNTS; +global SUMCOUNTS; +global SEPCOUNTS; global CLIQCOUNTS; +global LOGDIFF; +clearGlobalVars; + +data = c.data; rowsFromInd = c.rowsFromInd; +noalle = c.noalle; adjprior = c.adjprior; priorTerm = c.priorTerm; +cliques = c.cliques; separators = c.separators; rows = c.rows; + +if isfield(c, 'dist') + dist = c.dist; Z = c.Z; +end + +clear c; + + +if nargin < 2; + npopstext = []; + teksti = {'Number of populations:', ... + 'Number of runs:'}; + def = {'20', '1'}; + + npopstextExtra = inputdlg(teksti ,... + 'Input parameters for the computation algorithm',1,def); + + if isempty(npopstextExtra) % cancel has been pressed + dispCancel + logml = 1; partitionSummary=1; npops=1; + return + end + npopstext = npopstextExtra{1}; + nrunstext = npopstextExtra{2}; + + clear teksti npopstextExtra; + if isempty(npopstext) + logml = 1; partitionSummary=1; npops=1; + return + else + npopsTable = str2num(npopstext); + npops = npopsTable(1); + + if npops==1 + logml = 1; partitionSummary=1; npops=1; + return + end + nrunsTable = str2num(nrunstext); + nruns = nrunsTable(1); + end +end + +maxnpops = npops; +logmlBest = -1e50; +partitionSummary = -1e50*ones(30,2,maxnpops); % Tiedot 30 parhaasta partitiosta (npops ja logml) +partitionSummary(:,1,:) = zeros(30,1,maxnpops); +worstLogml = -1e50*ones(1, maxnpops); worstIndex = ones(1, maxnpops); + +initData = data; +data = initData(:, 1:end-1); + +for run = 1:nruns + dispLine; + disp(['Run ' num2str(run) '/' num2str(nruns) ... + ', maximum number of populations ' num2str(npops) '.']); + disp(' '); + disp('Computing initial partition'); + + [initialPartition, counts, sumcounts] = initSpatialMultiMixture(initData, ... + npops, Z, rows, noalle, dist, adjprior, priorTerm,1); + + PARTITION = initialPartition; + [cliqcounts, sepcounts] = computeCounts(cliques, separators, npops); + + COUNTS = counts; SUMCOUNTS = sumcounts; + CLIQCOUNTS = cliqcounts; SEPCOUNTS = sepcounts; + + ninds = length(PARTITION); + %maxsize=max([max(noalle) npops]); + %initializeGammaln(ninds, rowsFromInd, maxsize); + logml = computeLogml(adjprior, priorTerm); + + LOGDIFF = repmat(-Inf,ninds,npops); + + nnotEmptyPops = length(unique(PARTITION)); + + if logml>worstLogml(nnotEmptyPops); + [partitionSummary(:,:,nnotEmptyPops), added] = addToSummary(logml, ... + partitionSummary(:,:,nnotEmptyPops), worstIndex(nnotEmptyPops)); + if (added==1) + [worstLogml(nnotEmptyPops), worstIndex(nnotEmptyPops)] = ... + min(partitionSummary(:,2,nnotEmptyPops)); + end + end + + clear initialPartition; clear counts; clear sumcounts; + clear cliqcounts; clear sepcounts; + + % PARHAAN MIXTURE-PARTITION ETSIMINEN + roundTypes = [1]; %Ykkösvaiheen sykli kahteen kertaan. + nRoundTypes = 7; + kokeiltu = zeros(nRoundTypes, 1); + ready = 0; vaihe = 1; + + + disp(['Mixture analysis started with initial ' num2str(nnotEmptyPops) ' populations.']); + + while ready ~= 1 + muutoksia = 0; + + disp(['Performing steps: ' num2str(roundTypes)]); + + for n = 1:length(roundTypes) + + round = roundTypes(n); + + if kokeiltu(round) == 1 %Askelta kokeiltu viime muutoksen jälkeen + + elseif round==0 | round==1 %Yksilön siirtäminen toiseen populaatioon. + + inds = randperm(ninds); + muutosNyt = 0; + for ind = inds + i1 = PARTITION(ind); + if length(find(PARTITION==i1))>1 + [muutokset, diffInCounts] = laskeMuutokset(ind, rows, ... + data, adjprior, priorTerm, logml, cliques, separators); + [maxMuutos, i2] = max(muutokset); + + if (i1~=i2 & maxMuutos>1e-5) + % Tapahtui muutos + muutoksia = 1; + if muutosNyt == 0 + disp('action 1'); + muutosNyt = 1; + kokeiltu = zeros(nRoundTypes,1); + end + updateGlobalVariables(ind, i2, diffInCounts,... + cliques, separators, adjprior, priorTerm); + logml = logml+maxMuutos; + + nnotEmptyPops = length(unique(PARTITION)); + + if logml>worstLogml(nnotEmptyPops); + [partitionSummary(:,:,nnotEmptyPops), added] = addToSummary(logml, ... + partitionSummary(:,:,nnotEmptyPops), worstIndex(nnotEmptyPops)); + if (added==1) + [worstLogml(nnotEmptyPops), worstIndex(nnotEmptyPops)] = ... + min(partitionSummary(:,2,nnotEmptyPops)); + end + end + end + end + end + if muutosNyt == 0 + kokeiltu(round) = 1; + end + + + elseif round==2 %Populaation yhdistäminen toiseen. + maxMuutos = -1e50; + partition = PARTITION; + counts = COUNTS; + sumcounts = SUMCOUNTS; + cliqcounts = CLIQCOUNTS; + sepcounts = SEPCOUNTS; + logdiff = LOGDIFF; + + % Two populations are merged at first. + for pop = 1:npops + [muutokset, diffInCounts] = laskeMuutokset2(pop, rows, ... + data, adjprior, priorTerm, logml, cliques, separators); + muutokset(pop)=-1e50; + [isoin, indeksi] = max(muutokset); + if isoin>maxMuutos + maxMuutos = isoin; + i1 = pop; + i2 = indeksi; + diffInCountsBest = diffInCounts; + end + end + + totalMuutos = maxMuutos; + updateGlobalVariables2(i1,i2, diffInCountsBest, ... + cliques, separators, adjprior, priorTerm); + + % A new population is formed from a part of another + emptyPop = i1; + maxMuutos = -1e50; + ninds = size(rows,1); + for pop = 1:npops + inds2 = find(PARTITION==pop); + ninds2 = length(inds2); + if ninds2>1 + dist2 = laskeOsaDist(inds2, dist, ninds); + Z2 = linkage(dist2'); + npops2 = 2; %Moneenko osaan jaetaan + T2 = cluster_own(Z2, npops2); + muutokset = laskeMuutokset3(T2, inds2, rows, data, ... + adjprior, priorTerm, pop, logml+totalMuutos, cliques, separators); + isoin = muutokset(1,emptyPop); + if isoin>maxMuutos + maxMuutos = isoin; + muuttuvat = inds2(find(T2==1)); + end + end + end + + rivit = []; + for i = 1:length(muuttuvat) + ind = muuttuvat(i); + lisa = rows(ind,1):rows(ind,2); + rivit = [rivit; lisa']; + %rivit = [rivit; rows(ind)']; + end + diffInCounts = computeDiffInCounts(rivit', size(COUNTS,1), ... + size(COUNTS,2), data); + pop = PARTITION(muuttuvat(1)); + inds2 = find(PARTITION == pop); + + updateGlobalVariables3(muuttuvat, diffInCounts, ... + adjprior, priorTerm, emptyPop, cliques, separators); + totalMuutos = totalMuutos + maxMuutos; + + % Individuals are moved between the populations. + inds = randperm(ninds); + for ind = inds + i1 = PARTITION(ind); + if length(find(PARTITION==i1))>1 + [muutokset, diffInCounts] = laskeMuutokset(ind, rows, ... + data, adjprior, priorTerm, logml+totalMuutos, cliques, separators); + + [maxMuutos, i2] = max(muutokset); + if (i1~=i2 & maxMuutos>1e-5) + updateGlobalVariables(ind, i2, diffInCounts,... + cliques, separators, adjprior, priorTerm); + totalMuutos = totalMuutos+maxMuutos; + end + end + end + + if totalMuutos>1e-5 + muutoksia = 1; + kokeiltu = zeros(nRoundTypes,1); + logml = logml + totalMuutos; + nnotEmptyPops = length(unique(PARTITION)); + disp('Action 2'); + + if logml>worstLogml(nnotEmptyPops); + [partitionSummary(:,:,nnotEmptyPops), added] = addToSummary(logml, ... + partitionSummary(:,:,nnotEmptyPops), worstIndex(nnotEmptyPops)); + if (added==1) + [worstLogml(nnotEmptyPops), worstIndex(nnotEmptyPops)] = ... + min(partitionSummary(:,2,nnotEmptyPops)); + end + end + + else + PARTITION = partition; + SUMCOUNTS = sumcounts; + COUNTS = counts; + CLIQCOUNTS = cliqcounts; + SEPCOUNTS = sepcounts; + LOGDIFF = logdiff; + kokeiltu(round) = 1; + end + + elseif round==3 | round==4 % Splitting a population + maxMuutos = 0; + for pop = 1:npops + inds2 = find(PARTITION==pop); + ninds2 = length(inds2); + if ninds2>5 + dist2 = laskeOsaDist(inds2, dist, ninds); + Z2 = linkage(dist2'); + if round==3 + npops2 = max(min(20, floor(ninds2/2)),2); %Moneenko osaan jaetaan + elseif round==4 + npops2 = 2; + end + T2 = cluster_own(Z2, npops2); + + muutokset = laskeMuutokset3(T2, inds2, rows, data, ...7 + adjprior, priorTerm, pop, logml, cliques, separators); + [isoin, indeksi] = max(muutokset(1:end)); + if isoin>maxMuutos + maxMuutos = isoin; + muuttuvaPop2 = rem(indeksi,npops2); + if muuttuvaPop2==0, muuttuvaPop2 = npops2; end + muuttuvat = inds2(find(T2==muuttuvaPop2)); + i2 = ceil(indeksi/npops2); + end + end + end + if maxMuutos>1e-5 + muutoksia = 1; + if round==3 + disp('action 3'); + else + disp('action 4'); + end + kokeiltu = zeros(nRoundTypes,1); + rivit = []; + for i = 1:length(muuttuvat) + ind = muuttuvat(i); + lisa = rows(ind,1):rows(ind,2); + rivit = [rivit lisa]; + end + diffInCounts = computeDiffInCounts(rivit, size(COUNTS,1), ... + size(COUNTS,2), data); + i1 = PARTITION(muuttuvat(1)); + updateGlobalVariables3(muuttuvat, diffInCounts, ... + adjprior, priorTerm, i2, cliques, separators); + logml = logml + maxMuutos; + + nnotEmptyPops = length(unique(PARTITION)); + + if logml>worstLogml(nnotEmptyPops); + [partitionSummary(:,:,nnotEmptyPops), added] = addToSummary(logml, ... + partitionSummary(:,:,nnotEmptyPops), worstIndex(nnotEmptyPops)); + if (added==1) + [worstLogml(nnotEmptyPops), worstIndex(nnotEmptyPops)] = ... + min(partitionSummary(:,2,nnotEmptyPops)); end + end + else + kokeiltu(round)=1; + end + + elseif round == 5 | round == 6 + j=0; + muutettu = 0; + %poplogml = POP_LOGML; + partition = PARTITION; + counts = COUNTS; + sumcounts = SUMCOUNTS; + cliqcounts = CLIQCOUNTS; + sepcounts = SEPCOUNTS; + logdiff = LOGDIFF; + + pops = randperm(npops); + while (j < npops & muutettu == 0) + j = j+1; + pop = pops(j); + totalMuutos = 0; + inds = find(PARTITION==pop); + if round == 5 + aputaulu = [inds rand(length(inds),1)]; + aputaulu = sortrows(aputaulu,2); + inds = aputaulu(:,1)'; + elseif round == 6 + inds = returnInOrder(inds, pop, ... + rows, data, adjprior, priorTerm); + end + + i=0; + + while (length(inds) > 0 & i < length(inds) - 1) % last ind is not removed from population + i = i+1; + ind =inds(i); + + [muutokset, diffInCounts] = laskeMuutokset(ind, rows, ... + data, adjprior, priorTerm, logml, cliques, separators); + muutokset(pop) = -1e50; % Varmasti ei suurin!!! + [maxMuutos, i2] = max(muutokset); + + updateGlobalVariables(ind, i2, diffInCounts, ... + cliques, separators, adjprior, priorTerm); + + totalMuutos = totalMuutos+maxMuutos; + logml = logml+maxMuutos; + if round == 6 + % Lopetetaan heti kun muutos on positiivinen. + if totalMuutos > 1e-5 + i=length(inds); + end + end + end + + if totalMuutos>1e-5 + if round == 5 + disp('action 5'); + elseif round == 6 + disp('action 6'); + end + kokeiltu = zeros(nRoundTypes,1); + muutettu=1; + muutoksia = 1; % Ulompi kirjanpito. + nnotEmptyPops = length(unique(PARTITION)); + + if logml>worstLogml(nnotEmptyPops); + [partitionSummary(:,:,nnotEmptyPops), added] = addToSummary(logml, ... + partitionSummary(:,:,nnotEmptyPops), worstIndex(nnotEmptyPops)); + if (added==1) + [worstLogml(nnotEmptyPops), worstIndex(nnotEmptyPops)] = ... + min(partitionSummary(:,2,nnotEmptyPops)); + end + end + else + % Missään vaiheessa tila ei parantunut. + % Perutaan kaikki muutokset. + PARTITION = partition; + SUMCOUNTS = sumcounts; + %POP_LOGML = poplogml; + COUNTS = counts; + logml = logml - totalMuutos; + CLIQCOUNTS = cliqcounts; + SEPCOUNTS = sepcounts; + LOGDIFF = logdiff; + kokeiltu(round)=1; + end + end + clear partition; clear sumcounts; clear counts; + clear cliqcounts; clear sepcounts; + + elseif round == 7 % A new population is formed and merged into another + emptyPop = npops + 1; + j = 0; + pops = randperm(npops); + muutoksiaNyt = 0; + while (j < npops) + j = j +1; + pop = pops(j); + inds2 = find(PARTITION==pop); + ninds2 = length(inds2); + if ninds2 > 5 + partition = PARTITION; + sumcounts = SUMCOUNTS; + counts = COUNTS; + cliqcounts = CLIQCOUNTS; + sepcounts = SEPCOUNTS; + oldLogml = logml; + logdiff = LOGDIFF; + + % a new population is formed temporarily + npops = npops + 1; + POP_LOGML(npops) = 0; + COUNTS(:,:,npops) = zeros(size(COUNTS(:,:,1))); + SUMCOUNTS(npops,:) = zeros(size(SUMCOUNTS(1,:))); + CLIQCOUNTS(:,npops) = zeros(size(CLIQCOUNTS(:,1))); + SEPCOUNTS(:,npops) = zeros(size(SEPCOUNTS(:,1))); + + dist2 = laskeOsaDist(inds2, dist, ninds); + Z2 = linkage(dist2'); + T2 = cluster_own(Z2, 2); + muuttuvat = inds2(find(T2==1)); + + rivit = []; + for i = 1:length(muuttuvat) + ind = muuttuvat(i); + lisa = rows(ind,1):rows(ind,2); + rivit = [rivit lisa]; + end + diffInCounts = computeDiffInCounts(rivit, size(COUNTS,1), ... + size(COUNTS,2), data); + updateGlobalVariables3(muuttuvat, diffInCounts, ... + adjprior, priorTerm, emptyPop, cliques, separators); + + logml = computeLogml(adjprior, priorTerm); + + % Individuals are moved between the populations. + inds = randperm(ninds); + for ind = inds + i1 = PARTITION(ind); + if length(find(PARTITION==i1))>1 + [muutokset, diffInCounts] = laskeMuutokset(ind, rows, ... + data, adjprior, priorTerm, logml, cliques, separators); + + [maxMuutos, i2] = max(muutokset); + if (i1~=i2 & maxMuutos>1e-5) + updateGlobalVariables(ind, i2, diffInCounts,... + cliques, separators, adjprior, priorTerm); + logml = logml + maxMuutos; + end + end + end + + % Merge two populations if the number of + % populations is too big. + if length(find(any(SUMCOUNTS,2))) == npops + maxMuutos = -1e50; + for pop = 1:npops + [muutokset, diffInCounts] = laskeMuutokset2(pop, rows, ... + data, adjprior, priorTerm, logml, cliques, separators); + muutokset(pop)=-1e50; + [isoin, indeksi] = max(muutokset); + if isoin>maxMuutos + maxMuutos = isoin; + i1 = pop; + i2 = indeksi; + diffInCountsBest = diffInCounts; + end + end + updateGlobalVariables2(i1,i2, diffInCountsBest, ... + cliques, separators, adjprior, priorTerm); + logml = logml + maxMuutos; + end + + if logml > oldLogml + 1e-5 + muutoksia = 1; + npops = poistaTyhjatPopulaatiot(npops); + nnotEmptyPops = length(unique(PARTITION)); + + if logml>worstLogml(nnotEmptyPops); + [partitionSummary(:,:,nnotEmptyPops), added] = addToSummary(logml, ... + partitionSummary(:,:,nnotEmptyPops), worstIndex(nnotEmptyPops)); + if (added==1) + [worstLogml(nnotEmptyPops), worstIndex(nnotEmptyPops)] = ... + min(partitionSummary(:,2,nnotEmptyPops)); end + end + if muutoksiaNyt == 0 + disp('action 7'); + muutoksiaNyt = 1; + end + kokeiltu = zeros(nRoundTypes,1); + j = npops; % Lopetetaan, koska muutos syntyi + else + %palutetaan vanhat arvot + PARTITION = partition; + SUMCOUNTS = sumcounts; + COUNTS = counts; + CLIQCOUNTS = cliqcounts; + SEPCOUNTS = sepcounts; + LOGDIFF = logdiff; + npops = npops-1; + logml = oldLogml; + if abs(logml-computeLogml(adjprior, priorTerm))>1e-5 + keyboard + end + end + + end + + end + + if muutoksiaNyt == 0 + kokeiltu(round)=1; + end + + end + + end + + + if muutoksia == 0 + if vaihe==1 + vaihe = 2; + elseif vaihe==2 + vaihe = 3; + elseif vaihe==3 + vaihe = 4; + elseif vaihe==4; + vaihe = 5; + elseif vaihe==5 + ready = 1; + end + else + muutoksia = 0; + end + + if ready==0 + if vaihe==1 + roundTypes=[1]; + elseif vaihe==2 + roundTypes=[2 1]; + elseif vaihe==3 + roundTypes=[5 5 7]; + elseif vaihe==4 + roundTypes=[4 3 1]; + elseif vaihe==5 + roundTypes=[6 2 7 3 4 1]; + end + end + end + + % TALLENNETAAN + + npops = poistaTyhjatPopulaatiot(npops); + %POP_LOGML = computePopulationLogml(1:npops, adjprior, priorTerm); + disp(['Found partition with ' num2str(npops) ' populations.']); + disp(['Log(ml) = ' num2str(logml)]); + disp(' '); + + + + if logml>logmlBest + % Päivitetään parasta löydettyä partitiota. + logmlBest = logml; + npopsBest = npops; + partitionBest = PARTITION; + countsBest = COUNTS; + sumCountsBest = SUMCOUNTS; + %pop_logmlBest = POP_LOGML; + cliqCountsBest = CLIQCOUNTS; + sepCountsBest = SEPCOUNTS; + logdiffbest = LOGDIFF; + end +end + +logml = logmlBest; +npops = npopsBest; +PARTITION = partitionBest; +COUNTS = countsBest; +SUMCOUNTS = sumCountsBest; +%POP_LOGML = pop_logmlBest; +CLIQCOUNTS = cliqCountsBest; +SEPCOUNTS = sepCountsBest; +LOGDIFF = logdiffbest; + +%------------------------------------------------------------------------------------- + +function clearGlobalVars + +global COUNTS; COUNTS = []; +global SUMCOUNTS; SUMCOUNTS = []; +global PARTITION; PARTITION = []; +%global POP_LOGML; POP_LOGML = []; +global SEPCOUNTS; SEPCOUNTS = []; +global CLIQCOUNTS; CLIQCOUNTS = []; +global LOGDIFF; LOGDIFF = []; + +%-------------------------------------------------------------------------- + + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex) +% Tiedetään, että annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ssä ei vielä ole +% annettua logml arvoa, niin lisätään worstIndex:in kohtaan uusi logml ja +% nykyistä partitiota vastaava nclusters:in arvo. Muutoin ei tehdä mitään. + +apu = find(abs(partitionSummary(:,2)-logml)<1e-5); +if isempty(apu) + % Nyt löydetty partitio ei ole vielä kirjattuna summaryyn. + global PARTITION; + npops = length(unique(PARTITION)); + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end + + +%-------------------------------------------------------------------------- + + +function [suurin, i2] = arvoSeuraavaTila(muutokset, logml) +% Suorittaa yksilön seuraavan tilan arvonnan + +y = logml + muutokset; % siirron jälkeiset logml:t +y = y - max(y); +y = exp(y); +summa = sum(y); +y = y/summa; +y = cumsum(y); + +i2 = rand_disc(y); % uusi kori +suurin = muutokset(i2); + + +%-------------------------------------------------------------------------------------- + + +function svar=rand_disc(CDF) +%returns an index of a value from a discrete distribution using inversion method +slump=rand; +har=find(CDF>slump); +svar=har(1); + + +%------------------------------------------------------------------------------------- + + +function updateGlobalVariables(ind, i2, diffInCounts, ... + cliques, separators, adjprior, priorTerm) +% Suorittaa globaalien muuttujien muutokset, kun yksilö ind +% siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global CLIQCOUNTS; +global SEPCOUNTS; +global LOGDIFF; + +i1 = PARTITION(ind); +PARTITION(ind)=i2; + +diffInCliqCounts = computeDiffInCliqCounts(cliques, ind); +diffInSepCounts = computeDiffInCliqCounts(separators, ind); + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; +CLIQCOUNTS(:,i2) = CLIQCOUNTS(:,i2) + diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; +SEPCOUNTS(:,i2) = SEPCOUNTS(:,i2) + diffInSepCounts; + +LOGDIFF(:,[i1 i2]) = -Inf; +inx = [find(PARTITION==i1); find(PARTITION==i2)]; +LOGDIFF(inx,:) = -Inf; + +%POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + + +%--------------------------------------------------------------------------------- + + +function updateGlobalVariables2(i1, i2, diffInCounts, ... + cliques, separators, adjprior, priorTerm); +% Suorittaa globaalien muuttujien muutokset, kun kaikki +% korissa i1 olevat yksilöt siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +%global POP_LOGML; +global CLIQCOUNTS; +global SEPCOUNTS; +global LOGDIFF; + +inds = find(PARTITION==i1); +PARTITION(inds) = i2; + +diffInCliqCounts = CLIQCOUNTS(:,i1); +diffInSepCounts = SEPCOUNTS(:,i1); + + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +CLIQCOUNTS(:,i1) = 0; +CLIQCOUNTS(:,i2) = CLIQCOUNTS(:,i2) + diffInCliqCounts; +SEPCOUNTS(:,i1) = 0; +SEPCOUNTS(:,i2) = SEPCOUNTS(:,i2) + diffInSepCounts; + +LOGDIFF(:,[i1 i2]) = -Inf; +inx = [find(PARTITION==i1); find(PARTITION==i2)]; +LOGDIFF(inx,:) = -Inf; + + +%------------------------------------------------------------------------------------ + + +function updateGlobalVariables3(muuttuvat, diffInCounts, ... + adjprior, priorTerm, i2, cliques, separators); +% Suorittaa globaalien muuttujien päivitykset, kun yksilöt 'muuttuvat' +% siirretään koriin i2. Ennen siirtoa yksilöiden on kuuluttava samaan +% koriin. + +global PARTITION; +global COUNTS; global CLIQCOUNTS; +global SUMCOUNTS; global SEPCOUNTS; +global LOGDIFF; +%global POP_LOGML; + +i1 = PARTITION(muuttuvat(1)); +PARTITION(muuttuvat) = i2; + +diffInCliqCounts = computeDiffInCliqCounts(cliques, muuttuvat); +diffInSepCounts = computeDiffInCliqCounts(separators, muuttuvat); + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; +CLIQCOUNTS(:,i2) = CLIQCOUNTS(:,i2) + diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; +SEPCOUNTS(:,i2) = SEPCOUNTS(:,i2) + diffInSepCounts; + +LOGDIFF(:,[i1 i2]) = -Inf; +inx = [find(PARTITION==i1); find(PARTITION==i2)]; +LOGDIFF(inx,:) = -Inf; + +%POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + + +%---------------------------------------------------------------------- + + +function inds = returnInOrder(inds, pop, globalRows, data, ... + adjprior, priorTerm) +% Palauttaa yksilöt järjestyksessä siten, että ensimmäisenä on +% se, jonka poistaminen populaatiosta pop nostaisi logml:n +% arvoa eniten. + +global COUNTS; global SUMCOUNTS; +ninds = length(inds); +apuTaulu = [inds, zeros(ninds,1)]; + +for i=1:ninds + ind =inds(i); + rows = globalRows(i,1):globalRows(i,2); + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,pop) = COUNTS(:,:,pop)-diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)-diffInSumCounts; + apuTaulu(i, 2) = computePopulationLogml(pop, adjprior, priorTerm); + COUNTS(:,:,pop) = COUNTS(:,:,pop)+diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)+diffInSumCounts; +end +apuTaulu = sortrows(apuTaulu,2); +inds = apuTaulu(ninds:-1:1,1); + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = laskeMuutokset(ind, globalRows, ... + data, adjprior, priorTerm, logml, cliques, separators) +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mikä olisi +% muutos logml:ssä, mikäli yksilöt inds siirretään koriin i. +% diffInCounts on poistettava COUNTS:in siivusta i1 ja lisättävä +% COUNTS:in siivuun i2, mikäli muutos toteutetaan. + +global COUNTS; global SUMCOUNTS; +global PARTITION; %global POP_LOGML; +global CLIQCOUNTS; global SEPCOUNTS; +global LOGDIFF; + +npops = size(COUNTS,3); +muutokset = LOGDIFF(ind,:); + +counts = COUNTS; +sumcounts = SUMCOUNTS; + +[emptyPop, pops] = findEmptyPop(npops); + +i1 = PARTITION(ind); +muutokset(i1) = 0; + +i2 = [pops(find(pops~=i1))]; +if emptyPop > 0 + i2 =[i2 emptyPop]; +end + +i2 = sort(i2); +laskematta = find(muutokset==-Inf); +i2 = intersect(i2,laskematta); + +rows = globalRows(ind,1):globalRows(ind,2); +diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); + +diffInCliqCounts = computeDiffInCliqCounts(cliques, ind); +diffInSepCounts = computeDiffInCliqCounts(separators, ind); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; + +for i=i2 + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) + diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) + diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) + diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) + diffInSumCounts; + + muutokset(i) = computeLogml(adjprior, priorTerm) - logml; + + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) - diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) - diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) - diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) - diffInSumCounts; +end + +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) + diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) + diffInSepCounts; + +% Asetetaan muillekin tyhjille populaatioille sama muutos, kuin +% emptyPop:lle + +if emptyPop > 0 + empties = mysetdiff((1:npops), [i2 i1]); + muutokset(empties) = muutokset(emptyPop); +end + +COUNTS = counts; +SUMCOUNTS = sumcounts; +LOGDIFF(ind,:) = muutokset; + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = laskeMuutokset2(i1, globalRows, ... + data, adjprior, priorTerm, logml, cliques, separators); +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mikä olisi +% muutos logml:ssä, mikäli korin i1 kaikki yksilöt siirretään +% koriin i. +% Laskee muutokset vain yhdelle tyhjälle populaatiolle, muille tulee +% muutokseksi 0. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +global CLIQCOUNTS; global SEPCOUNTS; + +npops = size(COUNTS,3); +muutokset = zeros(npops,1); + +[emptyPop, pops] = findEmptyPop(npops); + +i2 = [pops(find(pops~=i1))]; +if emptyPop > 0 + i2 =[i2 emptyPop]; +end + +inds = find(PARTITION == i1); +ninds = length(inds); + +rows = []; +for i = 1:ninds + rows = [rows globalRows(inds(i),1):globalRows(inds(i),2)]; +end +diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); +diffInCliqCounts = computeDiffInCliqCounts(cliques, inds); +diffInSepCounts = computeDiffInCliqCounts(separators, inds); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +CLIQCOUNTS(:,i1) = 0; +SEPCOUNTS(:,i1) = 0; + +for i=i2 + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) + diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) + diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) + diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) + diffInSumCounts; + + muutokset(i) = computeLogml(adjprior, priorTerm) - logml; + + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) - diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) - diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) - diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) - diffInSumCounts; +end + +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; +CLIQCOUNTS(:,i1) = diffInCliqCounts; +SEPCOUNTS(:,i1) = diffInSepCounts; + + + +%------------------------------------------------------------------------------------ + +function muutokset = laskeMuutokset3(T2, inds2, globalRows, ... + data, adjprior, priorTerm, i1, logml, cliques, separators) +% Palauttaa length(unique(T2))*npops taulun, jossa (i,j):s alkio +% kertoo, mikä olisi muutos logml:ssä, jos populaation i1 osapopulaatio +% inds2(find(T2==i)) siirretään koriin j. +% Laskee vain yhden tyhjän populaation, muita kohden muutokseksi jää 0. + + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +global CLIQCOUNTS; global SEPCOUNTS; + +npops = size(COUNTS,3); +npops2 = length(unique(T2)); +muutokset = zeros(npops2, npops); + +for pop2 = 1:npops2 + inds = inds2(find(T2==pop2)); + ninds = length(inds); + if ninds>0 + rows = []; + for i = 1:ninds + ind = inds(i); + rows = [rows; (globalRows(ind,1):globalRows(ind,2))']; + end + diffInCounts = computeDiffInCounts(rows', size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + diffInCliqCounts = computeDiffInCliqCounts(cliques, inds); + diffInSepCounts = computeDiffInCliqCounts(separators, inds); + + COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; + CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; + SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; + + [emptyPop, pops] = findEmptyPop(npops); + i2 = [pops(find(pops~=i1))]; + if emptyPop > 0 + i2 =[i2 emptyPop]; + end + + for i = i2 + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) + diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) + diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) + diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) + diffInSumCounts; + + muutokset(pop2,i) = computeLogml(adjprior, priorTerm) - logml; + + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) - diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) - diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) - diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) - diffInSumCounts; + end + + COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) + diffInCliqCounts; + SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) + diffInSepCounts; + end +end + +%-------------------------------------------------------------------------- +function muutokset = laskeMuutokset5(inds, globalRows, data, ... + adjprior, priorTerm, logml, cliques, separators, i1, i2) + +% Palauttaa length(inds)*1 taulun, jossa i:s alkio kertoo, mikä olisi +% muutos logml:ssä, mikäli yksilö i vaihtaisi koria i1:n ja i2:n välillä. + +global COUNTS; global SUMCOUNTS; +global PARTITION; +global CLIQCOUNTS; global SEPCOUNTS; + +ninds = length(inds); +muutokset = zeros(ninds,1); + +for i = 1:ninds + ind = inds(i); + + rows = globalRows(ind,1):globalRows(ind,2); + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + if PARTITION(ind)==i1 + pop1 = i1; %mistä + pop2 = i2; %mihin + else + pop1 = i2; + pop2 = i1; + end + + diffInCliqCounts = computeDiffInCliqCounts(cliques, ind); + diffInSepCounts = computeDiffInCliqCounts(separators, ind); + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)-diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)-diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)+diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)+diffInSumCounts; + + CLIQCOUNTS(:,pop1) = CLIQCOUNTS(:,pop1) - diffInCliqCounts; + CLIQCOUNTS(:,pop2) = CLIQCOUNTS(:,pop2) + diffInCliqCounts; + SEPCOUNTS(:,pop1) = SEPCOUNTS(:,pop1) - diffInSepCounts; + SEPCOUNTS(:,pop2) = SEPCOUNTS(:,pop2) + diffInSepCounts; + + muutokset(i) = computeLogml(adjprior, priorTerm) - logml; + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)+diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)+diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)-diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)-diffInSumCounts; + + CLIQCOUNTS(:,pop1) = CLIQCOUNTS(:,pop1) + diffInCliqCounts; + CLIQCOUNTS(:,pop2) = CLIQCOUNTS(:,pop2) - diffInCliqCounts; + SEPCOUNTS(:,pop1) = SEPCOUNTS(:,pop1) + diffInSepCounts; + SEPCOUNTS(:,pop2) = SEPCOUNTS(:,pop2) - diffInSepCounts; +end + +%-------------------------------------------------------------------------- + +function diffInCounts = computeDiffInCounts(rows, max_noalle, nloci, data) +% Muodostaa max_noalle*nloci taulukon, jossa on niiden alleelien +% lukumäärät (vastaavasti kuin COUNTS:issa), jotka ovat data:n +% riveillä rows. + +diffInCounts = zeros(max_noalle, nloci); +for i=rows + row = data(i,:); + notEmpty = find(row>=0); + + if length(notEmpty)>0 + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) = ... + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) + 1; + end +end + + + +%------------------------------------------------------------------------------------ + + +function popLogml = computePopulationLogml(pops, adjprior, priorTerm) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + +global COUNTS; +global SUMCOUNTS; +x = size(COUNTS,1); +y = size(COUNTS,2); +z = length(pops); + +popLogml = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior,[1 1 length(pops)]) + COUNTS(:,:,pops)) ... + ,[x y z]),1),2)) - sum(gammaln(1+SUMCOUNTS(pops,:)),2) - priorTerm; + +%------------------------------------------------------------------------------------ + +function npops = poistaTyhjatPopulaatiot(npops) +% Poistaa tyhjentyneet populaatiot COUNTS:ista ja +% SUMCOUNTS:ista. Päivittää npops:in ja PARTITION:in. + +global COUNTS; +global SUMCOUNTS; +global PARTITION; +global CLIQCOUNTS; +global SEPCOUNTS; +global LOGDIFF; + +notEmpty = find(any(SUMCOUNTS,2)); +COUNTS = COUNTS(:,:,notEmpty); +SUMCOUNTS = SUMCOUNTS(notEmpty,:); +CLIQCOUNTS = CLIQCOUNTS(:,notEmpty); +SEPCOUNTS = SEPCOUNTS(:,notEmpty); +LOGDIFF = LOGDIFF(:,notEmpty); + +for n=1:length(notEmpty) + apu = find(PARTITION==notEmpty(n)); + PARTITION(apu)=n; +end +npops = length(notEmpty); + + +%---------------------------------------------------------------------------------- +%Seuraavat kolme funktiota liittyvat alkupartition muodostamiseen. + +function initial_partition=admixture_initialization(data_matrix,nclusters,Z) +size_data=size(data_matrix); +nloci=size_data(2)-1; +n=max(data_matrix(:,end)); +T=cluster_own(Z,nclusters); +initial_partition=zeros(size_data(1),1); +for i=1:n + kori=T(i); + here=find(data_matrix(:,end)==i); + for j=1:length(here) + initial_partition(here(j),1)=kori; + end +end + +function T = cluster_own(Z,nclust) +true=logical(1); +false=logical(0); +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); + % maximum number of clusters based on inconsistency + if m <= maxclust + T = (1:m)'; + elseif maxclust==1 + T = ones(m,1); + else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end + end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + + +function Z = linkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 | m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end + + + +%----------------------------------------------------------------------------------- +% Laskee arvot cliqcounts:lle ja sepcounts:lle + +function [cliqcounts, sepcounts] = computeCounts(cliques, separators, npops) + +global PARTITION; +ncliq = size(cliques,1); +nsep = size(separators,1); + +cliqPartition = zeros(ncliq, size(cliques,2)); +sepPartition = zeros(nsep, size(separators, 2)); + +apuCliq = find(cliques > 0); +apuSep = find(separators > 0); + +cliqPartition(apuCliq) = PARTITION(cliques(apuCliq)); +sepPartition(apuSep) = PARTITION(separators(apuSep)); + + +cliqcounts = zeros(ncliq, npops); +for i = 1:npops + cliqcounts(:,i) = sum(cliqPartition == i, 2); +end + + +sepcounts = zeros(nsep, npops); +for i = 1:npops + sepcounts(:,i) = sum(sepPartition == i, 2); +end + +%------------------------------------------------------------------------- + +function diffInCliqCounts = computeDiffInCliqCounts(cliques, inds) +% Laskee muutoksen CLIQCOUNTS:ssa (tai SEPCOUNTS:ssa, jos syötteenä +% separators) kun yksilöt inds siirretään. +% diffInCliqcounts on ncliq*1 taulu, joka on CLIQCOUNTS:n sarakkeesta josta +% yksilöt inds siirretään ja lisättävä sarakkeeseen, johon yksilöt +% siirretään. + +ncliq = size(cliques,1); +diffInCliqCounts = zeros(ncliq,1); +ninds = length(inds); +for i = 1:ninds + ind = inds(i); + rivit = sum((cliques == ind),2); + diffInCliqCounts = diffInCliqCounts + rivit; +end + +%----------------------------------------------------------------------- + +function [logml, spatialPrior] = computeLogml(adjprior,priorTerm,cliqcounts,sepcounts) + +%global GAMMA_LN; +global CLIQCOUNTS; +global SEPCOUNTS; +global PARTITION; + +if nargin < 3 + + notEmpty = any(CLIQCOUNTS); + npops = length(find(notEmpty == 1)); + sumcliq=sum(CLIQCOUNTS, 2); + sumsep=sum(SEPCOUNTS, 2); + ncliq = size(CLIQCOUNTS, 1); + nsep = size(SEPCOUNTS, 1); + + cliqsizes = sum(CLIQCOUNTS, 2)'; + sepsizes = sum(SEPCOUNTS, 2)'; + cliqsizes = min([cliqsizes; npops*ones(1,ncliq)])'; + sepsizes = min([sepsizes; npops*ones(1,nsep)])'; + + klikkitn = sum(sum(gammaln(CLIQCOUNTS(:,notEmpty) + repmat(1./cliqsizes, [1 npops])))) ... + - sum(npops*(gammaln(1./cliqsizes))) ... + - sum(gammaln(sumcliq + 1)); + + septn = sum(sum(gammaln(SEPCOUNTS(:,notEmpty) + repmat(1./sepsizes, [1 npops])))) ... + - sum(npops*(gammaln(1./sepsizes))) ... + - sum(gammaln(sumsep + 1)); + +else + notEmpty = any(cliqcounts); + npops = length(find(notEmpty == 1)); + sumcliq=sum(cliqcounts, 2); + sumsep=sum(sepcounts, 2); + ncliq = size(cliqcounts, 1); + nsep = size(sepcounts, 1); + + cliqsizes = sum(cliqcounts, 2)'; + sepsizes = sum(sepcounts, 2)'; + cliqsizes = min([cliqsizes; npops*ones(1,ncliq)])'; + sepsizes = min([sepsizes; npops*ones(1,nsep)])'; + + klikkitn = sum(sum(gammaln(cliqcounts(:,notEmpty) + repmat(1./cliqsizes, [1 npops])))) ... + - sum(npops*(gammaln(1./cliqsizes))) ... + - sum(gammaln(sumcliq + 1)); + + septn = sum(sum(gammaln(sepcounts(:,notEmpty) + repmat(1./sepsizes, [1 npops])))) ... + - sum(npops*(gammaln(1./sepsizes))) ... + - sum(gammaln(sumsep + 1)); +end + + + +%klikkitn = sum(sum(gammaln(CLIQCOUNTS + 1/npops))) ... +% - ncliq*npops*(gammaln(1/npops)) ... +% - sum(gammaln(sumcliq + 1)); +%septn = sum(sum(gammaln(SEPCOUNTS + 1/npops))) ... +% - nsep*npops*(gammaln(1/npops)) ... +% - sum(gammaln(sumsep + 1)); + +spatialPrior = (klikkitn - septn); + +%if spatialPrior > 0 +% keyboard +%end + + +global COUNTS; +global SUMCOUNTS; +x = size(COUNTS,1); +y = size(COUNTS,2); +z = size(COUNTS,3); + +popLogml = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior,[1 1 z]) + COUNTS) ... + ,[x y z]),1),2)) - sum(gammaln(1+SUMCOUNTS),2) - priorTerm; + +logml = sum(popLogml) + spatialPrior; + +%-------------------------------------------------------------------------- + + +function initializeGammaln(ninds, rowsFromInd, maxSize) +%Alustaa GAMMALN muuttujan s.e. GAMMALN(i,j)=gammaln((i-1) + 1/j) +global GAMMA_LN; +GAMMA_LN = zeros((1+ninds)*rowsFromInd, maxSize); +for i=1:(ninds+1)*rowsFromInd + for j=1:maxSize + GAMMA_LN(i,j)=gammaln((i-1) + 1/j); + end +end + + +%---------------------------------------------------------------------------- + + +function dist2 = laskeOsaDist(inds2, dist, ninds) +% Muodostaa dist vektorista osavektorin, joka sisältää yksilöiden inds2 +% väliset etäisyydet. ninds=kaikkien yksilöiden lukumäärä. + +ninds2 = length(inds2); +apu = zeros(nchoosek(ninds2,2),2); +rivi = 1; +for i=1:ninds2-1 + for j=i+1:ninds2 + apu(rivi, 1) = inds2(i); + apu(rivi, 2) = inds2(j); + rivi = rivi+1; + end +end +apu = (apu(:,1)-1).*ninds - apu(:,1) ./ 2 .* (apu(:,1)-1) + (apu(:,2)-apu(:,1)); +dist2 = dist(apu); + +%-------------------------------------------------------------------- + + +function newline = takeLine(description,width) +%Returns one line from the description: line ends to the first +%space after width:th mark. +newLine = description(1:width); +n = width+1; +while ~isspace(description(n)) & n= 1 + suurinYks = suurinYks+1; + end + if suurinYks<10 + mjono(7) = num2str(suurinYks); + mjono(6) = 'e'; + mjono(5) = palautaYks(abs(logml),suurinYks-1); + mjono(4) = '.'; + mjono(3) = palautaYks(abs(logml),suurinYks); + if logml<0 + mjono(2) = '-'; + end + elseif suurinYks>=10 + mjono(6:7) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(logml),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(logml),suurinYks); + if logml<0 + mjono(1) = '-'; + end + end +end + +function digit = palautaYks(num,yks) +% palauttaa luvun num 10^yks termin kertoimen +% string:inä +% yks täytyy olla kokonaisluku, joka on +% vähintään -1:n suuruinen. Pienemmillä +% luvuilla tapahtuu jokin pyöristysvirhe. + +if yks>=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); + + +function mjono = kldiv2str(div) +mjono = ' '; +if abs(div)<100 + %Ei tarvita e-muotoa + mjono(6) = num2str(rem(floor(div*1000),10)); + mjono(5) = num2str(rem(floor(div*100),10)); + mjono(4) = num2str(rem(floor(div*10),10)); + mjono(3) = '.'; + mjono(2) = num2str(rem(floor(div),10)); + arvo = rem(floor(div/10),10); + if arvo>0 + mjono(1) = num2str(arvo); + end + +else + suurinYks = floor(log10(div)); + mjono(6) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(div),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(div),suurinYks); +end + +%-------------------------------------------------------------------------- + +function C = mysetdiff(A,B) +% MYSETDIFF Set difference of two sets of positive integers (much faster than built-in setdiff) +% C = mysetdiff(A,B) +% C = A \ B = { things in A that are not in B } +% +% Original by Kevin Murphy, modified by Leon Peshkin + +if isempty(A) + C = []; + return; +elseif isempty(B) + C = A; + return; +else % both non-empty + bits = zeros(1, max(max(A), max(B))); + bits(A) = 1; + bits(B) = 0; + C = A(logical(bits(A))); +end + + +%-------------------------------------------------------------------------- + +function logml = checkLogml(priorTerm, adjprior, cliques, separators) +% tarkistaa logml:n + +global CLIQCOUNTS; +global SEPCOUNTS; +global PARTITION; + +npops = length(unique(PARTITION)); +[cliqcounts, sepcounts] = computeCounts(cliques, separators, npops); + +%CLIQCOUNTS = cliqcounts; +%SEPCOUNTS = sepcounts; + + +[logml, spatialPrior] = computeLogml(adjprior, priorTerm,cliqcounts,sepcounts); + +disp(['logml: ' logml2String(logml) ', spatial prior: ' logml2String(spatialPrior)]); + +%-------------------------------------------------------------------------- + +function [emptyPop, pops] = findEmptyPop(npops) +% Palauttaa ensimmäisen tyhjän populaation indeksin. Jos tyhjiä +% populaatioita ei ole, palauttaa -1:n. + +global PARTITION; + +pops = unique(PARTITION)'; +if (length(pops) ==npops) + emptyPop = -1; +else + popDiff = diff([0 pops npops+1]); + emptyPop = min(find(popDiff > 1)); +end + + +%-------------------------------------------------------------------------- + +function [sumcounts, counts, logml] = ... + initialCounts(partition, data, npops, rowsFromInd, noalle) + +nloci=size(data,2); +ninds = size(data,1)/rowsFromInd; + +counts = zeros(max(noalle),nloci,npops); +sumcounts = zeros(npops,nloci); +for i=1:npops + for j=1:nloci + havainnotLokuksessa = find(partition==i & data(:,j)>=0); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(havainnotLokuksessa,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + +%--------------------------------------------------------------- + + +function dispLine; +disp('---------------------------------------------------'); diff --git a/matlab/spatial/spatialMixture.m b/matlab/spatial/spatialMixture.m new file mode 100644 index 0000000..79afd5a --- /dev/null +++ b/matlab/spatial/spatialMixture.m @@ -0,0 +1,2609 @@ +function spatialMixture +%Vaihtuvalla populaatioiden määräll? priori 3:lla +base = findobj('Tag','base_figure'); % added by jing 21.11.06 + +% check whether fixed k mode is selected +h0 = findobj('Tag','fixk_menu'); +fixedK = get(h0, 'userdata'); + +if fixedK + if ~(fixKWarning == 1) % call function fixKWarning + return + end +end + +% output file name +OUTPUT_FILE = 'baps6_output.baps'; % also remember to update the file name in function WriteMixtureInfo + +% check whether partition compare mode is selected +h1 = findobj('Tag','partitioncompare_menu'); +partitionCompare = get(h1, 'userdata'); + +formatList = {'BAPS-format','FASTA-format', 'GenePop-format', 'Preprocessed data'}; +formatChoice = menu('Specify the format of your data: ','BAPS-format','FASTA-format', 'GenePop-format', 'Preprocessed data'); +if formatChoice==0 + return; +else + input_type = formatList{formatChoice}; +end + +switch input_type + +case 'BAPS-format' + %waitALittle; + setWindowOnTop(base,'false') + [filename1, pathname1] = uigetfile('*.txt', 'Load data in BAPS-format'); + if filename1==0 + return; + end + + if ~isempty(partitionCompare) + fprintf(1,'Data: %s\n',[pathname filename]); + end + data = load([pathname1 filename1]); + ninds = testaaOnkoKunnollinenBapsData(data); %TESTAUS + if (ninds==0) + disp('Incorrect Data-file.'); + return; + end + + %waitALittle; + setWindowOnTop(base,'false') + [filename2,pathname2]=uigetfile('*.txt', 'Load individual coordinates'); + if filename2==0 + return + end + + coordinates = load([pathname2 filename2]); + % viallinen = testaaKoordinaatit(ninds, coordinates); + [viallinen coordinates] = testaaKoordinaatit(ninds, coordinates); % added by Lu Cheng, 05.12.2012 + if viallinen + disp('Incorrect coordinates'); + return + end + + inp = [filename1 ' & ' filename2]; + h0 = findobj('Tag','filename1_text'); + set(h0,'String',inp); + clear h0; clear inp; + clear filename1; clear filename2; clear pathname1; clear pathname2; + + input_pops = questdlg(['When using data which are in BAPS-format, '... + 'you can specify the sampling populations of the individuals by '... + 'giving two additional files: one containing the names of the '... + 'populations, the other containing the indices of the first '... + 'individuals of the populations. Do you wish to specify the '... + 'sampling populations?'], ... + 'Specify sampling populations?',... + 'Yes', 'No', 'No'); + if isequal(input_pops,'Yes') + %waitALittle; + setWindowOnTop(base,'false') + [namefile, namepath] = uigetfile('*.txt', 'Load population names'); + if namefile==0 + kysyToinen = 0; + else + kysyToinen = 1; + end + if kysyToinen==1 + %waitALittle; + setWindowOnTop(base,'false') + [indicesfile, indicespath] = uigetfile('*.txt', 'Load population indices'); + if indicesfile==0 + popnames = []; + else + popnames = initPopNames([namepath namefile],[indicespath indicesfile]); + end + else + popnames = []; + end + else + popnames = []; + end + + disp('Pre-processing the data. This may take several minutes.'); + + [data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(data); + [Z,dist] = newGetDistances(data,rowsFromInd); + [cliques, separators, vorPoints, vorCells, pointers] = ... + handleCoords(coordinates); + + save_preproc = questdlg('Do you wish to save pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + %waitALittle; + [filename, pathname] = uiputfile('*.mat','Save pre-processed data as'); + kokonimi = [pathname filename]; + c.data = data; c.rowsFromInd = rowsFromInd; c.alleleCodes = alleleCodes; + c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; + c.dist = dist; c.popnames = popnames; c.Z = Z; + c.cliques = cliques; c.separators = separators; + c.vorPoints = vorPoints; c.vorCells = vorCells; + c.pointers = pointers; c.coordinates = coordinates; +% save(kokonimi,'c'); + save(kokonimi,'c', '-v7.3'); % added by Lu Cheng, 08.06.2012 + clear c; + end; + +%%%%%%%%%%%%% added by Lu Cheng 11.11.2012 START %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +case 'FASTA-format' + + setWindowOnTop(base,'false') + [filename1, pathname1] = uigetfile({'*.fasta';'*.*'}, 'Load data in FASTA-format'); + if filename1==0 + return; + end + + if ~isempty(partitionCompare) + fprintf(1,'Data: %s\n',[pathname filename]); + end + %data = load([pathname1 filename1]); + [heds, seqs] = fastaread([pathname1 filename1]); + seqs = seqs(:); + alnMat = cell2mat(seqs); + nSeq = length(seqs); + clear seqs; + + cc = preprocAln(alnMat); + cc.heds = heds; + cc.nSeq = nSeq; + dist = seqpdist(alnMat,'method','p-distance'); + Z = linkage(dist,'complete'); + + ninds = nSeq; + + clear alnMat heds nSeq + +% [ninds,data,heds]=testFastaData([pathname1 filename1]); + + setWindowOnTop(base,'false') + [filename2,pathname2]=uigetfile('*.txt', 'Load individual coordinates'); + if filename2==0 + return + end + + coordinates = load([pathname2 filename2]); + %viallinen = testaaKoordinaatit(ninds, coordinates); + [viallinen coordinates] = testaaKoordinaatit(ninds, coordinates); % added by Lu Cheng, 05.12.2012 + if viallinen + disp('Incorrect coordinates'); + return + end + + inp = [filename1 ' & ' filename2]; + h0 = findobj('Tag','filename1_text'); + set(h0,'String',inp); + clear h0; clear inp; + clear filename1; clear filename2; clear pathname1; clear pathname2; + + input_pops = questdlg(['When using data which are in FASTA-format, '... + 'you can specify the sampling populations of the individuals by '... + 'giving two additional files: one containing the names of the '... + 'populations, the other containing the indices of the first '... + 'individuals of the populations. Do you wish to specify the '... + 'sampling populations?'], ... + 'Specify sampling populations?',... + 'Yes', 'No', 'No'); +% input_pops = 'No'; + if isequal(input_pops,'Yes') + %waitALittle; + setWindowOnTop(base,'false') + [namefile, namepath] = uigetfile('*.txt', 'Load population names'); + if namefile==0 + kysyToinen = 0; + else + kysyToinen = 1; + end + if kysyToinen==1 + %waitALittle; + setWindowOnTop(base,'false') + [indicesfile, indicespath] = uigetfile('*.txt', 'Load population indices'); + if indicesfile==0 + popnames = []; + else + popnames = initPopNames([namepath namefile],[indicespath indicesfile]); + end + else + popnames = []; + end + else + popnames = []; + end + + disp('Pre-processing the data. This may take several minutes.'); + +% [data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(data); +% [Z,dist] = newGetDistances(data,rowsFromInd); + [cliques, separators, vorPoints, vorCells, pointers] = ... + handleCoords(coordinates); + + cc.locCliques = cliques; + cc.locSeparators = separators; + cc.popnames = popnames; + cc.vorPoints = vorPoints; + cc.vorCells = vorCells; + cc.pointers = pointers; + cc.coordinates = coordinates; + format_type = 'FASTA'; + + save_preproc = questdlg('Do you wish to save pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + %waitALittle; + [filename, pathname] = uiputfile('*.mat','Save pre-processed data as'); + kokonimi = [pathname filename]; + save(kokonimi,'cc','dist','Z','format_type','-v7.3'); % added by Lu Cheng, 08.06.2012 + end; + + handleIndiFastaCase(cc,dist,Z); + + return; +%%%%%%%%%%%%%add by Lu Cheng 11.11.2012 END %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +case 'GenePop-format' + %%waitALittle; + setWindowOnTop(base,'false') + [filename1, pathname1] = uigetfile('*.txt', 'Load data in GenePop-format'); + if filename1==0 + return; + end + + if ~isempty(partitionCompare) + fprintf(1,'Data: %s\n',[pathname filename]); + end + + kunnossa = testaaGenePopData([pathname1 filename1]); + if kunnossa==0 + return + end + [data,popnames]=lueGenePopData([pathname1 filename1]); + + %%waitALittle; + setWindowOnTop(base,'false') + [filename2,pathname2]=uigetfile('*.txt', 'Load individual coordinates'); + if filename2==0 + return + end + + ninds = max(data(:,end)); + coordinates = load([pathname2 filename2]); + %viallinen = testaaKoordinaatit(ninds, coordinates); + [viallinen coordinates] = testaaKoordinaatit(ninds, coordinates); % added by Lu Cheng, 05.12.2012 + + if viallinen + disp('Incorrect coordinates'); + return + end + + inp = [filename1 ' & ' filename2]; + h0 = findobj('Tag','filename1_text'); + set(h0,'String',inp); + clear h0; clear inp; + clear filename1; clear filename2; clear pathname1; clear pathname2; + + disp('Pre-processing the data. This may take several minutes.'); + + [cliques, separators, vorPoints, vorCells, pointers] = ... + handleCoords(coordinates); + [data, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = handleData(data); + [Z,dist] = newGetDistances(data,rowsFromInd); + + save_preproc = questdlg('Do you wish to save pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + %%waitALittle; + [filename, pathname] = uiputfile('*.mat','Save pre-processed data as'); + kokonimi = [pathname filename]; + c.data = data; c.rowsFromInd = rowsFromInd; c.alleleCodes = alleleCodes; + c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; + c.dist = dist; c.popnames = popnames; c.Z = Z; + c.cliques = cliques; c.separators = separators; + c.vorPoints = vorPoints; c.vorCells = vorCells; + c.pointers = pointers; c.coordinates = coordinates; +% save(kokonimi,'c'); + save(kokonimi,'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + clear c; + end; + +case 'Preprocessed data' + %%waitALittle; +% setWindowOnTop(base,'false') + [filename, pathname] = uigetfile('*.mat', 'Load pre-processed data'); + if filename==0 + return; + end + display('---------------------------------------------------'); + display(['Reading preprocessed spatial data from: ',[pathname filename],'...']); + h0 = findobj('Tag','filename1_text'); + set(h0,'String',filename); clear h0; + + struct_array = load([pathname filename]); + + if isfield(struct_array,'format_type') && strcmp(struct_array.format_type,'FASTA') + handleIndiFastaCase(struct_array.cc,struct_array.dist,struct_array.Z); + return; + end + + if isfield(struct_array,'c') %Matlab versio + c = struct_array.c; + if ~isfield(c,'dist') + disp('Incorrect file format'); + return + end + elseif isfield(struct_array,'dist') %Mideva versio + c = struct_array; + else + disp('Incorrect file format'); + return; + end + data = double(c.data); rowsFromInd = c.rowsFromInd; alleleCodes = c.alleleCodes; + noalle = c.noalle; adjprior = c.adjprior; priorTerm = c.priorTerm; + dist = c.dist; popnames = c.popnames; Z = c.Z; + if isfield(c, 'cliques') + cliques = c.cliques; separators = c.separators; + vorPoints = c.vorPoints; vorCells = c.vorCells; + pointers = c.pointers; coordinates = c.coordinates; + clear c; + else + load_coord = questdlg(['The data file did not contain ',... + 'coordinate information. Do you wish to load coordinates?'], ... + 'Load coordinates?',... + 'Yes','No','Yes'); + if isequal(load_coord, 'No') + return + end + %%waitALittle; + setWindowOnTop(base,'false') + [filename2,pathname2]=uigetfile('*.txt', 'Load individual coordinates'); + if filename2==0 + return + end + + ninds = max(data(:,end)); + coordinates = load([pathname2 filename2]); + %viallinen = testaaKoordinaatit(ninds, coordinates); + [viallinen coordinates] = testaaKoordinaatit(ninds, coordinates); % added by Lu Cheng, 05.12.2012 + + if viallinen + disp('Incorrect coordinates'); + return + end + inp = [filename ' & ' filename2]; + h0 = findobj('Tag','filename1_text'); + set(h0,'String',inp); + clear h0; clear inp; + clear filename; clear filename2; clear pathname; clear pathname2; + + disp('Pre-processing the data. This may take several minutes.'); + [cliques, separators, vorPoints, vorCells, pointers] = ... + handleCoords(coordinates); + save_preproc = questdlg('Do you wish to save pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + %%waitALittle; + [filename, pathname] = uiputfile('*.mat','Save pre-processed data as'); + kokonimi = [pathname filename]; + c.cliques = cliques; c.separators = separators; + c.vorPoints = vorPoints; c.vorCells = vorCells; + c.pointers = pointers; c.coordinates = coordinates; +% save(kokonimi,'c'); + save(kokonimi,'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + clear c; + end; + end + + otherwise + return; +end + +global PARTITION; global COUNTS; +global SUMCOUNTS; +global SEPCOUNTS; global CLIQCOUNTS; +clearGlobalVars; + +c.data=data; c.alleleCodes = alleleCodes; +c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; +c.dist=dist; c.Z=Z; c.rowsFromInd = rowsFromInd; +c.cliques = cliques; c.separators = separators; + +ninds = length(unique(data(:,end))); +ekat = (1:rowsFromInd:ninds*rowsFromInd)'; +c.rows = [ekat ekat+rowsFromInd-1]; + +% partition compare +if ~isempty(partitionCompare) + nsamplingunits = size(c.rows,1); + data = data(:,1:end-1); + partitions = partitionCompare.partitions; + npartitions = size(partitions,2); + partitionLogml = zeros(1,npartitions); + for i = 1:npartitions + % number of unique partition lables + npops = length(unique(partitions(:,i))); + try + partitionInd = zeros(ninds*rowsFromInd,1); + partitionSample = partitions(:,i); + for j = 1:nsamplingunits + partitionInd([c.rows(j,1):c.rows(j,2)]) = partitionSample(j); + end + [sumcounts, counts] = initialCounts(partitionInd, data, npops, c.rows, noalle, adjprior); + [cliqcounts, sepcounts] = computeCounts(cliques, separators, npops, partitionSample); + partitionLogml(i) = ... + computeLogml(adjprior, priorTerm, cliqcounts, sepcounts, counts, sumcounts); + catch + disp('*** ERROR: unmatched data.'); + return + end + end + % return the logml result + partitionCompare.logmls = partitionLogml; + set(h1, 'userdata', partitionCompare); + return +end + +if fixedK + [logml, npops, partitionSummary]=spatialMix_fixK(c); +else + [logml, npops, partitionSummary]=spatialMix(c); +end + +if logml==1 + return +end + +data = data(:,1:end-1); + +h0 = findobj('Tag','filename1_text'); inp = get(h0,'String'); +h0 = findobj('Tag','filename2_text'); +outp = get(h0,'String'); +[varmuus,changesInLogml] = writeMixtureInfo(logml, rowsFromInd, data, adjprior, priorTerm, ... + outp,inp,partitionSummary, popnames, cliques, separators, fixedK); + + + +viewMixPartition(PARTITION, popnames); + +if isequal(popnames, []) + names = pointers; +else + names = cell(size(pointers)); + indices = zeros(size(popnames(:,2))); + for i=1:length(popnames(:,2)); + indices(i) = popnames{i,2}; + end + for i = 1:length(pointers) + inds = pointers{i}; + namesInCell = []; + for j = 1:length(inds) + ind = inds(j); + I = find(indices > ind); + if isempty(I) + nameIndex = length(indices); + else + nameIndex = min(I) -1; + end + name = popnames{nameIndex}; + namesInCell = [namesInCell name]; + end + names{i} = namesInCell; + end +end +vorPlot(vorPoints, vorCells, PARTITION, pointers, coordinates, names); + +%varmuus = 1 - laskeVarmuus(rowsFromInd, data, adjprior, priorTerm, ... +% logml, cliques, separators, ninds); + +%plotVarmuus(vorPoints, vorCells, pointers, varmuus, coordinates, ... +% PARTITION, names); + +talle = questdlg(['Do you want to save the mixture populations ' ... + 'so that you can use them later in admixture analysis or plot ' ... + 'additional images?'], ... + 'Save results?','Yes','No','Yes'); +if isequal(talle,'Yes') + %%waitALittle; % Hetki odotusta, jotta muistaa kysy?.. + [filename, pathname] = uiputfile('*.mat','Save results as'); + + if (filename == 0) & (pathname == 0) + % Cancel was pressed + return + else % copy 'baps4_output.baps' into the text file with the same name. + if exist(OUTPUT_FILE,'file') + copyfile(OUTPUT_FILE,[pathname filename '.txt']) + delete(OUTPUT_FILE) + end + end + + c.PARTITION = PARTITION; c.COUNTS = COUNTS; c.SUMCOUNTS = SUMCOUNTS; + c.alleleCodes = alleleCodes; c.adjprior = adjprior; c.popnames = popnames; + c.rowsFromInd = rowsFromInd; c.data = data; c.npops = npops; + c.noalle = noalle; c.mixtureType = 'spatial'; + c.pointers = pointers; c.vorPoints = vorPoints; c.vorCells = vorCells; + c.coordinates = coordinates; c.varmuus = varmuus; c.names = names; + c.changesInLogml = changesInLogml; % added by jing - 22.11.2006 + c.logml = logml; + + % added by Lu Cheng, 05.12.2012 + tmpFile = [pathname filename '.mapfile.txt']; + fid = fopen(tmpFile,'w+'); + fprintf(fid,'Name\tLatitude\tLongitude\tDescription\tLabel\n'); + if exist('heds','var') + for i=1:length(heds) + fprintf(fid,'%s\t%.10f\t%.10f\t%s_%d\t%d\n',heds{i},coordinates(i,1),coordinates(i,2),... + heds{i},PARTITION(i),PARTITION(i)); + end + else + for i=1:ninds + fprintf(fid,'%d\t%.10f\t%.10f\t%d_%d\t%d\n',i,coordinates(i,1),coordinates(i,2),... + i,PARTITION(i),PARTITION(i)); + end + end + fclose(fid); + +% save([pathname filename], 'c'); + save([pathname filename], 'c','-v7.3'); % added by Lu Cheng, 08.06.2012 +else + if exist(OUTPUT_FILE,'file') + delete(OUTPUT_FILE) + end +end + +%-------------------------------------------------------------------------- +function clearGlobalVars + +global COUNTS; COUNTS = []; +global SUMCOUNTS; SUMCOUNTS = []; + +global SEPCOUNTS; SEPCOUNTS = []; +global CLIQCOUNTS; CLIQCOUNTS = []; + + +%------------------------------------------------------------------------------------- + + +function rows = computeRows(rowsFromInd, inds, ninds) +% On annettu yksilöt inds. Funktio palauttaa vektorin, joka +% sisältää niiden rivien numerot, jotka sisältävät yksilöiden +% dataa. + +rows = inds(:, ones(1,rowsFromInd)); +rows = rows*rowsFromInd; +miinus = repmat(rowsFromInd-1 : -1 : 0, [ninds 1]); +rows = rows - miinus; +rows = reshape(rows', [1,rowsFromInd*ninds]); + + +%-------------------------------------------------------------------------- + + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex) +% Tiedetään, ett?annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ss?ei viel?ole +% annettua logml arvoa, niin lisätään worstIndex:in kohtaan uusi logml ja +% nykyist?partitiota vastaava nclusters:in arvo. Muutoin ei tehd?mitään. + +apu = find(abs(partitionSummary(:,2)-logml)<1e-5); +if isempty(apu) + % Nyt löydetty partitio ei ole viel?kirjattuna summaryyn. + global PARTITION; + npops = length(unique(PARTITION)); + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end + + +%-------------------------------------------------------------------------- + + +function [suurin, i2] = arvoSeuraavaTila(muutokset, logml) +% Suorittaa yksilön seuraavan tilan arvonnan + +y = logml + muutokset; % siirron jälkeiset logml:t +y = y - max(y); +y = exp(y); +summa = sum(y); +y = y/summa; +y = cumsum(y); + +i2 = rand_disc(y); % uusi kori +suurin = muutokset(i2); + + +%-------------------------------------------------------------------------------------- + + +function svar=rand_disc(CDF) +%returns an index of a value from a discrete distribution using inversion method +slump=rand; +har=find(CDF>slump); +svar=har(1); + + +%------------------------------------------------------------------------------------- + + +function updateGlobalVariables(ind, i2, rowsFromInd, diffInCounts, ... + cliques, separators, adjprior, priorTerm) +% Suorittaa globaalien muuttujien muutokset, kun yksil?ind +% siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global POP_LOGML; +global CLIQCOUNTS; +global SEPCOUNTS; + +i1 = PARTITION(ind); +PARTITION(ind)=i2; + +diffInCliqCounts = computeDiffInCliqCounts(cliques, ind); +diffInSepCounts = computeDiffInCliqCounts(separators, ind); + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; +CLIQCOUNTS(:,i2) = CLIQCOUNTS(:,i2) + diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; +SEPCOUNTS(:,i2) = SEPCOUNTS(:,i2) + diffInSepCounts; + +%POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + + +%--------------------------------------------------------------------------------- + + +function updateGlobalVariables2(i1, i2, rowsFromInd, diffInCounts, ... + cliques, separators, adjprior, priorTerm); +% Suorittaa globaalien muuttujien muutokset, kun kaikki +% korissa i1 olevat yksilöt siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +%global POP_LOGML; +global CLIQCOUNTS; +global SEPCOUNTS; + +inds = find(PARTITION==i1); +PARTITION(inds) = i2; + +diffInCliqCounts = CLIQCOUNTS(:,i1); +diffInSepCounts = SEPCOUNTS(:,i1); + + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +CLIQCOUNTS(:,i1) = 0; +CLIQCOUNTS(:,i2) = CLIQCOUNTS(:,i2) + diffInCliqCounts; +SEPCOUNTS(:,i1) = 0; +SEPCOUNTS(:,i2) = SEPCOUNTS(:,i2) + diffInSepCounts; + + +%------------------------------------------------------------------------------------ + + +function updateGlobalVariables3(muuttuvat, rowsFromInd, diffInCounts, ... + adjprior, priorTerm, i2, cliques, separators); +% Suorittaa globaalien muuttujien päivitykset, kun yksilöt 'muuttuvat' +% siirretään koriin i2. Ennen siirtoa yksilöiden on kuuluttava samaan +% koriin. + +global PARTITION; +global COUNTS; global CLIQCOUNTS; +global SUMCOUNTS; global SEPCOUNTS; +%global POP_LOGML; + +i1 = PARTITION(muuttuvat(1)); +PARTITION(muuttuvat) = i2; + +diffInCliqCounts = computeDiffInCliqCounts(cliques, muuttuvat); +diffInSepCounts = computeDiffInCliqCounts(separators, muuttuvat); + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; +CLIQCOUNTS(:,i2) = CLIQCOUNTS(:,i2) + diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; +SEPCOUNTS(:,i2) = SEPCOUNTS(:,i2) + diffInSepCounts; + + +%POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + + +%---------------------------------------------------------------------- + + +function inds = returnInOrder(inds, pop, rowsFromInd, data, adjprior, priorTerm) +% Palauttaa yksilöt järjestyksess?siten, ett?ensimmäisen?on +% se, jonka poistaminen populaatiosta pop nostaisi logml:n +% arvoa eniten. + +global COUNTS; global SUMCOUNTS; +ninds = length(inds); +apuTaulu = [inds, zeros(ninds,1)]; + +for i=1:ninds + ind = inds(i); + rows = (ind-1)*rowsFromInd+1 : ind*rowsFromInd; + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,pop) = COUNTS(:,:,pop)-diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)-diffInSumCounts; + apuTaulu(i, 2) = computePopulationLogml(pop, adjprior, priorTerm); + COUNTS(:,:,pop) = COUNTS(:,:,pop)+diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)+diffInSumCounts; +end +apuTaulu = sortrows(apuTaulu,2); +inds = apuTaulu(ninds:-1:1,1); + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = laskeMuutokset(ind, rowsFromInd, ... + data, adjprior, priorTerm, logml, cliques, separators) +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli yksil?ind siirretään koriin i. +% diffInCounts on poistettava COUNTS:in siivusta i1 ja lisättäv? +% COUNTS:in siivuun i2, mikäli muutos toteutetaan. + + +global COUNTS; global SUMCOUNTS; +global PARTITION; %global POP_LOGML; +global CLIQCOUNTS; global SEPCOUNTS; + +npops = size(COUNTS,3); +muutokset = zeros(npops,1); + +[emptyPop, pops] = findEmptyPop(npops); + +i1 = PARTITION(ind); +i2 = [pops(find(pops~=i1))]; +if emptyPop > 0 + i2 =[i2 emptyPop]; +end + +rows = (ind-1)*rowsFromInd+1 : ind*rowsFromInd; +diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); + +diffInCliqCounts = computeDiffInCliqCounts(cliques, ind); +diffInSepCounts = computeDiffInCliqCounts(separators, ind); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; + +for i=i2 + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) + diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) + diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) + diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) + diffInSumCounts; + + muutokset(i) = computeLogml(adjprior, priorTerm) - logml; + + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) - diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) - diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) - diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) - diffInSumCounts; +end + +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) + diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) + diffInSepCounts; + +% Asetetaan muillekin tyhjille populaatioille sama muutos, kuin +% emptyPop:lle + +if emptyPop > 0 + empties = mysetdiff((1:npops), [i2 i1]); + muutokset(empties) = muutokset(emptyPop); +end + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = laskeMuutokset2(i1, rowsFromInd, ... + data, adjprior, priorTerm, logml, cliques, separators); +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli korin i1 kaikki yksilöt siirretään +% koriin i. +% Laskee muutokset vain yhdelle tyhjälle populaatiolle, muille tulee +% muutokseksi 0. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +global CLIQCOUNTS; global SEPCOUNTS; + +npops = size(COUNTS,3); +muutokset = zeros(npops,1); + +[emptyPop, pops] = findEmptyPop(npops); + +i2 = [pops(find(pops~=i1))]; +if emptyPop > 0 + i2 =[i2 emptyPop]; +end + +inds = find(PARTITION == i1); +rows = computeRows(rowsFromInd, inds, length(inds)); + +diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); +diffInCliqCounts = computeDiffInCliqCounts(cliques, inds); +diffInSepCounts = computeDiffInCliqCounts(separators, inds); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +CLIQCOUNTS(:,i1) = 0; +SEPCOUNTS(:,i1) = 0; + +for i=i2 + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) + diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) + diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) + diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) + diffInSumCounts; + + muutokset(i) = computeLogml(adjprior, priorTerm) - logml; + + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) - diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) - diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) - diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) - diffInSumCounts; +end + +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; +CLIQCOUNTS(:,i1) = diffInCliqCounts; +SEPCOUNTS(:,i1) = diffInSepCounts; + + + +%------------------------------------------------------------------------------------ + +function muutokset = laskeMuutokset3(T2, inds2, rowsFromInd, ... + data, adjprior, priorTerm, i1, logml, cliques, separators) +% Palauttaa length(unique(T2))*npops taulun, jossa (i,j):s alkio +% kertoo, mik?olisi muutos logml:ss? jos populaation i1 osapopulaatio +% inds2(find(T2==i)) siirretään koriin j. +% Laskee vain yhden tyhjän populaation, muita kohden muutokseksi jää 0. + + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +global CLIQCOUNTS; global SEPCOUNTS; + +npops = size(COUNTS,3); +npops2 = length(unique(T2)); +muutokset = zeros(npops2, npops); + +for pop2 = 1:npops2 + inds = inds2(find(T2==pop2)); + ninds = length(inds); + if ninds>0 + rows = computeRows(rowsFromInd, inds, ninds); + + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + diffInCliqCounts = computeDiffInCliqCounts(cliques, inds); + diffInSepCounts = computeDiffInCliqCounts(separators, inds); + + COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; + CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; + SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; + + [emptyPop, pops] = findEmptyPop(npops); + i2 = [pops(find(pops~=i1))]; + if emptyPop > 0 + i2 =[i2 emptyPop]; + end + + for i = i2 + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) + diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) + diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) + diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) + diffInSumCounts; + + muutokset(pop2,i) = computeLogml(adjprior, priorTerm) - logml; + + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) - diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) - diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) - diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) - diffInSumCounts; + end + + COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) + diffInCliqCounts; + SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) + diffInSepCounts; + end +end + +%-------------------------------------------------------------------------- + +function muutokset = laskeMuutokset5(inds, rowsFromInd, data, adjprior, ... + priorTerm, logml, cliques, separators, i1, i2) + +% Palauttaa length(inds)*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli yksil?i vaihtaisi koria i1:n ja i2:n välill? + +global COUNTS; global SUMCOUNTS; +global PARTITION; +global CLIQCOUNTS; global SEPCOUNTS; + +ninds = length(inds); +muutokset = zeros(ninds,1); +cliqsize = size(CLIQCOUNTS,2); +sepsize = size(SEPCOUNTS, 2); + +for i = 1:ninds + ind = inds(i); + if PARTITION(ind)==i1 + pop1 = i1; %mist? + pop2 = i2; %mihin + else + pop1 = i2; + pop2 = i1; + end + rows = (ind-1)*rowsFromInd+1 : ind*rowsFromInd; + + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + diffInCliqCounts = computeDiffInCliqCounts(cliques, ind); + diffInSepCounts = computeDiffInCliqCounts(separators, ind); + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)-diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)-diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)+diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)+diffInSumCounts; + + CLIQCOUNTS(:,pop1) = CLIQCOUNTS(:,pop1) - diffInCliqCounts; + CLIQCOUNTS(:,pop2) = CLIQCOUNTS(:,pop2) + diffInCliqCounts; + SEPCOUNTS(:,pop1) = SEPCOUNTS(:,pop1) - diffInSepCounts; + SEPCOUNTS(:,pop2) = SEPCOUNTS(:,pop2) + diffInSepCounts; + + muutokset(i) = computeLogml(adjprior, priorTerm) - logml; + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)+diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)+diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)-diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)-diffInSumCounts; + + CLIQCOUNTS(:,pop1) = CLIQCOUNTS(:,pop1) + diffInCliqCounts; + CLIQCOUNTS(:,pop2) = CLIQCOUNTS(:,pop2) - diffInCliqCounts; + SEPCOUNTS(:,pop1) = SEPCOUNTS(:,pop1) + diffInSepCounts; + SEPCOUNTS(:,pop2) = SEPCOUNTS(:,pop2) - diffInSepCounts; + +end + +%-------------------------------------------------------------------------- + +function diffInCounts = computeDiffInCounts(rows, max_noalle, nloci, data) +% Muodostaa max_noalle*nloci taulukon, jossa on niiden alleelien +% lukumäärät (vastaavasti kuin COUNTS:issa), jotka ovat data:n +% riveill?rows. + +diffInCounts = zeros(max_noalle, nloci); +for i=rows + row = data(i,:); + notEmpty = find(row>=0); + + if length(notEmpty)>0 + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) = ... + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) + 1; + end +end + + + +%------------------------------------------------------------------------------------ + + +function popLogml = computePopulationLogml(pops, adjprior, priorTerm) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + +global COUNTS; +global SUMCOUNTS; +x = size(COUNTS,1); +y = size(COUNTS,2); +z = length(pops); + +popLogml = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior,[1 1 length(pops)]) + COUNTS(:,:,pops)) ... + ,[x y z]),1),2)) - sum(gammaln(1+SUMCOUNTS(pops,:)),2) - priorTerm; + +%------------------------------------------------------------------------------------ + +function npops = poistaTyhjatPopulaatiot(npops) +% Poistaa tyhjentyneet populaatiot COUNTS:ista ja +% SUMCOUNTS:ista. Päivittää npops:in ja PARTITION:in. + +global COUNTS; +global SUMCOUNTS; +global PARTITION; +global CLIQCOUNTS; +global SEPCOUNTS; + +notEmpty = find(any(SUMCOUNTS,2)); +COUNTS = COUNTS(:,:,notEmpty); +SUMCOUNTS = SUMCOUNTS(notEmpty,:); +CLIQCOUNTS = CLIQCOUNTS(:,notEmpty); +SEPCOUNTS = SEPCOUNTS(:,notEmpty); + +for n=1:length(notEmpty) + apu = find(PARTITION==notEmpty(n)); + PARTITION(apu)=n; +end +npops = length(notEmpty); + + +%---------------------------------------------------------------------------------- +%Seuraavat kolme funktiota liittyvat alkupartition muodostamiseen. + +function initial_partition=admixture_initialization(data_matrix,nclusters,Z) +size_data=size(data_matrix); +nloci=size_data(2)-1; +n=max(data_matrix(:,end)); +T=cluster_own(Z,nclusters); +initial_partition=zeros(size_data(1),1); +for i=1:n + kori=T(i); + here=find(data_matrix(:,end)==i); + for j=1:length(here) + initial_partition(here(j),1)=kori; + end +end + +function T = cluster_own(Z,nclust) +true=logical(1); +false=logical(0); +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); + % maximum number of clusters based on inconsistency + if m <= maxclust + T = (1:m)'; + elseif maxclust==1 + T = ones(m,1); + else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end + end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + + +%--------------------------------------------------------------------------------------- + + +function [newData, rowsFromInd, alleleCodes, noalle, adjprior, priorTerm] = ... + handleData(raw_data) +% Alkuperäisen datan viimeinen sarake kertoo, milt?yksilölt? +% kyseinen rivi on peräisin. Funktio tutkii ensin, ett?montako +% rivi?maksimissaan on peräisin yhdelt?yksilölt? jolloin saadaan +% tietää onko kyseess?haploidi, diploidi jne... Tämän jälkeen funktio +% lisää tyhji?rivej?niille yksilöille, joilta on peräisin vähemmän +% rivej?kuin maksimimäär? +% Mikäli jonkin alleelin koodi on =0, funktio muuttaa tämän alleelin +% koodi pienimmäksi koodiksi, joka isompi kuin mikään käytöss?oleva koodi. +% Tämän jälkeen funktio muuttaa alleelikoodit siten, ett?yhden lokuksen j +% koodit saavat arvoja välill?1,...,noalle(j). +data = raw_data; +nloci=size(raw_data,2)-1; + +dataApu = data(:,1:nloci); +nollat = find(dataApu==0); +if ~isempty(nollat) + isoinAlleeli = max(max(dataApu)); + dataApu(nollat) = isoinAlleeli+1; + data(:,1:nloci) = dataApu; +end +dataApu = []; nollat = []; isoinAlleeli = []; + +noalle=zeros(1,nloci); +alleelitLokuksessa = cell(nloci,1); +for i=1:nloci + alleelitLokuksessaI = unique(data(:,i)); + alleelitLokuksessa{i,1} = alleelitLokuksessaI(find(alleelitLokuksessaI>=0)); + noalle(i) = length(alleelitLokuksessa{i,1}); +end +alleleCodes = zeros(max(noalle),nloci); +for i=1:nloci + alleelitLokuksessaI = alleelitLokuksessa{i,1}; + puuttuvia = max(noalle)-length(alleelitLokuksessaI); + alleleCodes(:,i) = [alleelitLokuksessaI; zeros(puuttuvia,1)]; +end + +for loc = 1:nloci + for all = 1:noalle(loc) + data(find(data(:,loc)==alleleCodes(all,loc)), loc)=all; + end; +end; + +nind = max(data(:,end)); +nrows = size(data,1); +ncols = size(data,2); +rowsFromInd = zeros(nind,1); +for i=1:nind + rowsFromInd(i) = length(find(data(:,end)==i)); +end +maxRowsFromInd = max(rowsFromInd); +a = -999; +emptyRow = repmat(a, 1, ncols); +lessThanMax = find(rowsFromInd < maxRowsFromInd); +missingRows = maxRowsFromInd*nind - nrows; +data = [data; zeros(missingRows, ncols)]; +pointer = 1; +for ind=lessThanMax' %Käy läpi ne yksilöt, joilta puuttuu rivej? + miss = maxRowsFromInd-rowsFromInd(ind); % Tält?yksilölt?puuttuvien lkm. + for j=1:miss + rowToBeAdded = emptyRow; + rowToBeAdded(end) = ind; + data(nrows+pointer, :) = rowToBeAdded; + pointer = pointer+1; + end +end +data = sortrows(data, ncols); % Sorttaa yksilöiden mukaisesti +newData = data; +rowsFromInd = maxRowsFromInd; + +adjprior = zeros(max(noalle),nloci); +priorTerm = 0; +for j=1:nloci + adjprior(:,j) = [repmat(1/noalle(j), [noalle(j),1]) ; ones(max(noalle)-noalle(j),1)]; + priorTerm = priorTerm + noalle(j)*gammaln(1/noalle(j)); +end + + +%---------------------------------------------------------------------------------------- + + +function [Z, dist] = newGetDistances(data, rowsFromInd) + +ninds = max(data(:,end)); +nloci = size(data,2)-1; +riviLkm = nchoosek(ninds,2); + +empties = find(data<0); +data(empties)=0; +data = uint8(data); % max(noalle) oltava <256 + +pariTaulu = zeros(riviLkm,2); +aPointer=1; +for a=1:ninds-1 + pariTaulu(aPointer:aPointer+ninds-1-a,1) = ones(ninds-a,1)*a; + pariTaulu(aPointer:aPointer+ninds-1-a,2) = (a+1:ninds)'; + aPointer = aPointer+ninds-a; +end + +eka = pariTaulu(:,ones(1,rowsFromInd)); +eka = eka * rowsFromInd; +miinus = repmat(rowsFromInd-1 : -1 : 0, [riviLkm 1]); +eka = eka - miinus; + +toka = pariTaulu(:,ones(1,rowsFromInd)*2); +toka = toka * rowsFromInd; +toka = toka - miinus; + +%eka = uint16(eka); +%toka = uint16(toka); + +summa = zeros(riviLkm,1); +vertailuja = zeros(riviLkm,1); + +clear pariTaulu; clear miinus; + +x = zeros(size(eka)); x = uint8(x); +y = zeros(size(toka)); y = uint8(y); + +for j=1:nloci; + + for k=1:rowsFromInd + x(:,k) = data(eka(:,k),j); + y(:,k) = data(toka(:,k),j); + end + + for a=1:rowsFromInd + for b=1:rowsFromInd + vertailutNyt = double(x(:,a)>0 & y(:,b)>0); + vertailuja = vertailuja + vertailutNyt; + lisays = (x(:,a)~=y(:,b) & vertailutNyt); + summa = summa+double(lisays); + end + end +end + +clear x; clear y; clear vertailutNyt; +nollat = find(vertailuja==0); +dist = zeros(length(vertailuja),1); +dist(nollat) = 1; +muut = find(vertailuja>0); +dist(muut) = summa(muut)./vertailuja(muut); +clear summa; clear vertailuja; + +Z = linkage(dist'); + + +%---------------------------------------------------------------------------------------- + + +function [Z, distances]=getDistances(data_matrix,nclusters) + +%finds initial admixture clustering solution with nclusters clusters, uses simple mean Hamming distance +%gives partition in 8-bit format +%allocates all alleles of a single individual into the same basket +%data_matrix contains #Loci+1 columns, last column indicate whose alleles are placed in each row, +%i.e. ranges from 1 to #individuals. For diploids there are 2 rows per individual, for haploids only a single row +%missing values are indicated by zeros in the partition and by negative integers in the data_matrix. + +size_data=size(data_matrix); +nloci=size_data(2)-1; +n=max(data_matrix(:,end)); +distances=zeros(nchoosek(n,2),1); +pointer=1; +for i=1:n-1 + i_data=data_matrix(find(data_matrix(:,end)==i),1:nloci); + for j=i+1:n + d_ij=0; + j_data=data_matrix(find(data_matrix(:,end)==j),1:nloci); + vertailuja = 0; + for k=1:size(i_data,1) + for l=1:size(j_data,1) + here_i=find(i_data(k,:)>=0); + here_j=find(j_data(l,:)>=0); + here_joint=intersect(here_i,here_j); + vertailuja = vertailuja + length(here_joint); + d_ij = d_ij + length(find(i_data(k,here_joint)~=j_data(l,here_joint))); + end + end + d_ij = d_ij / vertailuja; + distances(pointer)=d_ij; + pointer=pointer+1; + end +end + +Z=linkage(distances'); + + + +%---------------------------------------------------------------------------------------- + + +function Z = linkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 | m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end + + +%----------------------------------------------------------------------------------- + + +function popnames = initPopNames(nameFile, indexFile) +%Palauttaa tyhjän, mikäli nimitiedosto ja indeksitiedosto +% eivät olleet yht?pitki? + +popnames = []; +indices = load(indexFile); + +fid = fopen(nameFile); +if fid == -1 + %File didn't exist + msgbox('Loading of the population names was unsuccessful', ... + 'Error', 'error'); + return; +end; +line = fgetl(fid); +counter = 1; +while (line ~= -1) & ~isempty(line) + names{counter} = line; + line = fgetl(fid); + counter = counter + 1; +end; +fclose(fid); + +if length(names) ~= length(indices) + disp('The number of population names must be equal to the number of '); + disp('entries in the file specifying indices of the first individuals of '); + disp('each population.'); + return; +end + +popnames = cell(length(names), 2); +for i = 1:length(names) + popnames{i,1} = names(i); + popnames{i,2} = indices(i); +end + + +%----------------------------------------------------------------------------------- +% Laskee arvot cliqcounts:lle ja sepcounts:lle + + +function [cliqcounts, sepcounts] = computeCounts(cliques, separators, npops, PARTITION) + + +ncliq = size(cliques,1); +nsep = size(separators,1); + +cliqPartition = zeros(ncliq, size(cliques,2)); +sepPartition = zeros(nsep, size(separators, 2)); + +apuCliq = find(cliques > 0); +apuSep = find(separators > 0); + +cliqPartition(apuCliq) = PARTITION(cliques(apuCliq)); +sepPartition(apuSep) = PARTITION(separators(apuSep)); + + +cliqcounts = zeros(ncliq, npops); +for i = 1:npops + cliqcounts(:,i) = sum(cliqPartition == i, 2); +end + + +sepcounts = zeros(nsep, npops); +for i = 1:npops + sepcounts(:,i) = sum(sepPartition == i, 2); +end + +%------------------------------------------------------------------------- + +function diffInCliqCounts = computeDiffInCliqCounts(cliques, inds) +% Laskee muutoksen CLIQCOUNTS:ssa (tai SEPCOUNTS:ssa, jos syötteen? +% separators) kun yksilöt inds siirretään. +% diffInCliqcounts on ncliq*1 taulu, joka on CLIQCOUNTS:n sarakkeesta josta +% yksilöt inds siirretään ja lisättäv?sarakkeeseen, johon yksilöt +% siirretään. + +ncliq = size(cliques,1); +diffInCliqCounts = zeros(ncliq,1); +ninds = length(inds); +for i = 1:ninds + ind = inds(i); + rivit = sum((cliques == ind),2); + diffInCliqCounts = diffInCliqCounts + rivit; +end + + +%----------------------------------------------------------------------- + +function [logml, spatialPrior] = computeLogml(adjprior,priorTerm, ... + CLIQCOUNTS, SEPCOUNTS, ... + COUNTS, SUMCOUNTS) + +notEmpty = any(CLIQCOUNTS); +npops = length(find(notEmpty == 1)); +sumcliq=sum(CLIQCOUNTS, 2); +sumsep=sum(SEPCOUNTS, 2); +ncliq = size(CLIQCOUNTS, 1); +nsep = size(SEPCOUNTS, 1); + +cliqsizes = sum(CLIQCOUNTS, 2)'; +sepsizes = sum(SEPCOUNTS, 2)'; +cliqsizes = min([cliqsizes; npops*ones(1,ncliq)])'; +sepsizes = min([sepsizes; npops*ones(1,nsep)])'; + +klikkitn = sum(sum(gammaln(CLIQCOUNTS(:,notEmpty) + repmat(1./cliqsizes, [1 npops])))) ... + - sum(npops*(gammaln(1./cliqsizes))) ... + - sum(gammaln(sumcliq + 1)); + +septn = sum(sum(gammaln(SEPCOUNTS(:,notEmpty) + repmat(1./sepsizes, [1 npops])))) ... + - sum(npops*(gammaln(1./sepsizes))) ... + - sum(gammaln(sumsep + 1)); + + +%klikkitn = sum(sum(gammaln(CLIQCOUNTS + 1/npops))) ... +% - ncliq*npops*(gammaln(1/npops)) ... +% - sum(gammaln(sumcliq + 1)); +%septn = sum(sum(gammaln(SEPCOUNTS + 1/npops))) ... +% - nsep*npops*(gammaln(1/npops)) ... +% - sum(gammaln(sumsep + 1)); + +spatialPrior = (klikkitn - septn); + +%if spatialPrior > 0 +% keyboard +%end + + +x = size(COUNTS,1); +y = size(COUNTS,2); +z = size(COUNTS,3); + +popLogml = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior,[1 1 z]) + COUNTS) ... + ,[x y z]),1),2)) - sum(gammaln(1+SUMCOUNTS),2) - priorTerm; + +logml = sum(popLogml) + spatialPrior; + +%-------------------------------------------------------------------------- + + +function initializeGammaln(ninds, rowsFromInd, maxSize) +%Alustaa GAMMALN muuttujan s.e. GAMMALN(i,j)=gammaln((i-1) + 1/j) +global GAMMA_LN; +GAMMA_LN = zeros((1+ninds)*rowsFromInd, maxSize); +for i=1:(ninds+1)*rowsFromInd + for j=1:maxSize + GAMMA_LN(i,j)=gammaln((i-1) + 1/j); + end +end + + +%---------------------------------------------------------------------------- + + +function dist2 = laskeOsaDist(inds2, dist, ninds) +% Muodostaa dist vektorista osavektorin, joka sisältää yksilöiden inds2 +% väliset etäisyydet. ninds=kaikkien yksilöiden lukumäär? + +ninds2 = length(inds2); +apu = zeros(nchoosek(ninds2,2),2); +rivi = 1; +for i=1:ninds2-1 + for j=i+1:ninds2 + apu(rivi, 1) = inds2(i); + apu(rivi, 2) = inds2(j); + rivi = rivi+1; + end +end +apu = (apu(:,1)-1).*ninds - apu(:,1) ./ 2 .* (apu(:,1)-1) + (apu(:,2)-apu(:,1)); +dist2 = dist(apu); + + +%---------------------------------------------------------------------------- + + + +function kunnossa = testaaGenePopData(tiedostonNimi) +% kunnossa == 0, jos data ei ole kelvollinen genePop data. +% Muussa tapauksessa kunnossa == 1. + +kunnossa = 0; +fid = fopen(tiedostonNimi); +line1 = fgetl(fid); %ensimmäinen rivi +line2 = fgetl(fid); %toinen rivi +line3 = fgetl(fid); %kolmas + +if (isequal(line1,-1) | isequal(line2,-1) | isequal(line3,-1)) + disp('Incorrect file format 1168'); fclose(fid); + return +end +if (testaaPop(line1)==1 | testaaPop(line2)==1) + disp('Incorrect file format 1172'); fclose(fid); + return +end +if testaaPop(line3)==1 + %2 rivi tällöin lokusrivi + nloci = rivinSisaltamienMjonojenLkm(line2); + line4 = fgetl(fid); + if isequal(line4,-1) + disp('Incorrect file format 1180'); fclose(fid); + return + end + if ~any(line4==',') + % Rivin nelj?täytyy sisältää pilkku. + disp('Incorrect file format 1185'); fclose(fid); + return + end + pointer = 1; + while ~isequal(line4(pointer),',') %Tiedetään, ett?pysähtyy + pointer = pointer+1; + end + line4 = line4(pointer+1:end); %pilkun jälkeinen osa + nloci2 = rivinSisaltamienMjonojenLkm(line4); + if (nloci2~=nloci) + disp('Incorrect file format 1195'); fclose(fid); + return + end +else + line = fgetl(fid); + lineNumb = 4; + while (testaaPop(line)~=1 & ~isequal(line,-1)) + line = fgetl(fid); + lineNumb = lineNumb+1; + end + if isequal(line,-1) + disp('Incorrect file format 1206'); fclose(fid); + return + end + nloci = lineNumb-2; + line4 = fgetl(fid); %Eka rivi pop sanan jälkeen + if isequal(line4,-1) + disp('Incorrect file format 1212'); fclose(fid); + return + end + if ~any(line4==',') + % Rivin täytyy sisältää pilkku. + disp('Incorrect file format 1217'); fclose(fid); + return + end + pointer = 1; + while ~isequal(line4(pointer),',') %Tiedetään, ett?pysähtyy. + pointer = pointer+1; + end + + line4 = line4(pointer+1:end); %pilkun jälkeinen osa + nloci2 = rivinSisaltamienMjonojenLkm(line4); + if (nloci2~=nloci) + disp('Incorrect file format 1228'); fclose(fid); + return + end +end +kunnossa = 1; +fclose(fid); + +%------------------------------------------------------ + + +function [data, popnames] = lueGenePopData(tiedostonNimi) + +fid = fopen(tiedostonNimi); +line = fgetl(fid); %ensimmäinen rivi +line = fgetl(fid); %toinen rivi +count = rivinSisaltamienMjonojenLkm(line); + +line = fgetl(fid); +lokusRiveja = 1; +while (testaaPop(line)==0) + lokusRiveja = lokusRiveja+1; + line = fgetl(fid); +end + +if lokusRiveja>1 + nloci = lokusRiveja; +else + nloci = count; +end + +popnames = cell(10,2); +data = zeros(100, nloci+1); +nimienLkm=0; +ninds=0; +poimiNimi=1; +digitFormat = -1; +while line ~= -1 + line = fgetl(fid); + + if poimiNimi==1 + %Edellinen rivi oli 'pop' + nimienLkm = nimienLkm+1; + ninds = ninds+1; + if nimienLkm>size(popnames,1); + popnames = [popnames; cell(10,2)]; + end + nimi = lueNimi(line); + if digitFormat == -1 + digitFormat = selvitaDigitFormat(line); + divider = 10^digitFormat; + end + popnames{nimienLkm, 1} = {nimi}; %Näin se on greedyMix:issäkin?!? + popnames{nimienLkm, 2} = ninds; + poimiNimi=0; + + data = addAlleles(data, ninds, line, divider); + + elseif testaaPop(line) + poimiNimi = 1; + + elseif line ~= -1 + ninds = ninds+1; + data = addAlleles(data, ninds, line, divider); + end +end + +data = data(1:ninds*2,:); +popnames = popnames(1:nimienLkm,:); +fclose(fid); + + +%------------------------------------------------------- + +function nimi = lueNimi(line) +%Palauttaa line:n alusta sen osan, joka on ennen pilkkua. +n = 1; +merkki = line(n); +nimi = ''; +while ~isequal(merkki,',') + nimi = [nimi merkki]; + n = n+1; + merkki = line(n); +end + +%------------------------------------------------------- + +function df = selvitaDigitFormat(line) +% line on ensimmäinen pop-sanan jälkeinen rivi +% Genepop-formaatissa olevasta datasta. funktio selvittää +% rivin muodon perusteella, ovatko datan alleelit annettu +% 2 vai 3 numeron avulla. + +n = 1; +merkki = line(n); +while ~isequal(merkki,',') + n = n+1; + merkki = line(n); +end + +while ~any(merkki == '0123456789'); + n = n+1; + merkki = line(n); +end +numeroja = 0; +while any(merkki == '0123456789'); + numeroja = numeroja+1; + n = n+1; + merkki = line(n); +end + +df = numeroja/2; + + +%------------------------------------------------------ + + +function count = rivinSisaltamienMjonojenLkm(line) +% Palauttaa line:n sisältämien mjonojen lukumäärän. +% Mjonojen väliss?täytyy olla välilyönti. +count = 0; +pit = length(line); +tila = 0; %0, jos odotetaan välilyöntej? 1 jos odotetaan muita merkkej? +for i=1:pit + merkki = line(i); + if (isspace(merkki) & tila==0) + %Ei tehd?mitään. + elseif (isspace(merkki) & tila==1) + tila = 0; + elseif (~isspace(merkki) & tila==0) + tila = 1; + count = count+1; + elseif (~isspace(merkki) & tila==1) + %Ei tehd?mitään + end +end + +%------------------------------------------------------- + +function pal = testaaPop(rivi) +% pal=1, mikäli rivi alkaa jollain seuraavista +% kirjainyhdistelmist? Pop, pop, POP. Kaikissa muissa +% tapauksissa pal=0. + +if length(rivi)<3 + pal = 0; + return +end +if (all(rivi(1:3)=='Pop') | ... + all(rivi(1:3)=='pop') | ... + all(rivi(1:3)=='POP')) + pal = 1; + return +else + pal = 0; + return +end + + +%-------------------------------------------------------- + + +function data = addAlleles(data, ind, line, divider) +% Lisaa BAPS-formaatissa olevaan datataulukkoon +% yksilöä ind vastaavat rivit. Yksilön alleelit +% luetaan genepop-formaatissa olevasta rivist? +% line. Jos data on 3 digit formaatissa on divider=1000. +% Jos data on 2 digit formaatissa on divider=100. + +nloci = size(data,2)-1; +if size(data,1) < 2*ind + data = [data; zeros(100,nloci+1)]; +end + +k=1; +merkki=line(k); +while ~isequal(merkki,',') + k=k+1; + merkki=line(k); +end +line = line(k+1:end); +clear k; clear merkki; + +alleeliTaulu = sscanf(line,'%d'); + +if length(alleeliTaulu)~=nloci + disp('Incorrect data format.'); +end + +for j=1:nloci + ekaAlleeli = floor(alleeliTaulu(j)/divider); + if ekaAlleeli==0 ekaAlleeli=-999; end; + tokaAlleeli = rem(alleeliTaulu(j),divider); + if tokaAlleeli==0 tokaAlleeli=-999; end + + data(2*ind-1,j) = ekaAlleeli; + data(2*ind,j) = tokaAlleeli; +end + +data(2*ind-1,end) = ind; +data(2*ind,end) = ind; + +%------------------------------------------------------------------- + +function [varmuus,changesInLogml] = writeMixtureInfo(logml, rowsFromInd, data, adjprior, ... + priorTerm, outPutFile, inputFile, partitionSummary, popnames, ... + cliques, separators, fixedK) + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global LOGDIFF; + +OUTPUT_FILE = 'baps6_output.baps'; + +ninds = size(data,1)/rowsFromInd; +npops = size(COUNTS,3); +names = (size(popnames,1) == ninds); %Tarkistetaan ett?nimet viittaavat yksilöihin + +if length(outPutFile)>0 + fid = fopen(outPutFile,'a'); +else + fid = -1; + diary(OUTPUT_FILE); % save in text anyway. +end + +dispLine; +disp('RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:'); +disp(['Data file: ' inputFile]); +disp(['Number of clustered individuals: ' ownNum2Str(ninds)]); +disp(['Number of groups in optimal partition: ' ownNum2Str(npops)]); +disp(['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); +disp(' '); +if (fid ~= -1) + fprintf(fid,'%s \n', ['RESULTS OF INDIVIDUAL LEVEL MIXTURE ANALYSIS:']); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Data file: ' inputFile]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of clustered individuals: ' ownNum2Str(ninds)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of groups in optimal partition: ' ownNum2Str(npops)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); fprintf(fid,'\n'); +end + +cluster_count = length(unique(PARTITION)); +disp(['Best Partition: ']); +if (fid ~= -1) + fprintf(fid,'%s \n',['Best Partition: ']); fprintf(fid,'\n'); +end +for m=1:cluster_count + indsInM = find(PARTITION==m); + length_of_beginning = 11 + floor(log10(m)); + cluster_size = length(indsInM); + + if names + text = ['Cluster ' num2str(m) ': {' char(popnames{indsInM(1)})]; + for k = 2:cluster_size + text = [text ', ' char(popnames{indsInM(k)})]; + end; + else + text = ['Cluster ' num2str(m) ': {' num2str(indsInM(1))]; + for k = 2:cluster_size + text = [text ', ' num2str(indsInM(k))]; + end; + end + text = [text '}']; + while length(text)>58 + %Take one line and display it. + new_line = takeLine(text,58); + text = text(length(new_line)+1:end); + disp(new_line); + if (fid ~= -1) + fprintf(fid,'%s \n',[new_line]); + fprintf(fid,'\n'); + end + if length(text)>0 + text = [blanks(length_of_beginning) text]; + else + text = []; + end; + end; + if ~isempty(text) + disp(text); + if (fid ~= -1) + fprintf(fid,'%s \n',[text]); + fprintf(fid,'\n'); + end + end +end + +disp(' '); +disp(' '); +disp('Changes in log(marginal likelihood) if indvidual i is moved to group j:'); +if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['Changes in log(marginal likelihood) if indvidual i is moved to group j:']); fprintf(fid, '\n'); +end + +if names + nameSizes = zeros(ninds,1); + for i = 1:ninds + nimi = char(popnames{i}); + nameSizes(i) = length(nimi); + end + maxSize = max(nameSizes); + maxSize = max(maxSize, 5); + erotus = maxSize - 5; + alku = blanks(erotus); + ekarivi = [alku ' ind' blanks(6+erotus)]; +else + ekarivi = ' ind '; +end + +for i = 1:cluster_count + ekarivi = [ekarivi ownNum2Str(i) blanks(8-floor(log10(i)))]; +end +disp(ekarivi); +if (fid ~= -1) + fprintf(fid, '%s \n', [ekarivi]); fprintf(fid, '\n'); +end + +ninds = size(data,1)/rowsFromInd; +varmuus = zeros(ninds,1); + +changesInLogml = LOGDIFF'; +for ind = 1:ninds + %[muutokset, diffInCounts] = laskeMuutokset(ind, rowsFromInd, data, ... + % adjprior, priorTerm, logml, cliques, separators); + %changesInLogml(:,ind) = muutokset; + muutokset = changesInLogml(:,ind); + + if sum(exp(muutokset))>0 + varmuus(ind) = 1 - 1/sum(exp(muutokset)); + else + varmuus(ind) = 0; + end + if names + nimi = char(popnames{ind}); + rivi = [blanks(maxSize - length(nimi)) nimi ':']; + else + rivi = [blanks(4-floor(log10(ind))) ownNum2Str(ind) ':']; + end + for j = 1:npops + rivi = [rivi ' ' logml2String(omaRound(muutokset(j)))]; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [rivi]); fprintf(fid, '\n'); + end +end + +disp(' '); disp(' '); +disp('KL-divergence matrix in PHYLIP format:'); +dist_mat = zeros(npops, npops); +if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['KL-divergence matrix in PHYLIP format:']); %fprintf(fid, '\n'); +end + +maxnoalle = size(COUNTS,1); +nloci = size(COUNTS,2); +d = zeros(maxnoalle, nloci, npops); +prior = adjprior; +prior(find(prior==1))=0; +nollia = find(all(prior==0)); %Lokukset, joissa oli havaittu vain yht?alleelia. +prior(1,nollia)=1; +for pop1 = 1:npops + d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))+prior) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))+prior),maxnoalle,1); + %dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1); +end +% ekarivi = blanks(7); +% for pop = 1:npops +% ekarivi = [ekarivi num2str(pop) blanks(7-floor(log10(pop)))]; +% end +ekarivi = num2str(npops); +disp(ekarivi); + +if (fid ~= -1) + fprintf(fid, '%s \n', [ekarivi]); %fprintf(fid, '\n'); +end + +for pop1 = 1:npops + rivi = [blanks(2-floor(log10(pop1))) num2str(pop1) ' ']; + for pop2 = 1:pop1-1 + dist1 = d(:,:,pop1); dist2 = d(:,:,pop2); + div12 = sum(sum(dist1.*log2((dist1+10^-10) ./ (dist2+10^-10))))/nloci; + div21 = sum(sum(dist2.*log2((dist2+10^-10) ./ (dist1+10^-10))))/nloci; + div = (div12+div21)/2; + % rivi = [rivi kldiv2str(div) ' ']; + dist_mat(pop1,pop2) = div; + end +% disp(rivi); +% if (fid ~= -1) +% fprintf(fid, '%s \n', [rivi]); fprintf(fid, '\n'); +% end +end + +dist_mat = dist_mat + dist_mat'; % make it symmetric +for pop1 = 1:npops + rivi = ['Cluster_' num2str(pop1) ' ']; + for pop2 = 1:npops + rivi = [rivi kldiv2str(dist_mat(pop1,pop2)) ' ']; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [rivi]); %fprintf(fid, '\n'); + end +end + +disp(' '); +disp(' '); +disp('List of sizes of 10 best visited partitions and corresponding log(ml) values'); + +if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['List of sizes of 10 best visited partitions and corresponding log(ml) values']); fprintf(fid, '\n'); +end + +partitionSummaryKaikki = partitionSummary; +partitionSummary =[]; +for i=1:size(partitionSummaryKaikki,3) + partitionSummary = [partitionSummary; partitionSummaryKaikki(:,:,i)]; +end +[I,J] = find(partitionSummaryKaikki(:,2,:)>-1e49); +partitionSummaryKaikki = partitionSummaryKaikki(I,:,:); + +partitionSummary = sortrows(partitionSummary,2); +partitionSummary = partitionSummary(size(partitionSummary,1):-1:1 , :); +partitionSummary = partitionSummary(find(partitionSummary(:,2)>-1e49),:); +if size(partitionSummary,1)>10 + vikaPartitio = 10; +else + vikaPartitio = size(partitionSummary,1); +end +for part = 1:vikaPartitio + line = [num2str(partitionSummary(part,1)) ' ' num2str(partitionSummary(part,2))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', [line]); fprintf(fid, '\n'); + end +end + +if ~fixedK + disp(' '); + disp(' '); + disp('Probabilities for number of clusters'); + + if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['Probabilities for number of clusters']); fprintf(fid, '\n'); + end + + npopsTaulu = unique(partitionSummary(:,1)); + len = length(npopsTaulu); + probs = zeros(len,1); + partitionSummary(:,2) = partitionSummary(:,2)-max(partitionSummary(:,2)); + sumtn = sum(exp(partitionSummary(:,2))); + for i=1:len + npopstn = sum(exp(partitionSummary(find(partitionSummary(:,1)==npopsTaulu(i)),2))); + probs(i) = npopstn / sumtn; + end + for i=1:len + if probs(i)>1e-5 + line = [num2str(npopsTaulu(i)) ' ' num2str(probs(i))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', [line]); fprintf(fid, '\n'); + end + end + end +end + +if (fid ~= -1) + fclose(fid); +else + diary off +end + +%--------------------------------------------------------------- + + +function dispLine; +disp('---------------------------------------------------'); + +%-------------------------------------------------------------------- + + +function newline = takeLine(description,width) +%Returns one line from the description: line ends to the first +%space after width:th mark. +newLine = description(1:width); +n = width+1; +while ~isspace(description(n)) & n=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); + + +function mjono = kldiv2str(div) +mjono = ' '; +if abs(div)<100 + %Ei tarvita e-muotoa + mjono(6) = num2str(rem(floor(div*1000),10)); + mjono(5) = num2str(rem(floor(div*100),10)); + mjono(4) = num2str(rem(floor(div*10),10)); + mjono(3) = '.'; + mjono(2) = num2str(rem(floor(div),10)); + arvo = rem(floor(div/10),10); + if arvo>0 + mjono(1) = num2str(arvo); + end + +else + suurinYks = floor(log10(div)); + mjono(6) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(div),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(div),suurinYks); +end + + +%-------------------------------------------------------------------------- + + +function ninds = testaaOnkoKunnollinenBapsData(data) +%Tarkastaa onko viimeisess?sarakkeessa kaikki +%luvut 1,2,...,n johonkin n:ään asti. +%Tarkastaa lisäksi, ett?on vähintään 2 saraketta. +if size(data,1)<2 + ninds = 0; return; +end +lastCol = data(:,end); +ninds = max(lastCol); +if ~isequal((1:ninds)',unique(lastCol)) + ninds = 0; return; +end + + +%-------------------------------------------------------------------------- + + +function [ninds, data, heds] = testFastaData(inFile) +% added by Lu Cheng, 11.11.2012 +if ~exist(inFile,'file') + error('Fasta file %s does not exist!\n',inFile); +end + +[heds, seqs]=fastaread(inFile); +ninds = length(seqs); + +data = cell2mat(seqs(:)); +newData = ones(size(data))*-9; +newData(ismember(data,'Aa'))=1; +newData(ismember(data,'Cc'))=2; +newData(ismember(data,'Gg'))=3; +newData(ismember(data,'Tt'))=4; +data = [newData (1:ninds)']; + + + +%-------------------------------------------------------------------------- + +function [cliques, separators, vorPoints, vorCells, pointers] = ... + handleCoords(coordinates) +%Laskee yksilöiden luonnolliset naapurit koordinaateista. +%Naapurit lasketaan lisäämäll?koordinaatteihin pisteit? +%jotta kutakin yksilöä vastaisi rajoitettu voronoi-solu +%Puuttuvat koordinaatit (0,0) tulevat erakkopisteiksi +% +%Määrittää lisäksi yksilöit?vastaavat voronoi tesselaation solut. +%vorPoints:ssa on solujen kulmapisteet ja vorCells:ss?kunkin solun +%kulmapisteiden indeksit. Pointers{i} sisältää solussa i olevien yksilöiden +%indeksit. + + + +ninds = length(coordinates); +[I,J] = find(coordinates>0 | coordinates <0); %Käsitellään vain yksilöit? joilta koordinaatit +I = unique(I); %olemassa +ncoords = length(I); +new_coordinates = addPoints(coordinates(I,:)); %Ympäröidään yksilöt apupisteill? + +apuData = [new_coordinates(1:ncoords,:) (1:ncoords)']; +apuData = sortrows(apuData,[1 2]); +erot = [diff(apuData(:,1)) diff(apuData(:,2))]; +empties = find(erot(:,1)==0 & erot(:,2)==0); +samat = cell(length(empties),1); +pointer = 0; + +for i = 1:length(empties) + if i == 1 | empties(i) - empties(i-1) > 1 %Tutkitaan onko eri pisteess?kuin edellinen + pointer = pointer+1; + samat{pointer} = [apuData(empties(i),3) apuData(empties(i)+1,3)]; + else + samat{pointer} = [samat{pointer} apuData(empties(i)+1,3)]; + end +end + +samat = samat(1:pointer); +erot = []; apuData = []; empties = []; + +%tri = delaunay(new_coordinates(:,1), new_coordinates(:,2), {'Qt','Qbb','Qc','Qz'}); %Apupisteiden takia ok. +tri = delaunay(new_coordinates(:,1), new_coordinates(:,2)); +%[rivi,sarake] = find(tri>ncoords); %Jätetään huomiotta apupisteet +%tri(rivi,:) = []; +pituus = tri(:,1); +pituus = length(pituus); +parit = zeros(6*pituus,2); +for i = 1:pituus %Muodostetaan kolmikoista parit + j = 6*(i-1)+1; + parit(j,:) = tri(i,1:2); + parit(j+1,:) = tri(i,1:2:3); + parit(j+2,:) = tri(i,2:3); + parit(j+3:j+5,:) = [parit(j:j+2,2) parit(j:j+2,1)]; +end +parit = unique(parit,'rows'); +[rivi,sarake] = find(parit>ncoords); %Jätetään huomiotta apupisteet +parit(rivi,:) = []; +parit = I(parit); %Otetaan poistetut takaisin mukaan +graph = sparse(parit(:,1),parit(:,2),1, ninds, ninds); + + +%Kopioidaan samassa pisteess?olevien yksilöiden naapurustot +%silt? jolle ne laitettu. + +for i = 1:length(samat); + taulu = I(samat{i}); + [rivi,sarake] = find(graph(taulu,:)>0); + if length(rivi) > 0 + kopioitava = graph(taulu(rivi(1)),:); + for j = 1:length(taulu); + graph(taulu(j),:) = kopioitava; + graph(:,taulu(j)) = kopioitava'; + end + end +end + +%Asetetaan samassa pisteess?olevat yksilöt toistensa naapureiksi + +for i = 1:length(samat) + for j = I(samat{i}) + for k = I(samat{i}) + if k ~= j + graph(j,k) = 1; + end + end + end +end + +%Laskee maksimin klikkien ja separaattorien koolle +%Määritetään myös klikit ja separaattorit + +[ncliq, nsep, cliq, sep] = laskeKlikit(graph, ninds, ninds); + +sumcliq = sum(ncliq); +sumsep = sum(nsep); +maxCliqSize = max(find(sumcliq > 0)); +maxSepSize = max(find(sumsep > 0)); + +cliques = zeros(length(cliq), maxCliqSize); +separators = zeros(length(sep), maxSepSize); + +nollia = zeros(1, length(cliq)); +for i = 1:length(cliq); + klikki = cliq{i}; + if length(klikki)>1 + cliques(i, 1:length(klikki)) = klikki; + else + nollia(i)=1; + end +end +cliques(find(nollia==1), :) = []; + + +for i = 1:length(sep); + klikki = sep{i}; + separators(i, 1:length(klikki)) = klikki; +end + + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%Määritetään yksilöit?vastaavat voronoi tesselaation solut + + + +[vorPoints, vorCells] = voronoin(new_coordinates, {'Qbb', 'Qz'}); + + +bounded = ones(length(vorCells),1); +for i=1:length(vorCells) + if (isempty(vorCells{i})) | (length(find(vorCells{i}==1))>0) + bounded(i)=0; + end +end + + + +vorCells = vorCells(bounded == 1); + +pointers = cell(length(vorCells),1); +empties = zeros(1,length(vorCells)); +X = coordinates(:,1); +Y = coordinates(:,2); + +for i=1:length(pointers) + vx = vorPoints(vorCells{i},1); + vy = vorPoints(vorCells{i},2); + IN = inpolygon(X,Y,vx,vy); + if any(IN)==0 + empties(i) = 1; + else + pointers{i} = find(IN ==1); + end +end + + +%figure +%hold on +% +%for i = 1:length(vorCells) +% vx = vorPoints(vorCells{i},1); +% vy = vorPoints(vorCells{i},2); +% k = convhull(vx,vy); +% if any(pointers{i}) +% patch(vx(k), vy(k),'y'); +% else +% plot(vx(k), vy(k)); +% end +%end + +%plot(coordinates(:,1), coordinates(:,2), 'r*'); +%plot(new_coordinates(ninds+1:end,1), new_coordinates(ninds+1:end,2), 'b+'); +%axis([-2 7 -2 8]); + +vorCells = vorCells(find(empties == 0)); +pointers = pointers(find(empties == 0)); + + + + +%-------------------------------------------------------------------------- + +function [ncliques, nseparators, cliques, separators] = ... + laskeKlikit(M, maxCliqSize,maxSepSize) +%Laskee samankokoisten klikkien määrän verkosta M +%ncliques(i)=kokoa i olevien klikkien määr? +%nseparators vastaavasti + +ncliques=zeros(1,maxCliqSize); +nseparators=zeros(1,maxSepSize); + +if isequal(M,[]) + return; +end + +[cliques,separators]=findCliques(M); + +for i=1:length(cliques) + ncliques(length(cliques{i}))=ncliques(length(cliques{i}))+1; +end + +%cliqmax=max(find(ncliques~=0)); +%ncliques=ncliques(1:cliqmax); + +for i=1:length(separators) + nseparators(length(separators{i}))=nseparators(length(separators{i}))+1; +end + +%sepmax=max(find(nseparators~=0)); +%nseparators=nseparators(1:sepmax); + +%-------------------------------------------------------------------------- + +function C = mysetdiff(A,B) +% MYSETDIFF Set difference of two sets of positive integers (much faster than built-in setdiff) +% C = mysetdiff(A,B) +% C = A \ B = { things in A that are not in B } +% +% Original by Kevin Murphy, modified by Leon Peshkin + +if isempty(A) + C = []; + return; +elseif isempty(B) + C = A; + return; +else % both non-empty + bits = zeros(1, max(max(A), max(B))); + bits(A) = 1; + bits(B) = 0; + C = A(logical(bits(A))); +end + + +%-------------------------------------------------------------------------- + +function logml = checkLogml(priorTerm, adjprior, cliques, separators) +% tarkistaa logml:n + +global CLIQCOUNTS; +global SEPCOUNTS; +global PARTITION; + +npops = length(unique(PARTITION)); +[cliqcounts, sepcounts] = computeCounts(cliques, separators, npops); + +CLIQCOUNTS = cliqcounts; +SEPCOUNTS = sepcounts; + + +[logml, spatialPrior] = computeLogml(adjprior, priorTerm); + +disp(['logml: ' logml2String(logml) ', spatial prior: ' logml2String(spatialPrior)]); + +%-------------------------------------------------------------------------- + +function [emptyPop, pops] = findEmptyPop(npops) +% Palauttaa ensimmäisen tyhjän populaation indeksin. Jos tyhji? +% populaatioita ei ole, palauttaa -1:n. + +global PARTITION; +pops = unique(PARTITION)'; +if (length(pops) ==npops) + emptyPop = -1; +else + popDiff = diff([0 pops npops+1]); + emptyPop = min(find(popDiff > 1)); +end + +%-------------------------------------------------------------------------- + +function [viallinen coordinates] = testaaKoordinaatit(ninds, coordinates) +% Testaa onko koordinaatit kunnollisia. +% modified by Lu Cheng, 05.12.2012 + +viallinen = 1; +if ~isnumeric(coordinates) + warning('Coordinates are not numerical!'); + return; +end + +oikeanKokoinen = (size(coordinates,1) == ninds) & (size(coordinates,2) == 2); +if ~oikeanKokoinen + warning('Wrong coordinates dimension!'); + return; +end + +posstr = cellfun(@(x) sprintf('%.10f',x),num2cell(coordinates),'UniformOutput',false); +posstr = cellfun(@(x) regexprep(x,'0+$',''),posstr,'UniformOutput',false); + +uni1 = unique(posstr(:,1)); +uni2 = unique(posstr(:,2)); +posstr_new = posstr; + +if length(uni1)==ninds && length(uni2)==ninds + viallinen = 0; + return; +else + ans = questdlg('Input coordinates are not unique. Do you want to make them unique?','coordinates NOT unique', 'Yes','No','Yes'); + if strcmp(ans,'No') + warning('Coordinates are not unique!'); + return; + end +end + +for i=1:length(uni1) + tmpinds = find(ismember(posstr(:,1),uni1(i))); + tmpNinds = length(tmpinds); + + if tmpNinds==1 + continue; + end + + assert(tmpNinds<100); + tmparr = round(linspace(0,99,tmpNinds)); + tmparr = tmparr(randperm(tmpNinds)); + + for j=1:tmpNinds + posstr_new{tmpinds(j),1}=sprintf('%s%02d',posstr{tmpinds(j),1},tmparr(j)); + end +end + +for i=1:length(uni2) + tmpinds = find(ismember(posstr(:,2),uni2(i))); + tmpNinds = length(tmpinds); + + if tmpNinds==1 + continue; + end + + assert(tmpNinds<100); + tmparr = round(linspace(0,99,tmpNinds)); + tmparr = tmparr(randperm(tmpNinds)); + + for j=1:tmpNinds + posstr_new{tmpinds(j),2}=sprintf('%s%02d',posstr{tmpinds(j),2},tmparr(j)); + end +end + +coordinates = cellfun(@str2double,posstr_new); +uni1 = unique(coordinates(:,1)); +uni2 = unique(coordinates(:,2)); +if length(uni1)==ninds && length(uni2)==ninds + viallinen = 0; +else + warning('Can not make coordinates unique!'); +end + +%-------------------------------------------------------------------------- + +function varmuus = laskeVarmuus(rowsFromInd, data, adjprior, priorTerm, ... + logml, cliques, separators, ninds); + +varmuus = zeros(ninds,1); + +for ind=1:ninds + muutokset = laskeMuutokset(ind, rowsFromInd, data, adjprior, ... + priorTerm, logml, cliques, separators); + varmuus(ind) = 1/sum(exp(muutokset)); + +end + + +% ------------------------------------------------------------------------- +function [sumcounts, counts] = ... + initialCounts(partition, data, npops, rows, noalle, adjprior) + +nloci=size(data,2); +ninds = size(rows, 1); + +koot = rows(:,1) - rows(:,2) + 1; +maxSize = max(koot); + +counts = zeros(max(noalle),nloci,npops); +sumcounts = zeros(npops,nloci); +for i=1:npops + for j=1:nloci + havainnotLokuksessa = find(partition==i & data(:,j)>=0); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(havainnotLokuksessa,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + +%initializeGammaln(ninds, maxSize, max(noalle)); + +% logml = laskeLoggis(counts, sumcounts, adjprior); + diff --git a/matlab/spatial/spatialPopMixture.m b/matlab/spatial/spatialPopMixture.m new file mode 100644 index 0000000..ad9ec13 --- /dev/null +++ b/matlab/spatial/spatialPopMixture.m @@ -0,0 +1,2645 @@ +function spatialPopMixture() +%Vaihtuvalla populaatioiden määräll? priori 3:lla +%Samassa pisteess?olevien yksilöiden täytyy olla samasta populaatiosta. +%Toiminta pitkälti samanlainen kuin greedyPopMixiss? + +base = findobj('Tag','base_figure'); % added by Lu Cheng, 11.11.2012 + +% check whether fixed k mode is selected +h0 = findobj('Tag','fixk_menu'); +fixedK = get(h0, 'userdata'); + +if fixedK + if ~(fixKWarning == 1) % call function fixKWarning + return + end +end + +% output file name +OUTPUT_FILE = 'baps5_output.baps'; % also remember to update the file name in function WriteMixtureInfo + +% check whether partition compare mode is selected +h1 = findobj('Tag','partitioncompare_menu'); +partitionCompare = get(h1, 'userdata'); + +formatList = {'BAPS-format','FASTA-format', 'GenePop-format', 'Preprocessed data'}; +formatChoice = menu('Specify the format of your data: ','BAPS-format','FASTA-format', 'GenePop-format', 'Preprocessed data'); +if formatChoice==0 + return; +else + input_type = formatList{formatChoice}; +end + +% input_type = questdlg('Specify the format of your data: ',... +% 'Specify Data Format', ... +% 'BAPS-format', 'GenePop-format', 'Preprocessed data', ... +% 'BAPS-format'); + +switch input_type + +case 'BAPS-format' + waitALittle; + [filename1, pathname1] = uigetfile('*.txt', 'Load data in BAPS-format'); + if filename1==0 + return; + end + if ~isempty(partitionCompare) + fprintf(1,'Data: %s\n',[pathname1 filename1]); + end + + data = load([pathname1 filename1]); + ninds = testaaOnkoKunnollinenBapsData(data); %TESTAUS + if (ninds==0) + disp('Incorrect Data-file.'); + return; + end + + waitALittle; + [filename2,pathname2]=uigetfile('*.txt', 'Load group coordinates'); + if filename2==0 + return + end + + coordinates = load([pathname2 filename2]); + %viallinen = testaaKoordinaatit(ninds, coordinates); + [viallinen coordinates] = testaaKoordinaatit(ninds, coordinates); % added by Lu Cheng, 05.12.2012 + if viallinen + disp('Incorrect coordinates'); + return + end + + inp = [filename1 ' & ' filename2]; + h0 = findobj('Tag','filename1_text'); + set(h0,'String',inp); + clear h0; clear inp; + clear filename1; clear filename2; clear pathname1; clear pathname2; + + load_names = questdlg('Do you wish to specify the names of the groups?',... + 'Input group names?','Yes','No','Yes'); + if isequal(load_names,'Yes') + waitALittle; + [filename, pathname] = uigetfile('*.txt', 'Load group names'); + popnames = initPopNames([pathname filename]); + if (size(popnames,1)~=ninds) + disp('Incorrect name-file.'); + popnames = []; + end + else + popnames = []; + end + + disp('Pre-processing the data. This may take several minutes.'); + + [cliques, separators, vorPoints, vorCells, pointers] = ... + handleCoords(coordinates); + [data, rows, alleleCodes, noalle, adjprior, priorTerm] = handleData(data); + [Z,dist] = newGetDistances(data,rows); + + rowsFromInd = 0; % Ei tiedet? + + save_preproc = questdlg('Do you wish to save pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + waitALittle; + [filename, pathname] = uiputfile('*.mat','Save pre-processed data as'); + kokonimi = [pathname filename]; + c.data = data; c.rows = rows; c.alleleCodes = alleleCodes; + c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; + c.dist = dist; c.popnames = popnames; c.Z = Z; + c.cliques = cliques; c.separators = separators; + c.vorPoints = vorPoints; c.rowsFromInd = rowsFromInd; + c.vorCells = vorCells; c.pointers = pointers; + c.coordinates = coordinates; +% save(kokonimi,'c'); + save(kokonimi,'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + clear c; + end; + +%%%%%%%%%%%%% added by Lu Cheng 11.11.2012 START %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +case 'FASTA-format' + +% setWindowOnTop(base,'false') + [filename1, pathname1] = uigetfile({'*.fasta';'*.*'}, 'Load data in FASTA-format'); + if filename1==0 + return; + end + + if ~isempty(partitionCompare) + fprintf(1,'Data: %s\n',[pathname filename]); + end + %data = load([pathname1 filename1]); + [~, seqs] = fastaread([pathname1 filename1]); + seqs = seqs(:); + alnMat = cell2mat(seqs); + nSeq = length(seqs); + clear seqs; + + cc = preprocAln(alnMat); + + setWindowOnTop(base,'false') + [filename1_1,pathname1_1]=uigetfile('*.txt', 'Load group partition'); + if filename1_1==0 + return + end + groupPartition = load([pathname1_1 filename1_1]); + if nSeq~=length(groupPartition) + warning('Number of individuals inconsistent in %s (%d) and %s (%d).',... + filename1,nSeq,filename1_1,length(groupPartition)); + return; + else + nPregroup = length(unique(groupPartition)); + ninds = nPregroup; + assert(nPregroup==max(groupPartition)); + + cc.nPregroup = nPregroup; + cc.groupPartition = groupPartition; + + disp('Calculating distance matrix. This may take several minutes.'); + + pgdist = nchoosek(nPregroup,2); + tmpIndK=1; + for i=1:nPregroup + for j=i+1:nPregroup + tmpIndsI = find(groupPartition==i); + tmpIndsJ = find(groupPartition==j); + tmpNI = length(tmpIndsI); + tmpNJ = length(tmpIndsJ); + tmpSum=0; + for k=tmpIndsJ(:)' + tmp = alnMat(tmpIndsI,:)~=repmat(alnMat(k,:),tmpNI,1); + tmpSum = tmpSum+sum(tmp(:)); + end + + pgdist(tmpIndK)=tmpSum/tmpNI/tmpNJ; + tmpIndK = tmpIndK+1; + end + end + clear tmp* alnMat + end + + setWindowOnTop(base,'false') + [filename2,pathname2]=uigetfile('*.txt', 'Load group coordinates'); + if filename2==0 + return + end + + coordinates = load([pathname2 filename2]); + %viallinen = testaaKoordinaatit(ninds, coordinates); + [viallinen coordinates] = testaaKoordinaatit(ninds, coordinates); % added by Lu Cheng, 05.12.2012 + if viallinen + disp('Incorrect coordinates'); + return + end + + inp = [filename1 ' & ' filename1_1 ' & ' filename2]; + h0 = findobj('Tag','filename1_text'); + set(h0,'String',inp); + clear h0; clear inp; + clear filename1; clear filename2; clear pathname1; clear pathname2; + clear filename1_1 pathname1_1 + + load_names = questdlg('Do you wish to specify the names of the groups?',... + 'Input group names?','Yes','No','Yes'); +% load_names = 'No'; + if isequal(load_names,'Yes') + waitALittle; + [filename, pathname] = uigetfile('*.txt', 'Load group names'); + popnames = initPopNames([pathname filename]); + if (size(popnames,1)~=ninds) + disp('Incorrect name-file.'); + popnames = []; + else + popnames = popnames(:,1); + + end + else + popnames = []; + end + + disp('Pre-processing the data. This may take several minutes.'); + + [cliques, separators, vorPoints, vorCells, pointers] = ... + handleCoords(coordinates); + + cc.locCliques = cliques; + cc.locSeparators = separators; + cc.popnames = popnames; + cc.vorPoints = vorPoints; + cc.vorCells = vorCells; + cc.pointers = pointers; + cc.coordinates = coordinates; + format_type = 'FASTA'; + + save_preproc = questdlg('Do you wish to save pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + waitALittle; + [filename, pathname] = uiputfile('*.mat','Save pre-processed data as'); + kokonimi = [pathname filename]; + save(kokonimi,'cc','pgdist','format_type','groupPartition','-v7.3'); + end + + handlePopFastaCase(cc,groupPartition,pgdist); + + return; + +%%%%%%%%%%%%%add by Lu Cheng 11.11.2012 END %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +case 'GenePop-format' + waitALittle; + [filename1, pathname1] = uigetfile('*.txt', 'Load data in GenePop-format'); + if filename1==0 + return; + end + + if ~isempty(partitionCompare) + fprintf(1,'Data: %s\n',[pathname1 filename1]); + end + + kunnossa = testaaGenePopData([pathname1 filename1]); + if kunnossa==0 + return + end + [data,popnames]=lueGenePopData([pathname1 filename1]); + + waitALittle; + [filename2,pathname2]=uigetfile('*.txt', 'Load group coordinates'); + if filename2==0 + return + end + + ninds = max(data(:,end)); + coordinates = load([pathname2 filename2]); + %viallinen = testaaKoordinaatit(ninds, coordinates); + [viallinen coordinates] = testaaKoordinaatit(ninds, coordinates); % added by Lu Cheng, 05.12.2012 + + if viallinen + disp('Incorrect coordinates'); + return + end + + inp = [filename1 ' & ' filename2]; + h0 = findobj('Tag','filename1_text'); + set(h0,'String',inp); + clear h0; clear inp; + clear filename1; clear filename2; clear pathname1; clear pathname2; + + disp('Pre-processing the data. This may take several minutes.'); + + [cliques, separators, vorPoints, vorCells, pointers] = ... + handleCoords(coordinates); + [data, rows, alleleCodes, noalle, adjprior, priorTerm] = handleData(data); + [Z,dist] = newGetDistances(data,rows); + + rowsFromInd = 2; %Tiedetään + + save_preproc = questdlg('Do you wish to save pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + waitALittle; + [filename, pathname] = uiputfile('*.mat','Save pre-processed data as'); + kokonimi = [pathname filename]; + c.data = data; c.rows = rows; c.alleleCodes = alleleCodes; + c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; + c.dist = dist; c.popnames = popnames; c.Z = Z; + c.cliques = cliques; c.separators = separators; + c.vorPoints = vorPoints; c.rowsFromInd = rowsFromInd; + c.vorCells = vorCells; c.pointers = pointers; + c.coordinates = coordinates; +% save(kokonimi,'c'); + save(kokonimi,'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + clear c; + end; + +case 'Preprocessed data' + waitALittle; + [filename, pathname] = uigetfile('*.mat', 'Load pre-processed data'); + if filename==0 + return; + end + if ~isempty(partitionCompare) + fprintf(1,'Data: %s\n',[pathname filename]); + end + h0 = findobj('Tag','filename1_text'); + set(h0,'String',filename); clear h0; + + struct_array = load([pathname filename]); + + if isfield(struct_array,'format_type') && strcmp(struct_array.format_type,'FASTA') + handlePopFastaCase(struct_array.cc,struct_array.groupPartition,struct_array.pgdist); + return; + end + + if isfield(struct_array,'c') %Matlab versio + c = struct_array.c; + if ~isfield(c,'dist') + disp('Incorrect file format'); + return + end + elseif isfield(struct_array,'dist') %Mideva versio + c = struct_array; + else + disp('Incorrect file format'); + return; + end + data = double(c.data); rows = c.rows; alleleCodes = c.alleleCodes; + noalle = c.noalle; adjprior = c.adjprior; priorTerm = c.priorTerm; + dist = c.dist; popnames = c.popnames; Z = c.Z; rowsFromInd = c.rowsFromInd; + + if isfield(c, 'cliques') + cliques = c.cliques; separators = c.separators; + vorPoints = c.vorPoints; vorCells = c.vorCells; + pointers = c.pointers; coordinates = c.coordinates; + clear c; + else + load_coord = questdlg(['The data file did not contain ',... + 'coordinate information. Do you wish to load coordinates?'], ... + 'Load coordinates?',... + 'Yes','No','Yes'); + if isequal(load_coord, 'No') + return + end + waitALittle; + [filename2,pathname2]=uigetfile('*.txt', 'Load group coordinates'); + if filename2==0 + return + end + + ninds = max(data(:,end)); + coordinates = load([pathname2 filename2]); + %viallinen = testaaKoordinaatit(ninds, coordinates); + [viallinen coordinates] = testaaKoordinaatit(ninds, coordinates); % added by Lu Cheng, 05.12.2012 + + if viallinen + disp('Incorrect coordinates'); + return + end + inp = [filename ' & ' filename2]; + h0 = findobj('Tag','filename1_text'); + set(h0,'String',inp); + clear h0; clear inp; + clear filename; clear filename2; clear pathname; clear pathname2; + + disp('Pre-processing the data. This may take several minutes.'); + [cliques, separators, vorPoints, vorCells, pointers] = ... + handleCoords(coordinates); + save_preproc = questdlg('Do you wish to save pre-processed data?',... + 'Save pre-processed data?',... + 'Yes','No','Yes'); + if isequal(save_preproc,'Yes'); + waitALittle; + [filename, pathname] = uiputfile('*.mat','Save pre-processed data as'); + kokonimi = [pathname filename]; + c.cliques = cliques; c.separators = separators; + c.vorPoints = vorPoints; c.vorCells = vorCells; + c.pointers = pointers; c.coordinates = coordinates; +% save(kokonimi,'c'); + save(kokonimi,'c','-v7.3'); % added by Lu Cheng, 08.06.2012 + clear c; + end; + end + otherwise + return +end + +global PARTITION; global COUNTS; +global SUMCOUNTS; %global POP_LOGML; +global SEPCOUNTS; global CLIQCOUNTS; +clearGlobalVars; + +c.data=data; c.alleleCodes = alleleCodes; +c.noalle = noalle; c.adjprior = adjprior; c.priorTerm = priorTerm; +c.dist=dist; c.Z=Z; c.rowsFromInd = rowsFromInd; +c.cliques = cliques; c.separators = separators; +c.rows = rows; + +% partition compare +if ~isempty(partitionCompare) + nsamplingunits = size(rows,1); + data = data(:,1:end-1); + partitions = partitionCompare.partitions; + npartitions = size(partitions,2); + partitionLogml = zeros(1,npartitions); + for i = 1:npartitions + % number of unique partition lables + npops = length(unique(partitions(:,i))); + try + partitionInd = zeros(rows(end),1); + partitionSample = partitions(:,i); + for j = 1:nsamplingunits + partitionInd([c.rows(j,1):c.rows(j,2)]) = partitionSample(j); + end + [sumcounts, counts] = initialCounts(partitionInd, data, npops, noalle); + [cliqcounts, sepcounts] = computeCounts(cliques, separators, npops, partitionSample); + partitionLogml(i) = ... + computeLogml(adjprior, priorTerm, cliqcounts, sepcounts, counts, sumcounts); + catch + disp('*** ERROR: unmatched data.'); + return + end + end + % return the logml result + partitionCompare.logmls = partitionLogml; + set(h1, 'userdata', partitionCompare); + return +end + +if fixedK + [logml, npops, partitionSummary]=spatialMix_fixK(c); +else + [logml, npops, partitionSummary]=spatialMix(c); +end + +if logml==1 + return +end + +lastCol = data(:,end); +data = data(:,1:end-1); + +h0 = findobj('Tag','filename1_text'); inp = get(h0,'String'); +h0 = findobj('Tag','filename2_text'); +outp = get(h0,'String'); +[varmuus,changesInLogml] = writeMixtureInfo(logml, rows, data, adjprior, priorTerm, ... + outp,inp,partitionSummary, popnames, cliques, separators, fixedK); + +%checkLogml(priorTerm, adjprior, cliques, separators); + +viewPopMixPartition(PARTITION, rows, popnames); + +if isequal(popnames, []) + names = pointers; +else + %Etsitään voronoi-soluja vastaavat nimet. + names = cell(size(pointers)); + indices = 1:length(popnames); + for i = 1:length(pointers) + inds = pointers{i}; + namesInCell = []; + for j = 1:length(inds) + ind = inds(j); + I = find(indices > ind); + if isempty(I) + nameIndex = indices(end); + else + nameIndex = min(I) -1; + end + name = popnames{nameIndex}; + namesInCell = [namesInCell name]; + end + names{i} = namesInCell; + end +end +vorPlot(vorPoints, vorCells, PARTITION, pointers, coordinates, names); + +talle = questdlg(['Do you want to save the mixture populations ' ... + 'so that you can use them later in admixture analysis or plot ' ... + 'additional images?'], ... + 'Save results?','Yes','No','Yes'); +if isequal(talle,'Yes') + waitALittle; + [filename, pathname] = uiputfile('*.mat','Save results as'); + + if (filename == 0) & (pathname == 0) + % Cancel was pressed + return + else % copy OUTPUT_FILE into the text file with the same name. + if exist(OUTPUT_FILE,'file') + copyfile(OUTPUT_FILE,[pathname filename '.txt']) + delete(OUTPUT_FILE) + end + end + + if rowsFromInd==0 + %Käytettiin BAPS-formaattia, eik?rowsFromInd ole tunnettu. + [popnames, rowsFromInd] = findOutRowsFromInd(popnames, rows); + end + + groupPartition = PARTITION; + + fiksaaPartitioYksiloTasolle(rows, rowsFromInd); + + c.PARTITION = PARTITION; c.COUNTS = COUNTS; c.SUMCOUNTS = SUMCOUNTS; + c.alleleCodes = alleleCodes; c.adjprior = adjprior; c.popnames = popnames; + c.rowsFromInd = rowsFromInd; c.data = data; c.npops = npops; + c.noalle = noalle; c.groupPartition = groupPartition; + c.pointers = pointers; c.vorPoints = vorPoints; c.vorCells = vorCells; + c.coordinates = coordinates; c.names = names; c.varmuus = varmuus; + c.rows = rows; c.mixtureType = 'spatialPop'; + c.logml = logml; c.changesInLogml = changesInLogml; + + % added by Lu Cheng, 05.12.2012 + tmpFile = [pathname filename '.mapfile.txt']; + fid = fopen(tmpFile,'w+'); + fprintf(fid,'GroupLabel\tLatitude\tLongitude\tDescription\tLabel\n'); + for i=1:max(lastCol) + fprintf(fid,'%d\t%.10f\t%.10f\t%d_%d\t%d\n',i,coordinates(i,1),coordinates(i,2),... + i,PARTITION(i),PARTITION(i)); + end + fclose(fid); + +% save([pathname filename], 'c'); + save([pathname filename], 'c', '-v7.3'); % added by Lu Cheng, 08.06.2012 +else + if exist(OUTPUT_FILE,'file') + delete(OUTPUT_FILE) + end +end + + + + + +%------------------------------------------------------------------------------------- +%------------------------------------------------------------------------------------- + +function clearGlobalVars + +global COUNTS; COUNTS = []; +global SUMCOUNTS; SUMCOUNTS = []; +global PARTITION; PARTITION = []; +%global POP_LOGML; POP_LOGML = []; +global SEPCOUNTS; SEPCOUNTS = []; +global CLIQCOUNTS; CLIQCOUNTS = []; + +%------------------------------------------------------------------------------------- + + +function rows = computeRows(rowsFromInd, inds, ninds) +% On annettu yksilöt inds. Funktio palauttaa vektorin, joka +% sisältää niiden rivien numerot, jotka sisältävät yksilöiden +% dataa. + +rows = inds(:, ones(1,rowsFromInd)); +rows = rows*rowsFromInd; +miinus = repmat(rowsFromInd-1 : -1 : 0, [ninds 1]); +rows = rows - miinus; +rows = reshape(rows', [1,rowsFromInd*ninds]); + + +%-------------------------------------------------------------------------- + + +function [partitionSummary, added] = addToSummary(logml, partitionSummary, worstIndex) +% Tiedetään, ett?annettu logml on isompi kuin huonoin arvo +% partitionSummary taulukossa. Jos partitionSummary:ss?ei viel?ole +% annettua logml arvoa, niin lisätään worstIndex:in kohtaan uusi logml ja +% nykyist?partitiota vastaava nclusters:in arvo. Muutoin ei tehd?mitään. + +apu = find(abs(partitionSummary(:,2)-logml)<1e-5); +if isempty(apu) + % Nyt löydetty partitio ei ole viel?kirjattuna summaryyn. + global PARTITION; + npops = length(unique(PARTITION)); + partitionSummary(worstIndex,1) = npops; + partitionSummary(worstIndex,2) = logml; + added = 1; +else + added = 0; +end + + +%-------------------------------------------------------------------------- + + +function [suurin, i2] = arvoSeuraavaTila(muutokset, logml) +% Suorittaa yksilön seuraavan tilan arvonnan + +y = logml + muutokset; % siirron jälkeiset logml:t +y = y - max(y); +y = exp(y); +summa = sum(y); +y = y/summa; +y = cumsum(y); + +i2 = rand_disc(y); % uusi kori +suurin = muutokset(i2); + + +%-------------------------------------------------------------------------------------- + + +function svar=rand_disc(CDF) +%returns an index of a value from a discrete distribution using inversion method +slump=rand; +har=find(CDF>slump); +svar=har(1); + + +%------------------------------------------------------------------------------------- + + +function updateGlobalVariables(ind, i2, diffInCounts, ... + cliques, separators, adjprior, priorTerm) +% Suorittaa globaalien muuttujien muutokset, kun yksil?ind +% siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +global CLIQCOUNTS; +global SEPCOUNTS; + +i1 = PARTITION(ind); +PARTITION(ind)=i2; + +diffInCliqCounts = computeDiffInCliqCounts(cliques, ind); +diffInSepCounts = computeDiffInCliqCounts(separators, ind); + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; +CLIQCOUNTS(:,i2) = CLIQCOUNTS(:,i2) + diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; +SEPCOUNTS(:,i2) = SEPCOUNTS(:,i2) + diffInSepCounts; + +%POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + + +%--------------------------------------------------------------------------------- + + +function updateGlobalVariables2(i1, i2, diffInCounts, ... + cliques, separators, adjprior, priorTerm); +% Suorittaa globaalien muuttujien muutokset, kun kaikki +% korissa i1 olevat yksilöt siirretään koriin i2. + +global PARTITION; +global COUNTS; +global SUMCOUNTS; +%global POP_LOGML; +global CLIQCOUNTS; +global SEPCOUNTS; + +inds = find(PARTITION==i1); +PARTITION(inds) = i2; + +diffInCliqCounts = CLIQCOUNTS(:,i1); +diffInSepCounts = SEPCOUNTS(:,i1); + + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +CLIQCOUNTS(:,i1) = 0; +CLIQCOUNTS(:,i2) = CLIQCOUNTS(:,i2) + diffInCliqCounts; +SEPCOUNTS(:,i1) = 0; +SEPCOUNTS(:,i2) = SEPCOUNTS(:,i2) + diffInSepCounts; + + +%------------------------------------------------------------------------------------ + + +function updateGlobalVariables3(muuttuvat, diffInCounts, ... + adjprior, priorTerm, i2, cliques, separators); +% Suorittaa globaalien muuttujien päivitykset, kun yksilöt 'muuttuvat' +% siirretään koriin i2. Ennen siirtoa yksilöiden on kuuluttava samaan +% koriin. + +global PARTITION; +global COUNTS; global CLIQCOUNTS; +global SUMCOUNTS; global SEPCOUNTS; +%global POP_LOGML; + +i1 = PARTITION(muuttuvat(1)); +PARTITION(muuttuvat) = i2; + +diffInCliqCounts = computeDiffInCliqCounts(cliques, muuttuvat); +diffInSepCounts = computeDiffInCliqCounts(separators, muuttuvat); + +COUNTS(:,:,i1) = COUNTS(:,:,i1) - diffInCounts; +COUNTS(:,:,i2) = COUNTS(:,:,i2) + diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:) - sum(diffInCounts); +SUMCOUNTS(i2,:) = SUMCOUNTS(i2,:) + sum(diffInCounts); + +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; +CLIQCOUNTS(:,i2) = CLIQCOUNTS(:,i2) + diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; +SEPCOUNTS(:,i2) = SEPCOUNTS(:,i2) + diffInSepCounts; + +%POP_LOGML([i1 i2]) = computePopulationLogml([i1 i2], adjprior, priorTerm); + + +%---------------------------------------------------------------------- + + +function inds = returnInOrder(inds, pop, globalRows, data, ... + adjprior, priorTerm) +% Palauttaa yksilöt järjestyksess?siten, ett?ensimmäisen?on +% se, jonka poistaminen populaatiosta pop nostaisi logml:n +% arvoa eniten. + +global COUNTS; global SUMCOUNTS; +ninds = length(inds); +apuTaulu = [inds, zeros(ninds,1)]; + +for i=1:ninds + ind =inds(i); + rows = globalRows(i,1):globalRows(i,2); + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + COUNTS(:,:,pop) = COUNTS(:,:,pop)-diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)-diffInSumCounts; + apuTaulu(i, 2) = computePopulationLogml(pop, adjprior, priorTerm); + COUNTS(:,:,pop) = COUNTS(:,:,pop)+diffInCounts; + SUMCOUNTS(pop,:) = SUMCOUNTS(pop,:)+diffInSumCounts; +end +apuTaulu = sortrows(apuTaulu,2); +inds = apuTaulu(ninds:-1:1,1); + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = laskeMuutokset(ind, globalRows, ... + data, adjprior, priorTerm, logml, cliques, separators) +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli yksilöt inds siirretään koriin i. +% diffInCounts on poistettava COUNTS:in siivusta i1 ja lisättäv? +% COUNTS:in siivuun i2, mikäli muutos toteutetaan. +% Huom! Laskee muutoksen vain yhdelle tyhjälle populaatiolle, muiille +% tyhjille tulee muutokseksi 0. + +global COUNTS; global SUMCOUNTS; +global PARTITION; %global POP_LOGML; +global CLIQCOUNTS; global SEPCOUNTS; + +npops = size(COUNTS,3); +muutokset = zeros(npops,1); + +counts = COUNTS; +sumcounts = SUMCOUNTS; + +[emptyPop, pops] = findEmptyPop(npops); + +i1 = PARTITION(ind); + +i2 = [pops(find(pops~=i1))]; +if emptyPop > 0 + i2 =[i2 emptyPop]; +end + +i2 = sort(i2); + +rows = globalRows(ind,1):globalRows(ind,2); +diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); + +diffInCliqCounts = computeDiffInCliqCounts(cliques, ind); +diffInSepCounts = computeDiffInCliqCounts(separators, ind); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; + +for i=i2 + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) + diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) + diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) + diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) + diffInSumCounts; + + muutokset(i) = computeLogml(adjprior, priorTerm) - logml; + + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) - diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) - diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) - diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) - diffInSumCounts; +end + +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; +CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) + diffInCliqCounts; +SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) + diffInSepCounts; + +% Asetetaan muillekin tyhjille populaatioille sama muutos, kuin +% emptyPop:lle + +if emptyPop > 0 + empties = mysetdiff((1:npops), [i2 i1]); + muutokset(empties) = muutokset(emptyPop); +end + +COUNTS = counts; +SUMCOUNTS = sumcounts; + +%------------------------------------------------------------------------------------ + + +function [muutokset, diffInCounts] = laskeMuutokset2(i1, globalRows, ... + data, adjprior, priorTerm, logml, cliques, separators); +% Palauttaa npops*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli korin i1 kaikki yksilöt siirretään +% koriin i. +% Laskee muutokset vain yhdelle tyhjälle populaatiolle, muille tulee +% muutokseksi 0. + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +global CLIQCOUNTS; global SEPCOUNTS; + +npops = size(COUNTS,3); +muutokset = zeros(npops,1); + +[emptyPop, pops] = findEmptyPop(npops); + +i2 = [pops(find(pops~=i1))]; +if emptyPop > 0 + i2 =[i2 emptyPop]; +end + +inds = find(PARTITION == i1); +ninds = length(inds); + +rows = []; +for i = 1:ninds + rows = [rows globalRows(inds(i),1):globalRows(inds(i),2)]; +end +diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); +diffInSumCounts = sum(diffInCounts); +diffInCliqCounts = computeDiffInCliqCounts(cliques, inds); +diffInSepCounts = computeDiffInCliqCounts(separators, inds); + +COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; +CLIQCOUNTS(:,i1) = 0; +SEPCOUNTS(:,i1) = 0; + +for i=i2 + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) + diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) + diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) + diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) + diffInSumCounts; + + muutokset(i) = computeLogml(adjprior, priorTerm) - logml; + + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) - diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) - diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) - diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) - diffInSumCounts; +end + +COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; +SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; +CLIQCOUNTS(:,i1) = diffInCliqCounts; +SEPCOUNTS(:,i1) = diffInSepCounts; + + + +%------------------------------------------------------------------------------------ + +function muutokset = laskeMuutokset3(T2, inds2, globalRows, ... + data, adjprior, priorTerm, i1, logml, cliques, separators) +% Palauttaa length(unique(T2))*npops taulun, jossa (i,j):s alkio +% kertoo, mik?olisi muutos logml:ss? jos populaation i1 osapopulaatio +% inds2(find(T2==i)) siirretään koriin j. +% Laskee vain yhden tyhjän populaation, muita kohden muutokseksi jää 0. + + +global COUNTS; global SUMCOUNTS; +global PARTITION; global POP_LOGML; +global CLIQCOUNTS; global SEPCOUNTS; + +npops = size(COUNTS,3); +npops2 = length(unique(T2)); +muutokset = zeros(npops2, npops); + +for pop2 = 1:npops2 + inds = inds2(find(T2==pop2)); + ninds = length(inds); + if ninds>0 + rows = []; + for i = 1:ninds + ind = inds(i); + rows = [rows; (globalRows(ind,1):globalRows(ind,2))']; + end + diffInCounts = computeDiffInCounts(rows', size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + diffInCliqCounts = computeDiffInCliqCounts(cliques, inds); + diffInSepCounts = computeDiffInCliqCounts(separators, inds); + + COUNTS(:,:,i1) = COUNTS(:,:,i1)-diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)-diffInSumCounts; + CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) - diffInCliqCounts; + SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) - diffInSepCounts; + + [emptyPop, pops] = findEmptyPop(npops); + i2 = [pops(find(pops~=i1))]; + if emptyPop > 0 + i2 =[i2 emptyPop]; + end + + for i = i2 + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) + diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) + diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) + diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) + diffInSumCounts; + + muutokset(pop2,i) = computeLogml(adjprior, priorTerm) - logml; + + CLIQCOUNTS(:,i) = CLIQCOUNTS(:,i) - diffInCliqCounts; + SEPCOUNTS(:,i) = SEPCOUNTS(:,i) - diffInSepCounts; + COUNTS(:,:,i) = COUNTS(:,:,i) - diffInCounts; + SUMCOUNTS(i,:) = SUMCOUNTS(i,:) - diffInSumCounts; + end + + COUNTS(:,:,i1) = COUNTS(:,:,i1)+diffInCounts; + SUMCOUNTS(i1,:) = SUMCOUNTS(i1,:)+diffInSumCounts; + CLIQCOUNTS(:,i1) = CLIQCOUNTS(:,i1) + diffInCliqCounts; + SEPCOUNTS(:,i1) = SEPCOUNTS(:,i1) + diffInSepCounts; + end +end + +%-------------------------------------------------------------------------- +function muutokset = laskeMuutokset5(inds, globalRows, data, ... + adjprior, priorTerm, logml, cliques, separators, i1, i2) + +% Palauttaa length(inds)*1 taulun, jossa i:s alkio kertoo, mik?olisi +% muutos logml:ss? mikäli yksil?i vaihtaisi koria i1:n ja i2:n välill? + +global COUNTS; global SUMCOUNTS; +global PARTITION; +global CLIQCOUNTS; global SEPCOUNTS; + +ninds = length(inds); +muutokset = zeros(ninds,1); + +for i = 1:ninds + ind = inds(i); + + rows = globalRows(ind,1):globalRows(ind,2); + diffInCounts = computeDiffInCounts(rows, size(COUNTS,1), size(COUNTS,2), data); + diffInSumCounts = sum(diffInCounts); + + if PARTITION(ind)==i1 + pop1 = i1; %mist? + pop2 = i2; %mihin + else + pop1 = i2; + pop2 = i1; + end + + diffInCliqCounts = computeDiffInCliqCounts(cliques, ind); + diffInSepCounts = computeDiffInCliqCounts(separators, ind); + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)-diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)-diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)+diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)+diffInSumCounts; + + CLIQCOUNTS(:,pop1) = CLIQCOUNTS(:,pop1) - diffInCliqCounts; + CLIQCOUNTS(:,pop2) = CLIQCOUNTS(:,pop2) + diffInCliqCounts; + SEPCOUNTS(:,pop1) = SEPCOUNTS(:,pop1) - diffInSepCounts; + SEPCOUNTS(:,pop2) = SEPCOUNTS(:,pop2) + diffInSepCounts; + + muutokset(i) = computeLogml(adjprior, priorTerm) - logml; + + COUNTS(:,:,pop1) = COUNTS(:,:,pop1)+diffInCounts; + SUMCOUNTS(pop1,:) = SUMCOUNTS(pop1,:)+diffInSumCounts; + COUNTS(:,:,pop2) = COUNTS(:,:,pop2)-diffInCounts; + SUMCOUNTS(pop2,:) = SUMCOUNTS(pop2,:)-diffInSumCounts; + + CLIQCOUNTS(:,pop1) = CLIQCOUNTS(:,pop1) + diffInCliqCounts; + CLIQCOUNTS(:,pop2) = CLIQCOUNTS(:,pop2) - diffInCliqCounts; + SEPCOUNTS(:,pop1) = SEPCOUNTS(:,pop1) + diffInSepCounts; + SEPCOUNTS(:,pop2) = SEPCOUNTS(:,pop2) - diffInSepCounts; +end + +%-------------------------------------------------------------------------- + +function diffInCounts = computeDiffInCounts(rows, max_noalle, nloci, data) +% Muodostaa max_noalle*nloci taulukon, jossa on niiden alleelien +% lukumäärät (vastaavasti kuin COUNTS:issa), jotka ovat data:n +% riveill?rows. + +diffInCounts = zeros(max_noalle, nloci); +for i=rows + row = data(i,:); + notEmpty = find(row>=0); + + if length(notEmpty)>0 + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) = ... + diffInCounts(row(notEmpty) + (notEmpty-1)*max_noalle) + 1; + end +end + + + +%------------------------------------------------------------------------------------ + + +function popLogml = computePopulationLogml(pops, adjprior, priorTerm) +% Palauttaa length(pops)*1 taulukon, jossa on laskettu korikohtaiset +% logml:t koreille, jotka on määritelty pops-muuttujalla. + +global COUNTS; +global SUMCOUNTS; +x = size(COUNTS,1); +y = size(COUNTS,2); +z = length(pops); + +popLogml = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior,[1 1 length(pops)]) + COUNTS(:,:,pops)) ... + ,[x y z]),1),2)) - sum(gammaln(1+SUMCOUNTS(pops,:)),2) - priorTerm; + +%------------------------------------------------------------------------------------ + +function npops = poistaTyhjatPopulaatiot(npops) +% Poistaa tyhjentyneet populaatiot COUNTS:ista ja +% SUMCOUNTS:ista. Päivittää npops:in ja PARTITION:in. + +global COUNTS; +global SUMCOUNTS; +global PARTITION; +global CLIQCOUNTS; +global SEPCOUNTS; + +notEmpty = find(any(SUMCOUNTS,2)); +COUNTS = COUNTS(:,:,notEmpty); +SUMCOUNTS = SUMCOUNTS(notEmpty,:); +CLIQCOUNTS = CLIQCOUNTS(:,notEmpty); +SEPCOUNTS = SEPCOUNTS(:,notEmpty); + +for n=1:length(notEmpty) + apu = find(PARTITION==notEmpty(n)); + PARTITION(apu)=n; +end +npops = length(notEmpty); + + +%---------------------------------------------------------------------------------- +%Seuraavat kolme funktiota liittyvat alkupartition muodostamiseen. + +function initial_partition=admixture_initialization(data_matrix,nclusters,Z) +size_data=size(data_matrix); +nloci=size_data(2)-1; +n=max(data_matrix(:,end)); +T=cluster_own(Z,nclusters); +initial_partition=zeros(size_data(1),1); +for i=1:n + kori=T(i); + here=find(data_matrix(:,end)==i); + for j=1:length(here) + initial_partition(here(j),1)=kori; + end +end + +function T = cluster_own(Z,nclust) +true=logical(1); +false=logical(0); +maxclust = nclust; +% Start of algorithm +m = size(Z,1)+1; +T = zeros(m,1); + % maximum number of clusters based on inconsistency + if m <= maxclust + T = (1:m)'; + elseif maxclust==1 + T = ones(m,1); + else + clsnum = 1; + for k = (m-maxclust+1):(m-1) + i = Z(k,1); % left tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + i = Z(k,2); % right tree + if i <= m % original node, no leafs + T(i) = clsnum; + clsnum = clsnum + 1; + elseif i < (2*m-maxclust+1) % created before cutoff, search down the tree + T = clusternum(Z, T, i-m, clsnum); + clsnum = clsnum + 1; + end + end + end + +function T = clusternum(X, T, k, c) +m = size(X,1)+1; +while(~isempty(k)) + % Get the children of nodes at this level + children = X(k,1:2); + children = children(:); + + % Assign this node number to leaf children + t = (children<=m); + T(children(t)) = c; + + % Move to next level + k = children(~t) - m; +end + + +%--------------------------------------------------------------------------------------- + + +function [newData, rows, alleleCodes, noalle, adjprior, priorTerm] = ... + handleData(raw_data) +% Alkuperäisen datan viimeinen sarake kertoo, milt?yksilölt? +% kyseinen rivi on peräisin. Funktio tutkii ensin, ett?montako +% rivi?maksimissaan on peräisin yhdelt?yksilölt? jolloin saadaan +% tietää onko kyseess?haploidi, diploidi jne... Tämän jälkeen funktio +% lisää tyhji?rivej?niille yksilöille, joilta on peräisin vähemmän +% rivej?kuin maksimimäär? +% Mikäli jonkin alleelin koodi on =0, funktio muuttaa tämän alleelin +% koodi pienimmäksi koodiksi, joka isompi kuin mikään käytöss?oleva koodi. +% Tämän jälkeen funktio muuttaa alleelikoodit siten, ett?yhden lokuksen j +% koodit saavat arvoja välill?1,...,noalle(j). +% +% Muutettu vastaamaan greedyPopMixin handlePopDataa. + +data = raw_data; +nloci=size(raw_data,2)-1; + +dataApu = data(:,1:nloci); +nollat = find(dataApu==0); +if ~isempty(nollat) + isoinAlleeli = max(max(dataApu)); + dataApu(nollat) = isoinAlleeli+1; + data(:,1:nloci) = dataApu; +end +dataApu = []; nollat = []; isoinAlleeli = []; + +noalle=zeros(1,nloci); +alleelitLokuksessa = cell(nloci,1); +for i=1:nloci + alleelitLokuksessaI = unique(data(:,i)); + alleelitLokuksessa{i,1} = alleelitLokuksessaI(find(alleelitLokuksessaI>=0)); + noalle(i) = length(alleelitLokuksessa{i,1}); +end +alleleCodes = zeros(max(noalle),nloci); +for i=1:nloci + alleelitLokuksessaI = alleelitLokuksessa{i,1}; + puuttuvia = max(noalle)-length(alleelitLokuksessaI); + alleleCodes(:,i) = [alleelitLokuksessaI; zeros(puuttuvia,1)]; +end + +for loc = 1:nloci + for all = 1:noalle(loc) + data(find(data(:,loc)==alleleCodes(all,loc)), loc)=all; + end; +end; + +nind = max(data(:,end)); +%rows = cell(nind,1); +rows = zeros(nind,2); +for i=1:nind + rivit = find(data(:,end)==i)'; + rows(i,1) = min(rivit); + rows(i,2) = max(rivit); +end +newData = data; + +adjprior = zeros(max(noalle),nloci); +priorTerm = 0; +for j=1:nloci + adjprior(:,j) = [repmat(1/noalle(j), [noalle(j),1]) ; ones(max(noalle)-noalle(j),1)]; + priorTerm = priorTerm + noalle(j)*gammaln(1/noalle(j)); +end + + +%---------------------------------------------------------------------------------------- + +function [Z, dist] = newGetDistances(data, initRows) + +ninds = size(initRows,1); +nloci = size(data,2)-1; +riviLkm = nchoosek(ninds,2); + +empties = find(data<0); +data(empties)=0; +data = uint8(data); % max(noalle) oltava <256 + +pariTaulu = zeros(riviLkm,2); +aPointer=1; +for a=1:ninds-1 + pariTaulu(aPointer:aPointer+ninds-1-a,1) = ones(ninds-a,1)*a; + pariTaulu(aPointer:aPointer+ninds-1-a,2) = (a+1:ninds)'; + aPointer = aPointer+ninds-a; +end + +%eka = pariTaulu(:,ones(1,rowsFromInd)); +%eka = eka * rowsFromInd; +%miinus = repmat(rowsFromInd-1 : -1 : 0, [riviLkm 1]); +%eka = eka - miinus; + +koot = initRows(:,2) - initRows(:,1); +maxSize = max(koot) + 1; + +rows = zeros(ninds, maxSize); + +for i=1:ninds + apu = initRows(i,1):initRows(i,2); + rows(i, 1:length(apu)) = apu; +end +eka = zeros(riviLkm, maxSize); +toka = zeros(riviLkm, maxSize); + +for i = 1:riviLkm + eka(i, :) = rows(pariTaulu(i, 1), :); + toka(i, :) = rows(pariTaulu(i,2), :); +end + +%eka = uint16(eka); +%toka = uint16(toka); + +summa = zeros(riviLkm,1); +vertailuja = zeros(riviLkm,1); + +clear pariTaulu; clear miinus; + +x = zeros(size(eka)); x = uint8(x); +y = zeros(size(toka)); y = uint8(y); + +for j=1:nloci; + + for k=1:maxSize + I = find(eka(:,k)>0); + x(I,k) = data(eka(I,k),j); + I = find(toka(:,k)>0); + y(I,k) = data(toka(I,k),j); + end + + for a=1:maxSize + for b=1:maxSize + vertailutNyt = double(x(:,a)>0 & y(:,b)>0); + vertailuja = vertailuja + vertailutNyt; + lisays = (x(:,a)~=y(:,b) & vertailutNyt); + summa = summa+double(lisays); + end + end +end + +clear x; clear y; clear vertailutNyt; +nollat = find(vertailuja==0); +dist = zeros(length(vertailuja),1); +dist(nollat) = 1; +muut = find(vertailuja>0); +dist(muut) = summa(muut)./vertailuja(muut); +clear summa; clear vertailuja; + +Z = linkage(dist'); + +%---------------------------------------------------------------------------------------- + + +function [Z, distances]=getDistances(data_matrix,nclusters) + +%finds initial admixture clustering solution with nclusters clusters, uses simple mean Hamming distance +%gives partition in 8-bit format +%allocates all alleles of a single individual into the same basket +%data_matrix contains #Loci+1 columns, last column indicate whose alleles are placed in each row, +%i.e. ranges from 1 to #individuals. For diploids there are 2 rows per individual, for haploids only a single row +%missing values are indicated by zeros in the partition and by negative integers in the data_matrix. + +size_data=size(data_matrix); +nloci=size_data(2)-1; +n=max(data_matrix(:,end)); +distances=zeros(nchoosek(n,2),1); +pointer=1; +for i=1:n-1 + i_data=data_matrix(find(data_matrix(:,end)==i),1:nloci); + for j=i+1:n + d_ij=0; + j_data=data_matrix(find(data_matrix(:,end)==j),1:nloci); + vertailuja = 0; + for k=1:size(i_data,1) + for l=1:size(j_data,1) + here_i=find(i_data(k,:)>=0); + here_j=find(j_data(l,:)>=0); + here_joint=intersect(here_i,here_j); + vertailuja = vertailuja + length(here_joint); + d_ij = d_ij + length(find(i_data(k,here_joint)~=j_data(l,here_joint))); + end + end + d_ij = d_ij / vertailuja; + distances(pointer)=d_ij; + pointer=pointer+1; + end +end + +Z=linkage(distances'); + + + +%---------------------------------------------------------------------------------------- + + +function Z = linkage(Y, method) +[k, n] = size(Y); +m = (1+sqrt(1+8*n))/2; +if k ~= 1 | m ~= fix(m) + error('The first input has to match the output of the PDIST function in size.'); +end +if nargin == 1 % set default switch to be 'co' + method = 'co'; +end +method = lower(method(1:2)); % simplify the switch string. +monotonic = 1; +Z = zeros(m-1,3); % allocate the output matrix. +N = zeros(1,2*m-1); +N(1:m) = 1; +n = m; % since m is changing, we need to save m in n. +R = 1:n; +for s = 1:(n-1) + X = Y; + [v, k] = min(X); + i = floor(m+1/2-sqrt(m^2-m+1/4-2*(k-1))); + j = k - (i-1)*(m-i/2)+i; + Z(s,:) = [R(i) R(j) v]; % update one more row to the output matrix A + I1 = 1:(i-1); I2 = (i+1):(j-1); I3 = (j+1):m; % these are temp variables. + U = [I1 I2 I3]; + I = [I1.*(m-(I1+1)/2)-m+i i*(m-(i+1)/2)-m+I2 i*(m-(i+1)/2)-m+I3]; + J = [I1.*(m-(I1+1)/2)-m+j I2.*(m-(I2+1)/2)-m+j j*(m-(j+1)/2)-m+I3]; + + switch method + case 'si' %single linkage + Y(I) = min(Y(I),Y(J)); + case 'av' % average linkage + Y(I) = Y(I) + Y(J); + case 'co' %complete linkage + Y(I) = max(Y(I),Y(J)); + case 'ce' % centroid linkage + K = N(R(i))+N(R(j)); + Y(I) = (N(R(i)).*Y(I)+N(R(j)).*Y(J)-(N(R(i)).*N(R(j))*v^2)./K)./K; + case 'wa' + Y(I) = ((N(R(U))+N(R(i))).*Y(I) + (N(R(U))+N(R(j))).*Y(J) - ... + N(R(U))*v)./(N(R(i))+N(R(j))+N(R(U))); + end + J = [J i*(m-(i+1)/2)-m+j]; + Y(J) = []; % no need for the cluster information about j. + + % update m, N, R + m = m-1; + N(n+s) = N(R(i)) + N(R(j)); + R(i) = n+s; + R(j:(n-1))=R((j+1):n); +end + + +%----------------------------------------------------------------------------------- + + +function popnames = initPopNames(nameFile) + +fid = fopen(nameFile); +if fid == -1 + %File didn't exist + msgbox('Loading of the population names was unsuccessful', ... + 'Error', 'error'); + return; +end; +line = fgetl(fid); +counter = 1; +while (line ~= -1) & ~isempty(line) + names{counter} = line; + line = fgetl(fid); + counter = counter + 1; +end; +fclose(fid); + +popnames = cell(length(names), 2); +for i = 1:length(names) + popnames{i,1} = names(i); + popnames{i,2} = 0; +end + + +%----------------------------------------------------------------------------------- +% Laskee arvot cliqcounts:lle ja sepcounts:lle + +function [cliqcounts, sepcounts] = computeCounts(cliques, separators, npops, PARTITION) + +ncliq = size(cliques,1); +nsep = size(separators,1); + +cliqPartition = zeros(ncliq, size(cliques,2)); +sepPartition = zeros(nsep, size(separators, 2)); + +apuCliq = find(cliques > 0); +apuSep = find(separators > 0); + +cliqPartition(apuCliq) = PARTITION(cliques(apuCliq)); +sepPartition(apuSep) = PARTITION(separators(apuSep)); + + +cliqcounts = zeros(ncliq, npops); +for i = 1:npops + cliqcounts(:,i) = sum(cliqPartition == i, 2); +end + + +sepcounts = zeros(nsep, npops); +for i = 1:npops + sepcounts(:,i) = sum(sepPartition == i, 2); +end + +%------------------------------------------------------------------------- + +function diffInCliqCounts = computeDiffInCliqCounts(cliques, inds) +% Laskee muutoksen CLIQCOUNTS:ssa (tai SEPCOUNTS:ssa, jos syötteen? +% separators) kun yksilöt inds siirretään. +% diffInCliqcounts on ncliq*1 taulu, joka on CLIQCOUNTS:n sarakkeesta josta +% yksilöt inds siirretään ja lisättäv?sarakkeeseen, johon yksilöt +% siirretään. + +ncliq = size(cliques,1); +diffInCliqCounts = zeros(ncliq,1); +ninds = length(inds); +for i = 1:ninds + ind = inds(i); + rivit = sum((cliques == ind),2); + diffInCliqCounts = diffInCliqCounts + rivit; +end + +%----------------------------------------------------------------------- + +function [logml, spatialPrior] = computeLogml(adjprior,priorTerm, ... + CLIQCOUNTS, SEPCOUNTS, ... + COUNTS, SUMCOUNTS) + +notEmpty = any(CLIQCOUNTS); +npops = length(find(notEmpty == 1)); +sumcliq=sum(CLIQCOUNTS, 2); +sumsep=sum(SEPCOUNTS, 2); +ncliq = size(CLIQCOUNTS, 1); +nsep = size(SEPCOUNTS, 1); + +cliqsizes = sum(CLIQCOUNTS, 2)'; +sepsizes = sum(SEPCOUNTS, 2)'; +cliqsizes = min([cliqsizes; npops*ones(1,ncliq)])'; +sepsizes = min([sepsizes; npops*ones(1,nsep)])'; + +klikkitn = sum(sum(gammaln(CLIQCOUNTS(:,notEmpty) + repmat(1./cliqsizes, [1 npops])))) ... + - sum(npops*(gammaln(1./cliqsizes))) ... + - sum(gammaln(sumcliq + 1)); + +septn = sum(sum(gammaln(SEPCOUNTS(:,notEmpty) + repmat(1./sepsizes, [1 npops])))) ... + - sum(npops*(gammaln(1./sepsizes))) ... + - sum(gammaln(sumsep + 1)); + + +%klikkitn = sum(sum(gammaln(CLIQCOUNTS + 1/npops))) ... +% - ncliq*npops*(gammaln(1/npops)) ... +% - sum(gammaln(sumcliq + 1)); +%septn = sum(sum(gammaln(SEPCOUNTS + 1/npops))) ... +% - nsep*npops*(gammaln(1/npops)) ... +% - sum(gammaln(sumsep + 1)); + +spatialPrior = (klikkitn - septn); + +%if spatialPrior > 0 +% keyboard +%end + +x = size(COUNTS,1); +y = size(COUNTS,2); +z = size(COUNTS,3); + +popLogml = ... + squeeze(sum(sum(reshape(... + gammaln(repmat(adjprior,[1 1 z]) + COUNTS) ... + ,[x y z]),1),2)) - sum(gammaln(1+SUMCOUNTS),2) - priorTerm; + +logml = sum(popLogml) + spatialPrior; +%-------------------------------------------------------------------------- + + +function initializeGammaln(ninds, rowsFromInd, maxSize) +%Alustaa GAMMALN muuttujan s.e. GAMMALN(i,j)=gammaln((i-1) + 1/j) +global GAMMA_LN; +GAMMA_LN = zeros((1+ninds)*rowsFromInd, maxSize); +for i=1:(ninds+1)*rowsFromInd + for j=1:maxSize + GAMMA_LN(i,j)=gammaln((i-1) + 1/j); + end +end + + +%---------------------------------------------------------------------------- + + +function dist2 = laskeOsaDist(inds2, dist, ninds) +% Muodostaa dist vektorista osavektorin, joka sisältää yksilöiden inds2 +% väliset etäisyydet. ninds=kaikkien yksilöiden lukumäär? + +ninds2 = length(inds2); +apu = zeros(nchoosek(ninds2,2),2); +rivi = 1; +for i=1:ninds2-1 + for j=i+1:ninds2 + apu(rivi, 1) = inds2(i); + apu(rivi, 2) = inds2(j); + rivi = rivi+1; + end +end +apu = (apu(:,1)-1).*ninds - apu(:,1) ./ 2 .* (apu(:,1)-1) + (apu(:,2)-apu(:,1)); +dist2 = dist(apu); + + +%---------------------------------------------------------------------------- + + + +function kunnossa = testaaGenePopData(tiedostonNimi) +% kunnossa == 0, jos data ei ole kelvollinen genePop data. +% Muussa tapauksessa kunnossa == 1. + +kunnossa = 0; +fid = fopen(tiedostonNimi); +line1 = fgetl(fid); %ensimmäinen rivi +line2 = fgetl(fid); %toinen rivi +line3 = fgetl(fid); %kolmas + +if (isequal(line1,-1) | isequal(line2,-1) | isequal(line3,-1)) + disp('Incorrect file format 1168'); fclose(fid); + return +end +if (testaaPop(line1)==1 || testaaPop(line2)==1) + disp('Incorrect file format 1172'); fclose(fid); + return +end +if testaaPop(line3)==1 + %2 rivi tällöin lokusrivi + nloci = rivinSisaltamienMjonojenLkm(line2); + line4 = fgetl(fid); + if isequal(line4,-1) + disp('Incorrect file format 1180'); fclose(fid); + return + end + if ~any(line4==',') + % Rivin nelj?täytyy sisältää pilkku. + disp('Incorrect file format 1185'); fclose(fid); + return + end + pointer = 1; + while ~isequal(line4(pointer),',') %Tiedetään, ett?pysähtyy + pointer = pointer+1; + end + line4 = line4(pointer+1:end); %pilkun jälkeinen osa + nloci2 = rivinSisaltamienMjonojenLkm(line4); + if (nloci2~=nloci) + disp('Incorrect file format 1195'); fclose(fid); + return + end +else + line = fgetl(fid); + lineNumb = 4; + while (testaaPop(line)~=1 & ~isequal(line,-1)) + line = fgetl(fid); + lineNumb = lineNumb+1; + end + if isequal(line,-1) + disp('Incorrect file format 1206'); fclose(fid); + return + end + nloci = lineNumb-2; + line4 = fgetl(fid); %Eka rivi pop sanan jälkeen + if isequal(line4,-1) + disp('Incorrect file format 1212'); fclose(fid); + return + end + if ~any(line4==',') + % Rivin täytyy sisältää pilkku. + disp('Incorrect file format 1217'); fclose(fid); + return + end + pointer = 1; + while ~isequal(line4(pointer),',') %Tiedetään, ett?pysähtyy. + pointer = pointer+1; + end + + line4 = line4(pointer+1:end); %pilkun jälkeinen osa + nloci2 = rivinSisaltamienMjonojenLkm(line4); + if (nloci2~=nloci) + disp('Incorrect file format 1228'); fclose(fid); + return + end +end +kunnossa = 1; +fclose(fid); + +%------------------------------------------------------ + + +function [data, popnames] = lueGenePopData(tiedostonNimi) + +fid = fopen(tiedostonNimi); +line = fgetl(fid); %ensimmäinen rivi +line = fgetl(fid); %toinen rivi +count = rivinSisaltamienMjonojenLkm(line); + +line = fgetl(fid); +lokusRiveja = 1; +while (testaaPop(line)==0) + lokusRiveja = lokusRiveja+1; + line = fgetl(fid); +end + +if lokusRiveja>1 + nloci = lokusRiveja; +else + nloci = count; +end + +popnames = cell(10,2); +data = zeros(100, nloci+1); +nimienLkm=0; +ninds=0; +poimiNimi=1; +digitFormat = -1; +while line ~= -1 + line = fgetl(fid); + + if poimiNimi==1 + %Edellinen rivi oli 'pop' + nimienLkm = nimienLkm+1; + ninds = ninds+1; + if nimienLkm>size(popnames,1); + popnames = [popnames; cell(10,2)]; + end + nimi = lueNimi(line); + if digitFormat == -1 + digitFormat = selvitaDigitFormat(line); + divider = 10^digitFormat; + end + popnames{nimienLkm, 1} = {nimi}; %Näin se on greedyMix:issäkin?!? + popnames{nimienLkm, 2} = ninds; + poimiNimi=0; + + data = addAlleles(data, ninds, line, divider); + + elseif testaaPop(line) + poimiNimi = 1; + + elseif line ~= -1 + ninds = ninds+1; + data = addAlleles(data, ninds, line, divider); + end +end + +data = data(1:ninds*2,:); +popnames = popnames(1:nimienLkm,:); +fclose(fid); + +npops = size(popnames,1); +ind = 1; +for pop = 1:npops + if pop0 + fid = fopen(outPutFile,'a'); +else + fid = -1; + diary(OUTPUT_FILE); % save in text anyway. +end + +dispLine; +disp('RESULTS OF GROUP LEVEL MIXTURE ANALYSIS:'); +disp(['Data file: ' inputFile]); +disp(['Number of clustered groups: ' ownNum2Str(ninds)]); +disp(['Number of clusters in optimal partition: ' ownNum2Str(npops)]); +disp(['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); +disp(' '); +if (fid ~= -1) + fprintf(fid,'%s \n', ['RESULTS OF GROUP LEVEL MIXTURE ANALYSIS:']); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Data file: ' inputFile]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of clustered groups: ' ownNum2Str(ninds)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Number of clusters in optimal partition: ' ownNum2Str(npops)]); fprintf(fid,'\n'); + fprintf(fid,'%s \n', ['Log(marginal likelihood) of optimal partition: ' ownNum2Str(logml)]); fprintf(fid,'\n'); +end + +cluster_count = length(unique(PARTITION)); +disp(['Best Partition: ']); +if (fid ~= -1) + fprintf(fid,'%s \n',['Best Partition: ']); fprintf(fid,'\n'); +end +for m=1:cluster_count + indsInM = find(PARTITION==m); + length_of_beginning = 11 + floor(log10(m)); + cluster_size = length(indsInM); + + if names + text = ['Cluster ' num2str(m) ': {' char(popnames{indsInM(1)})]; + for k = 2:cluster_size + text = [text ', ' char(popnames{indsInM(k)})]; + end; + else + text = ['Cluster ' num2str(m) ': {' num2str(indsInM(1))]; + for k = 2:cluster_size + text = [text ', ' num2str(indsInM(k))]; + end; + end + text = [text '}']; + while length(text)>58 + %Take one line and display it. + new_line = takeLine(text,58); + text = text(length(new_line)+1:end); + disp(new_line); + if (fid ~= -1) + fprintf(fid,'%s \n',[new_line]); + fprintf(fid,'\n'); + end + if length(text)>0 + text = [blanks(length_of_beginning) text]; + else + text = []; + end; + end; + if ~isempty(text) + disp(text); + if (fid ~= -1) + fprintf(fid,'%s \n',[text]); + fprintf(fid,'\n'); + end + end +end + +disp(' '); +disp(' '); +disp('Changes in log(marginal likelihood) if group i is moved to cluster j:'); +if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['Changes in log(marginal likelihood) if group i is moved to cluster j:']); fprintf(fid, '\n'); +end + +if names + nameSizes = zeros(ninds,1); + for i = 1:ninds + nimi = char(popnames{i}); + nameSizes(i) = length(nimi); + end + maxSize = max(nameSizes); + maxSize = max(maxSize, 5); + erotus = maxSize - 5; + alku = blanks(erotus); + ekarivi = [alku 'group' blanks(6+erotus)]; +else + ekarivi = 'group '; +end + +for i = 1:cluster_count + ekarivi = [ekarivi ownNum2Str(i) blanks(8-floor(log10(i)))]; +end +disp(ekarivi); +if (fid ~= -1) + fprintf(fid, '%s \n', [ekarivi]); fprintf(fid, '\n'); +end + +varmuus = zeros(ninds,1); +changesInLogml = LOGDIFF'; + +for ind = 1:ninds + %[muutokset, diffInCounts] = laskeMuutokset(ind, globalRows, data, ... + % adjprior, priorTerm, logml, cliques, separators); + %changesInLogml(:,ind) = muutokset; + muutokset = changesInLogml(:,ind); + if sum(exp(muutokset))>0 + varmuus(ind) = 1 - 1/sum(exp(muutokset)); + else + varmuus(ind) = 0; + end + if names + nimi = char(popnames{ind}); + rivi = [blanks(maxSize - length(nimi)) nimi ':']; + else + rivi = [blanks(4-floor(log10(ind))) ownNum2Str(ind) ':']; + end + for j = 1:npops + rivi = [rivi ' ' logml2String(omaRound(muutokset(j)))]; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [rivi]); fprintf(fid, '\n'); + end +end + +disp(' '); disp(' '); +disp('KL-divergence matrix:'); +dist_mat = zeros(npops, npops); +if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); %fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['KL-divergence matrix in PHYLIP format:']); %fprintf(fid, '\n'); +end + +maxnoalle = size(COUNTS,1); +nloci = size(COUNTS,2); +d = zeros(maxnoalle, nloci, npops); +prior = adjprior; +prior(find(prior==1))=0; +nollia = find(all(prior==0)); %Lokukset, joissa oli havaittu vain yht?alleelia. +prior(1,nollia)=1; +for pop1 = 1:npops + d(:,:,pop1) = (squeeze(COUNTS(:,:,pop1))+prior) ./ repmat(sum(squeeze(COUNTS(:,:,pop1))+prior),maxnoalle,1); + %dist1(pop1) = (squeeze(COUNTS(:,:,pop1))+adjprior) ./ repmat((SUMCOUNTS(pop1,:)+adjprior), maxnoalle, 1); +end +% ekarivi = blanks(7); +% for pop = 1:npops +% ekarivi = [ekarivi num2str(pop) blanks(7-floor(log10(pop)))]; +% end +ekarivi = num2str(npops); +disp(ekarivi); +if (fid ~= -1) + fprintf(fid, '%s \n', [ekarivi]); %fprintf(fid, '\n'); +end + +for pop1 = 1:npops + rivi = [blanks(2-floor(log10(pop1))) num2str(pop1) ' ']; + for pop2 = 1:pop1-1 + dist1 = d(:,:,pop1); dist2 = d(:,:,pop2); + div12 = sum(sum(dist1.*log2((dist1+10^-10) ./ (dist2+10^-10))))/nloci; + div21 = sum(sum(dist2.*log2((dist2+10^-10) ./ (dist1+10^-10))))/nloci; + div = (div12+div21)/2; + % rivi = [rivi kldiv2str(div) ' ']; + dist_mat(pop1,pop2) = div; + end +% disp(rivi); +% if (fid ~= -1) +% fprintf(fid, '%s \n', [rivi]); fprintf(fid, '\n'); +% end +end + +dist_mat = dist_mat + dist_mat'; % make it symmetric +for pop1 = 1:npops + rivi = ['Cluster_' num2str(pop1) ' ']; + for pop2 = 1:npops + rivi = [rivi kldiv2str(dist_mat(pop1,pop2)) ' ']; + end + disp(rivi); + if (fid ~= -1) + fprintf(fid, '%s \n', [rivi]); %fprintf(fid, '\n'); + end +end + +disp(' '); +disp(' '); +disp('List of sizes of 10 best visited partitions and corresponding log(ml) values'); + +if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['List of sizes of 10 best visited partitions and corresponding log(ml) values']); fprintf(fid, '\n'); +end + +partitionSummaryKaikki = partitionSummary; +partitionSummary =[]; +for i=1:size(partitionSummaryKaikki,3) + partitionSummary = [partitionSummary; partitionSummaryKaikki(:,:,i)]; +end +[I,J] = find(partitionSummaryKaikki(:,2,:)>-1e49); +partitionSummaryKaikki = partitionSummaryKaikki(I,:,:); +%keyboard + + +partitionSummary = sortrows(partitionSummary,2); +partitionSummary = partitionSummary(size(partitionSummary,1):-1:1 , :); +partitionSummary = partitionSummary(find(partitionSummary(:,2)>-1e49),:); +if size(partitionSummary,1)>10 + vikaPartitio = 10; +else + vikaPartitio = size(partitionSummary,1); +end +for part = 1:vikaPartitio + line = [num2str(partitionSummary(part,1)) ' ' num2str(partitionSummary(part,2))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', [line]); fprintf(fid, '\n'); + end +end + +if ~fixedK + + disp(' '); + disp(' '); + disp('Probabilities for number of clusters'); + + if (fid ~= -1) + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', [' ']); fprintf(fid, '\n'); + fprintf(fid, '%s \n', ['Probabilities for number of clusters']); fprintf(fid, '\n'); + end + + npopsTaulu = unique(partitionSummary(:,1)); + len = length(npopsTaulu); + probs = zeros(len,1); + partitionSummary(:,2) = partitionSummary(:,2)-max(partitionSummary(:,2)); + sumtn = sum(exp(partitionSummary(:,2))); + for i=1:len + npopstn = sum(exp(partitionSummary(find(partitionSummary(:,1)==npopsTaulu(i)),2))); + probs(i) = npopstn / sumtn; + end + for i=1:len + if probs(i)>1e-5 + line = [num2str(npopsTaulu(i)) ' ' num2str(probs(i))]; + disp(line); + if (fid ~= -1) + fprintf(fid, '%s \n', [line]); fprintf(fid, '\n'); + end + end + end +end + +if (fid ~= -1) + fclose(fid); +else + diary off +end + +%--------------------------------------------------------------- + + +function dispLine; +disp('---------------------------------------------------'); + +%-------------------------------------------------------------------- + + +function newline = takeLine(description,width) +%Returns one line from the description: line ends to the first +%space after width:th mark. +newLine = description(1:width); +n = width+1; +while ~isspace(description(n)) & n=0 + digit = rem(num, 10^(yks+1)); + digit = floor(digit/(10^yks)); +else + digit = num*10; + digit = floor(rem(digit,10)); +end +digit = num2str(digit); + + +function mjono = kldiv2str(div) +mjono = ' '; +if abs(div)<100 + %Ei tarvita e-muotoa + mjono(6) = num2str(rem(floor(div*1000),10)); + mjono(5) = num2str(rem(floor(div*100),10)); + mjono(4) = num2str(rem(floor(div*10),10)); + mjono(3) = '.'; + mjono(2) = num2str(rem(floor(div),10)); + arvo = rem(floor(div/10),10); + if arvo>0 + mjono(1) = num2str(arvo); + end + +else + suurinYks = floor(log10(div)); + mjono(6) = num2str(suurinYks); + mjono(5) = 'e'; + mjono(4) = palautaYks(abs(div),suurinYks-1); + mjono(3) = '.'; + mjono(2) = palautaYks(abs(div),suurinYks); +end + + +%-------------------------------------------------------------------------- + + +function ninds = testaaOnkoKunnollinenBapsData(data) +%Tarkastaa onko viimeisess?sarakkeessa kaikki +%luvut 1,2,...,n johonkin n:ään asti. +%Tarkastaa lisäksi, ett?on vähintään 2 saraketta. +if size(data,1)<2 + ninds = 0; return; +end +lastCol = data(:,end); +ninds = max(lastCol); +if ~isequal((1:ninds)',unique(lastCol)) + ninds = 0; return; +end + + +%-------------------------------------------------------------------------- + + +function [ninds, data, heds] = testFastaData(inFile) +% added by Lu Cheng, 05.12.2012 +if ~exist(inFile,'file') + error('Fasta file %s does not exist!\n',inFile); +end + +[heds, seqs]=fastaread(inFile); +ninds = length(seqs); + +data = cell2mat(seqs(:)); +newData = ones(size(data))*-9; +newData(ismember(data,'Aa'))=1; +newData(ismember(data,'Cc'))=2; +newData(ismember(data,'Gg'))=3; +newData(ismember(data,'Tt'))=4; +data = [newData (1:ninds)']; + + +%-------------------------------------------------------------------------- + +function [cliques, separators, vorPoints, vorCells, pointers] ... + = handleCoords(coordinates) +%Laskee yksilöiden luonnolliset naapurit koordinaateista. +%Naapurit lasketaan lisäämäll?koordinaatteihin pisteit? +%jotta kutakin yksilöä vastaisi rajoitettu voronoi-solu +%Puuttuvat koordinaatit (negatiiviset) tulevat erakkopisteiksi +% +%Määrittää lisäksi yksilöit?vastaavat voronoi tesselaation solut. +%vorPoints:ssa on solujen kulmapisteet ja vorCells:ss?kunkin solun +%kulmapisteiden indeksit. Pointers{i} sisältää solussa i olevien yksilöiden +%indeksit. + + + +ninds = length(coordinates); +[I,J] = find(coordinates>0 | coordinates <0); %Käsitellään vain yksilöit? joilta koordinaatit +I = unique(I); %olemassa +ncoords = length(I); +puuttuvat = setdiff(1:ninds, I); +new_coordinates = addPoints(coordinates(I,:)); %Ympäröidään yksilöt apupisteill? + + +apuData = [new_coordinates(1:ncoords,:) (1:ncoords)']; +apuData = sortrows(apuData,[1 2]); +erot = [diff(apuData(:,1)) diff(apuData(:,2))]; +empties = find(erot(:,1)==0 & erot(:,2)==0); +samat = cell(length(empties),1); +pointer = 0; + +for i = 1:length(empties) + if i == 1 | empties(i) - empties(i-1) > 1 %Tutkitaan onko eri pisteess?kuin edellinen + pointer = pointer+1; + samat{pointer} = [apuData(empties(i),3) apuData(empties(i)+1,3)]; + else + samat{pointer} = [samat{pointer} apuData(empties(i)+1,3)]; + end +end + +samat = samat(1:pointer); + +erot = []; apuData = []; empties = []; + +%tri = delaunay(new_coordinates(:,1), new_coordinates(:,2), {'Qt','Qbb','Qc','Qz'}); %Apupisteiden takia ok. +tri = delaunay(new_coordinates(:,1), new_coordinates(:,2)); +%[rivi,sarake] = find(tri>ncoords); %Jätetään huomiotta apupisteet +%tri(rivi,:) = []; +pituus = tri(:,1); +pituus = length(pituus); +parit = zeros(6*pituus,2); +for i = 1:pituus %Muodostetaan kolmikoista parit + j = 6*(i-1)+1; + parit(j,:) = tri(i,1:2); + parit(j+1,:) = tri(i,1:2:3); + parit(j+2,:) = tri(i,2:3); + parit(j+3:j+5,:) = [parit(j:j+2,2) parit(j:j+2,1)]; +end +parit = unique(parit,'rows'); +[rivi,sarake] = find(parit>ncoords); %Jätetään huomiotta apupisteet +parit(rivi,:) = []; +parit = I(parit); %Otetaan poistetut takaisin mukaan +graph = sparse(parit(:,1),parit(:,2),1, ninds, ninds); + + +%Kopioidaan samassa pisteess?olevien yksilöiden naapurustot +%silt? jolle ne laitettu. + + for i = 1:length(samat); + taulu = I(samat{i}); + [rivi,sarake] = find(graph(taulu,:)>0); + if length(rivi) > 0 + kopioitava = graph(taulu(rivi(1)),:); + for j = 1:length(taulu); + graph(taulu(j),:) = kopioitava; + graph(:,taulu(j)) = kopioitava'; + end + end + end + + %Asetetaan samassa pisteess?olevat yksilöt toistensa naapureiksi + + for i = 1:length(samat) + for j = I(samat{i}) + for k = I(samat{i}) + if k ~= j + graph(j,k) = 1; + end + end + end + end + +%Laskee maksimin klikkien ja separaattorien koolle +%Määritetään myös klikit ja separaattorit + +[ncliq, nsep, cliq, sep] = laskeKlikit(graph, ninds, ninds); + +sumcliq = sum(ncliq); +sumsep = sum(nsep); +maxCliqSize = max(find(sumcliq > 0)); +maxSepSize = max(find(sumsep > 0)); + +cliques = zeros(length(cliq), maxCliqSize); +separators = zeros(length(sep), maxSepSize); + +nollia = zeros(1, length(cliq)); +for i = 1:length(cliq); + klikki = cliq{i}; + if length(klikki)>1 + cliques(i, 1:length(klikki)) = klikki; + else + nollia(i)=1; + end +end +cliques(find(nollia==1), :) = []; + +for i = 1:length(sep); + klikki = sep{i}; + separators(i, 1:length(klikki)) = klikki; +end + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%Määritetään yksilöit?vastaavat voronoi tesselaation solut + +[vorPoints, vorCells] = voronoin(new_coordinates, {'Qbb', 'Qz'}); + +bounded = ones(length(vorCells),1); +for i=1:length(vorCells) + if isempty(vorCells{i}) || length(find(vorCells{i}==1))>0 + bounded(i)=0; + end +end + + + +vorCells = vorCells(find(bounded == 1)); +pointers = cell(length(vorCells),1); +empties = zeros(1,length(vorCells)); +X = coordinates(:,1); +Y = coordinates(:,2); + +for i=1:length(pointers) + vx = vorPoints(vorCells{i},1); + vy = vorPoints(vorCells{i},2); + IN = inpolygon(X,Y,vx,vy); + if any(IN)==0 + empties(i) = 1; + else + pointers{i} = find(IN ==1)'; + end +end + +vorCells = vorCells(find(empties == 0)); +pointers = pointers(find(empties == 0)); + +%-------------------------------------------------------------------------- + +function [ncliques, nseparators, cliques, separators] = ... + laskeKlikit(M, maxCliqSize,maxSepSize) +%Laskee samankokoisten klikkien määrän verkosta M +%ncliques(i)=kokoa i olevien klikkien määr? +%nseparators vastaavasti + +ncliques=zeros(1,maxCliqSize); +nseparators=zeros(1,maxSepSize); + +if isequal(M,[]) + return; +end + +[cliques,separators]=findCliques(M); + +for i=1:length(cliques) + ncliques(length(cliques{i}))=ncliques(length(cliques{i}))+1; +end + +%cliqmax=max(find(ncliques~=0)); +%ncliques=ncliques(1:cliqmax); + +for i=1:length(separators) + nseparators(length(separators{i}))=nseparators(length(separators{i}))+1; +end + +%sepmax=max(find(nseparators~=0)); +%nseparators=nseparators(1:sepmax); + +%-------------------------------------------------------------------------- + +function C = mysetdiff(A,B) +% MYSETDIFF Set difference of two sets of positive integers (much faster than built-in setdiff) +% C = mysetdiff(A,B) +% C = A \ B = { things in A that are not in B } +% +% Original by Kevin Murphy, modified by Leon Peshkin + +if isempty(A) + C = []; + return; +elseif isempty(B) + C = A; + return; +else % both non-empty + bits = zeros(1, max(max(A), max(B))); + bits(A) = 1; + bits(B) = 0; + C = A(logical(bits(A))); +end + + +%-------------------------------------------------------------------------- + +function logml = checkLogml(priorTerm, adjprior, cliques, separators) +% tarkistaa logml:n + +global CLIQCOUNTS; +global SEPCOUNTS; +global PARTITION; + +npops = length(unique(PARTITION)); +[cliqcounts, sepcounts] = computeCounts(cliques, separators, npops); + +CLIQCOUNTS = cliqcounts; +SEPCOUNTS = sepcounts; + + +[logml, spatialPrior] = computeLogml(adjprior, priorTerm); + +disp(['logml: ' logml2String(logml) ', spatial prior: ' logml2String(spatialPrior)]); + +%-------------------------------------------------------------------------- + +function [emptyPop, pops] = findEmptyPop(npops) +% Palauttaa ensimmäisen tyhjän populaation indeksin. Jos tyhji? +% populaatioita ei ole, palauttaa -1:n. + +global PARTITION; + +pops = unique(PARTITION)'; +if (length(pops) ==npops) + emptyPop = -1; +else + popDiff = diff([0 pops npops+1]); + emptyPop = min(find(popDiff > 1)); +end + +%-------------------------------------------------------------------------- + +% function viallinen = testaaKoordinaatit(ninds, coordinates) +% % Testaa onko koordinaatit kunnollisia. +% +% viallinen = 1; +% if ~isnumeric(coordinates) +% return +% end +% +% oikeanKokoinen = (size(coordinates,1) == ninds) & (size(coordinates,2) == 2); +% if oikeanKokoinen +% viallinen = 0; +% end + +function [viallinen coordinates] = testaaKoordinaatit(ninds, coordinates) +% Testaa onko koordinaatit kunnollisia. +% modified by Lu Cheng, 05.12.2012 + +viallinen = 1; +if ~isnumeric(coordinates) + warning('Coordinates are not numerical!'); + return; +end + +oikeanKokoinen = (size(coordinates,1) == ninds) & (size(coordinates,2) == 2); +if ~oikeanKokoinen + warning('Wrong coordinates dimension!'); + return; +end + +posstr = cellfun(@(x) sprintf('%.10f',x),num2cell(coordinates),'UniformOutput',false); +posstr = cellfun(@(x) regexprep(x,'0+$',''),posstr,'UniformOutput',false); + +uni1 = unique(posstr(:,1)); +uni2 = unique(posstr(:,2)); +posstr_new = posstr; + +if length(uni1)==ninds && length(uni2)==ninds + viallinen = 0; + return; +else + ans = questdlg('Input coordinates are not unique. Do you want to make them unique?','coordinates NOT unique', 'Yes','No','Yes'); + if strcmp(ans,'No') + warning('Coordinates are not unique!'); + return; + end +end + +for i=1:length(uni1) + tmpinds = find(ismember(posstr(:,1),uni1(i))); + tmpNinds = length(tmpinds); + + if tmpNinds==1 + continue; + end + + assert(tmpNinds<100); + tmparr = round(linspace(0,99,tmpNinds)); + tmparr = tmparr(randperm(tmpNinds)); + + for j=1:tmpNinds + posstr_new{tmpinds(j),1}=sprintf('%s%02d',posstr{tmpinds(j),1},tmparr(j)); + end +end + +for i=1:length(uni2) + tmpinds = find(ismember(posstr(:,2),uni2(i))); + tmpNinds = length(tmpinds); + + if tmpNinds==1 + continue; + end + + assert(tmpNinds<100); + tmparr = round(linspace(0,99,tmpNinds)); + tmparr = tmparr(randperm(tmpNinds)); + + for j=1:tmpNinds + posstr_new{tmpinds(j),2}=sprintf('%s%02d',posstr{tmpinds(j),2},tmparr(j)); + end +end + +coordinates = cellfun(@str2double,posstr_new); +uni1 = unique(coordinates(:,1)); +uni2 = unique(coordinates(:,2)); +if length(uni1)==ninds && length(uni2)==ninds + viallinen = 0; +else + warning('Can not make coordinates unique!'); +end + +%-------------------------------------------------------------------------- + + +function [sumcounts, counts] = ... + initialCounts(partition, data, npops, noalle) + +nloci=size(data,2); +% ninds = size(rows, 1); + +%koot = rows(:,1) - rows(:,2) + 1; +%maxSize = max(koot); + +counts = zeros(max(noalle),nloci,npops); +sumcounts = zeros(npops,nloci); +for i=1:npops + for j=1:nloci + havainnotLokuksessa = find(partition==i & data(:,j)>=0); + sumcounts(i,j) = length(havainnotLokuksessa); + for k=1:noalle(j) + alleleCode = k; + N_ijk = length(find(data(havainnotLokuksessa,j)==alleleCode)); + counts(k,j,i) = N_ijk; + end + end +end + + +%-------------------------------------------------------------------------- + +function [popnames2, rowsFromInd] = findOutRowsFromInd(popnames, rows) + +ploidisuus = questdlg('Specify the type of individuals in the data: ',... + 'Individual type?', 'Haploid', 'Diploid', 'Tetraploid', ... + 'Diploid'); + +switch ploidisuus +case 'Haploid' + rowsFromInd = 1; +case 'Diploid' + rowsFromInd = 2; +case 'Tetraploid' + rowsFromInd = 4; +end + +if ~isempty(popnames) + for i = 1:size(rows,1) + popnames2{i,1} = popnames{i,1}; + rivi = rows(i,1):rows(i,2); + popnames2{i,2} = (rivi(rowsFromInd))/rowsFromInd; + end +else + popnames2 = []; +end + +%-------------------------------------------------------------------------- + +function fiksaaPartitioYksiloTasolle(rows, rowsFromInd) + +global PARTITION; +totalRows = 0; +for ind = 1:size(rows,1) + totalRows = totalRows + (rows(ind,2)-rows(ind,1)+1); +end +partitio2 = zeros(totalRows/rowsFromInd,1); + +for ind = 1:size(rows,1) + kaikkiRivit = rows(ind,1):rows(ind,2); + for riviNumero = rowsFromInd:rowsFromInd:length(kaikkiRivit) + %for riviNumero = rowsFromInd:rowsFromInd:length(rows{ind}) + %rivi = rows{ind}(riviNumero); + rivi = kaikkiRivit(riviNumero); + partitio2(rivi/rowsFromInd) = PARTITION(ind); + end +end +PARTITION = partitio2;