<?xml version="1.0" encoding="UTF-8"?>

<rdf:RDF
   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
   xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
   xmlns="http://purl.org/rss/1.0/"
   xmlns:dc="http://purl.org/dc/elements/1.1/"
   xmlns:prism="http://prismstandard.org/namespaces/1.2/basic/"
   xmlns:dcterms="http://purl.org/dc/terms/"

>
<channel rdf:about="http://www.citeulike.org/about">
    <!-- Feed-level metadata for the CiteULike "methods" tag of user stajich. -->
    <!-- NOTE(review): pubDate is not one of the channel elements defined by RSS 1.0,
         and "BST" is not an RFC 822 zone name. It is kept verbatim so existing
         consumers see the same value; a dc:date in W3CDTF form would be the
         conventional RSS 1.0 way to carry this timestamp. -->
    <pubDate>Thu, 21 Aug 2008 14:14:51 BST</pubDate>
    <title>CiteULike: stajich's methods</title>
    <description>CiteULike: stajich's methods</description>
    <link>http://www.citeulike.org/user/stajich/tag/methods</link>
    <dc:publisher>CiteULike.org</dc:publisher>
    <dc:language>en-gb</dc:language>
    <dc:rights>Copyright &#169; 2004-2008 citeulike.org</dc:rights>
    <!-- Table of contents: each rdf:resource below is expected to equal the
         rdf:about of an item element that follows the channel. -->
    <items>
        <rdf:Seq>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/1220969"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/3063774"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2945159"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/3042950"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2824834"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2994231"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2996042"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/3006093"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2961447"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2968186"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2966909"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2894296"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2890233"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2890226"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2883810"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2800017"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2800014"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2768977"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2781969"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2739713"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2640090"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2410420"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2341756"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/967505"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/236100"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/408246"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2607945"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2600818"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2534847"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2230351"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2060806"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/2017324"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/1928496"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/1939394"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/1702751"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/1853098"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/614084"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/1341319"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/177005"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/1115595"/>
            <rdf:li rdf:resource="http://www.citeulike.org/user/stajich/article/1269642"/>
        </rdf:Seq>
    </items>
</channel>


<!-- First entry of the channel's rdf:Seq. The description holds entity-escaped HTML
     (italicised citation, two line breaks, then the abstract).
     NOTE(review): dc:date (2007-04-11) differs from the publication date given in
     dc:source (10 April 2007); presumably it is the bookmark timestamp. Confirm. -->
<item rdf:about="http://www.citeulike.org/user/stajich/article/1220969">
    <title>BranchClust: A phylogenetic algorithm for selecting gene families</title>
    <link>http://www.citeulike.org/user/stajich/article/1220969</link>
    <description>&lt;i&gt;BMC Bioinformatics, Vol. 8 (10 April 2007), 120.&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Background Automated methods for assembling families of orthologous genes include those based on sequence similarity scores and those based on phylogenetic approaches. The first are easy to automate but usually they do not distinguish between paralogs and orthologs or have restriction on the number of taxa. Phylogenetic methods often are based on reconciliation of a gene tree with a known rooted species tree; a limitation of this approach, especially in case of prokaryotes, is that the species tree is often unknown, and that from the analyses of single gene families the branching order between related organisms frequently is unresolved. Results Here we describe an algorithm for the automated selection of orthologous genes that recognizes orthologous genes from different species in a phylogenetic tree for any number of taxa. The algorithm is capable of distinguishing complete (containing all taxa) and incomplete (not containing all taxa) families and recognizes in- and outparalogs. The BranchClust algorithm is implemented in Perl with the use of the BioPerl module for parsing trees and is freely available at http://bioinformatics.org/branchclust webcite. Conclusion BranchClust outperforms the Reciprocal Best Blast hit method in selecting more sets of putatively orthologous genes. In the test cases examined, the correctness of the selected families and of the identified in- and outparalogs was confirmed by inspection of the pertinent phylogenetic trees.</description>
    <dc:title>BranchClust: A phylogenetic algorithm for selecting gene families</dc:title>

    <dc:creator>Maria Poptsova</dc:creator>
    <dc:creator>Peter Gogarten</dc:creator>
    <dc:identifier>doi:10.1186/1471-2105-8-120</dc:identifier>
    <dc:source>BMC Bioinformatics, Vol. 8 (10 April 2007), 120.</dc:source>
    <dc:date>2007-04-11T16:12:07-00:00</dc:date>
    <prism:publicationYear>2007</prism:publicationYear>
    <prism:publicationName>BMC Bioinformatics</prism:publicationName>
    <prism:issn>1471-2105</prism:issn>
    <prism:volume>8</prism:volume>
    <prism:startingPage>120</prism:startingPage>
    <prism:category>bioinformatics</prism:category>
    <prism:category>gene_tree</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>orthology</prism:category>
    <prism:category>paralogs</prism:category>
    <prism:category>phylogenetics</prism:category>
    <prism:category>phylogenomics</prism:category>
</item>



<!-- Second entry of the channel's rdf:Seq. This record has no dc:identifier and no
     prism volume, number or page elements; dc:source carries only the journal
     name and a date. -->
<item rdf:about="http://www.citeulike.org/user/stajich/article/3063774">
    <title>Memory-efficient dynamic programming backtrace and pairwise local sequence alignment.</title>
    <link>http://www.citeulike.org/user/stajich/article/3063774</link>
    <description>&lt;i&gt;Bioinformatics (Oxford, England) (17 June 2008)&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;MOTIVATION: A backtrace through a dynamic programming algorithm's intermediate results in search of an optimal path, or to sample paths according to an implied probability distribution, or as the second stage of a forward-backward algorithm, is a task of fundamental importance in computational biology. When there is insufficient space to store all intermediate results in high-speed memory (e.g., cache) existing approaches store selected stages of the computation, and recompute missing values from these checkpoints on an as-needed basis. RESULTS: Here we present an optimal checkpointing strategy, and demonstrate its utility with pairwise local sequence alignment of sequences of length 10,000. AVAILABILITY: Sample C++-code for optimal backtrace is available in the Supplementary Materials. CONTACT: leen@cs.rpi.edu.</description>
    <dc:title>Memory-efficient dynamic programming backtrace and pairwise local sequence alignment.</dc:title>

    <dc:creator>Lee A Newberg</dc:creator>
    <dc:source>Bioinformatics (Oxford, England) (17 June 2008)</dc:source>
    <dc:date>2008-07-30T16:47:20-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>Bioinformatics (Oxford, England)</prism:publicationName>
    <prism:issn>1460-2059</prism:issn>
    <prism:category>alignment</prism:category>
    <prism:category>bioinformatics</prism:category>
    <prism:category>computational_biology</prism:category>
    <prism:category>methods</prism:category>
</item>



<!-- Third entry of the channel's rdf:Seq. The description text ends with the bare
     DOI string "10.1093/nar/gkn288", which is also present (with a "doi:" prefix)
     in dc:identifier. No prism:issn is given for this record. -->
<item rdf:about="http://www.citeulike.org/user/stajich/article/2945159">
    <title>SCUMBLE: a method for systematic and accurate detection of codon usage bias by maximum likelihood estimation</title>
    <link>http://www.citeulike.org/user/stajich/article/2945159</link>
    <description>&lt;i&gt;Nucl. Acids Res., Vol. 36, No. 11. (1 June 2008), pp. 3819-3827.&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;The genetic code is degenerate--most amino acids can be encoded by from two to as many as six different codons. The synonymous codons are not used with equal frequency: not only are some codons favored over others, but also their usage can vary significantly from species to species and between different genes in the same organism. Known causes of codon bias include differences in mutation rates as well as selection pressure related to the expression level of a gene, but the standard analysis methods can account for only a fraction of the observed codon usage variation. We here introduce an explicit model of codon usage bias, inspired by statistical physics. Combining this model with a maximum likelihood approach, we are able to clearly identify different sources of bias in various genomes. We have applied the algorithm to Saccharomyces cerevisiae as well as 325 prokaryote genomes, and in most cases our model explains essentially all observed variance. 10.1093/nar/gkn288</description>
    <dc:title>SCUMBLE: a method for systematic and accurate detection of codon usage bias by maximum likelihood estimation</dc:title>

    <dc:creator>Morten Kloster</dc:creator>
    <dc:creator>Chao Tang</dc:creator>
    <dc:identifier>doi:10.1093/nar/gkn288</dc:identifier>
    <dc:source>Nucl. Acids Res., Vol. 36, No. 11. (1 June 2008), pp. 3819-3827.</dc:source>
    <dc:date>2008-06-30T15:02:22-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>Nucl. Acids Res.</prism:publicationName>
    <prism:volume>36</prism:volume>
    <prism:number>11</prism:number>
    <prism:startingPage>3819</prism:startingPage>
    <prism:endingPage>3827</prism:endingPage>
    <prism:category>bioinformatics</prism:category>
    <prism:category>codon_bias</prism:category>
    <prism:category>methods</prism:category>
</item>



<!-- Fourth entry of the channel's rdf:Seq.
     NOTE(review): the abstract's section labels (BACKGROUND:, RESULTS:, CONCLUSIONS:)
     abut the neighbouring text without spaces; presumably inherited from the
     upstream abstract, so the text is left exactly as recorded. -->
<item rdf:about="http://www.citeulike.org/user/stajich/article/3042950">
    <title>GBParsy: A GenBank flatfile parser library with high speed</title>
    <link>http://www.citeulike.org/user/stajich/article/3042950</link>
    <description>&lt;i&gt;BMC Bioinformatics, Vol. 9, No. 1. (2008)&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;BACKGROUND:GenBank flatfile (GBF) format is one of the most popular sequence file formats because of its detailed sequence features and ease of readability. To use the data in the file by a computer, a parsing process is required and is performed according to a given grammar for the sequence and the description in a GBF. Currently, several parser libraries for the GBF have been developed. However, with the accumulation of DNA sequence information from eukaryotic chromosomes, parsing a eukaryotic genome sequence with these libraries inevitably takes a long time, due to the large GBF file and its correspondingly large genomic nucleotide sequence and related feature information. Thus, there is significant need to develop a parsing program with high speed and efficient use of system memory.RESULTS:We developed a library, GBParsy, which was C language-based and parses GBF files. The parsing speed was maximized by using content-specified functions in place of regular expressions that are flexible but slow. In addition, we optimized an algorithm related to memory usage so that it also increased parsing performance and efficiency of memory usage. GBParsy is at least 5 - 100X faster than current parsers in benchmark tests.CONCLUSIONS:GBParsy is estimated to extract annotated information from almost 100 Mb of a GenBank flatfile for chromosomal sequence information within a second. Thus, it should be used for a variety of applications such as on-time visualization of a genome at a web site.</description>
    <dc:title>GBParsy: A GenBank flatfile parser library with high speed</dc:title>

    <dc:creator>Tae Lee</dc:creator>
    <dc:creator>Yeon Kim</dc:creator>
    <dc:creator>Baek Nahm</dc:creator>
    <dc:identifier>doi:10.1186/1471-2105-9-321</dc:identifier>
    <dc:source>BMC Bioinformatics, Vol. 9, No. 1. (2008)</dc:source>
    <dc:date>2008-07-25T16:03:26-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>BMC Bioinformatics</prism:publicationName>
    <prism:volume>9</prism:volume>
    <prism:number>1</prism:number>
    <prism:category>bioinformatics</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>parser</prism:category>
</item>



<!-- Fifth entry of the channel's rdf:Seq. No page elements are given; dc:source
     lists only volume, number and year.
     NOTE(review): "pathways.CONCLUSIONS:Our" in the abstract has no separating
     spaces; presumably inherited from the upstream text, so it is left as recorded. -->
<item rdf:about="http://www.citeulike.org/user/stajich/article/2824834">
    <title>Trait-trait dynamic interaction: 2D-trait eQTL mapping for genetic variation study</title>
    <link>http://www.citeulike.org/user/stajich/article/2824834</link>
    <description>&lt;i&gt;BMC Genomics, Vol. 9, No. 1. (2008)&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;BACKGROUND:Many studies have shown that the abundance level of gene expression is heritable. Analogous to the traditional genetic study, most researchers treat the expression of one gene as a quantitative trait and map it to expression quantitative trait loci (eQTL). This is 1D-trait mapping. 1D-trait mapping ignores the trait-trait interaction completely, which is a major shortcoming. RESULTS:To overcome this limitation, we study the expression of a pair of genes and treat the variation in their co-expression pattern as a two dimensional quantitative trait. We develop a method to find gene pairs, whose co-expression patterns, including both signs and strengths, are mediated by genetic variations and map these 2D-traits to the corresponding genetic loci. We report several applications by combining 1D-trait mapping with 2D-trait mapping, including the contribution of genetic variations to the perturbations in the regulatory mechanisms of yeast metabolic pathways.CONCLUSIONS:Our approach of 2D-trait mapping provides a novel and effective way to connect genetic variations with higher order biological modules via gene expression profiles.</description>
    <dc:title>Trait-trait dynamic interaction: 2D-trait eQTL mapping for genetic variation study</dc:title>

    <dc:creator>Wei Sun</dc:creator>
    <dc:creator>Shinsheng Yuan</dc:creator>
    <dc:creator>Ker Li</dc:creator>
    <dc:identifier>doi:10.1186/1471-2164-9-242</dc:identifier>
    <dc:source>BMC Genomics, Vol. 9, No. 1. (2008)</dc:source>
    <dc:date>2008-05-23T09:50:07-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>BMC Genomics</prism:publicationName>
    <prism:volume>9</prism:volume>
    <prism:number>1</prism:number>
    <prism:category>eqtl</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>qtl</prism:category>
    <prism:category>quantitative_genetics</prism:category>
</item>



<!-- Sixth entry of the channel's rdf:Seq. prism:startingPage holds the non-numeric
     article id "e1000108", so consumers should not parse it as an integer.
     The first dc:creator contains a literal non-ASCII character, which relies on
     the UTF-8 encoding named in the XML declaration. -->
<item rdf:about="http://www.citeulike.org/user/stajich/article/2994231">
    <title>Unraveling Protein Networks with Power Graph Analysis</title>
    <link>http://www.citeulike.org/user/stajich/article/2994231</link>
    <description>&lt;i&gt;PLoS Comput Biol, Vol. 4, No. 7. (11 July 2008), e1000108.&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Networks play a crucial role in computational biology, yet their analysis and representation is still an open problem. Power Graph Analysis is a lossless transformation of biological networks into a compact, less redundant representation, exploiting the abundance of cliques and bicliques as elementary topological motifs. We demonstrate with five examples the advantages of Power Graph Analysis. Investigating protein-protein interaction networks, we show how the catalytic subunits of the casein kinase II complex are distinguishable from the regulatory subunits, how interaction profiles and sequence phylogeny of SH3 domains correlate, and how false positive interactions among high-throughput interactions are spotted. Additionally, we demonstrate the generality of Power Graph Analysis by applying it to two other types of networks. We show how power graphs induce a clustering of both transcription factors and target genes in bipartite transcription networks, and how the erosion of a phosphatase domain in type 22 non-receptor tyrosine phosphatases is detected. We apply Power Graph Analysis to high-throughput protein interaction networks and show that up to 85% (56% on average) of the information is redundant. Experimental networks are more compressible than rewired ones of same degree distribution, indicating that experimental networks are rich in cliques and bicliques. Power Graphs are a novel representation of networks, which reduces network complexity by explicitly representing re-occurring network motifs. Power Graphs compress up to 85% of the edges in protein interaction networks and are applicable to all types of networks such as protein interactions, regulatory networks, or homology networks.</description>
    <dc:title>Unraveling Protein Networks with Power Graph Analysis</dc:title>

    <dc:creator>Loïc Royer</dc:creator>
    <dc:creator>Matthias Reimann</dc:creator>
    <dc:creator>Bill Andreopoulos</dc:creator>
    <dc:creator>Michael Schroeder</dc:creator>
    <dc:identifier>doi:10.1371/journal.pcbi.1000108</dc:identifier>
    <dc:source>PLoS Comput Biol, Vol. 4, No. 7. (11 July 2008), e1000108.</dc:source>
    <dc:date>2008-07-11T23:03:28-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>PLoS Comput Biol</prism:publicationName>
    <prism:volume>4</prism:volume>
    <prism:number>7</prism:number>
    <prism:startingPage>e1000108</prism:startingPage>
    <prism:publisher>Public Library of Science</prism:publisher>
    <prism:category>bioinformatics</prism:category>
    <prism:category>graphics</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>protein</prism:category>
    <prism:category>protein_interactions</prism:category>
    <prism:category>visualization</prism:category>
</item>



<!-- Seventh entry of the channel's rdf:Seq. The description contains only the
     italicised citation; no abstract text follows it, so consumers must not
     assume an abstract is always present. -->
<item rdf:about="http://www.citeulike.org/user/stajich/article/2996042">
    <title>Conditional Variable Importance for Random Forests</title>
    <link>http://www.citeulike.org/user/stajich/article/2996042</link>
    <description>&lt;i&gt;BMC Bioinformatics, Vol. 9 (11 July 2008), 307.&lt;/i&gt;</description>
    <dc:title>Conditional Variable Importance for Random Forests</dc:title>

    <dc:creator>Carolin Strobl</dc:creator>
    <dc:creator>Anne-Laure Boulesteix</dc:creator>
    <dc:creator>Thomas Kneib</dc:creator>
    <dc:creator>Thomas Augustin</dc:creator>
    <dc:creator>Achim Zeileis</dc:creator>
    <dc:identifier>doi:10.1186/1471-2105-9-307</dc:identifier>
    <dc:source>BMC Bioinformatics, Vol. 9 (11 July 2008), 307.</dc:source>
    <dc:date>2008-07-12T14:50:00-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>BMC Bioinformatics</prism:publicationName>
    <prism:issn>1471-2105</prism:issn>
    <prism:volume>9</prism:volume>
    <prism:startingPage>307</prism:startingPage>
    <prism:category>bioinformatics</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>randomforest</prism:category>
</item>



<!-- Eighth entry of the channel's rdf:Seq. The description text ends with the bare
     DOI string "10.1093/molbev/msn103", which is also present (with a "doi:"
     prefix) in dc:identifier. -->
<item rdf:about="http://www.citeulike.org/user/stajich/article/3006093">
    <title>How Well Does the HoT Score Reflect Sequence Alignment Accuracy?</title>
    <link>http://www.citeulike.org/user/stajich/article/3006093</link>
    <description>&lt;i&gt;Mol Biol Evol, Vol. 25, No. 8. (1 August 2008), pp. 1576-1580.&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Multiple sequence alignment is an essential tool in many areas of biological research, and the accuracy of an alignment can strongly affect the accuracy of a downstream application such as phylogenetic analysis, identification of functional motifs, or polymerase chain reaction primer design. The heads or tails (HoT) method (Landan G, Graur D. 2007. Heads or tails: a simple reliability check for multiple sequence alignments. Mol Biol Evol. 24:1380-1383.) assesses the consistency of an alignment by comparing the alignment of a set of sequences with the alignment of the same set of sequences written in reverse order. This study shows that HoT scores and the alignment accuracies are positively correlated, so alignments with higher HoT scores are preferable. However, HoT scores are overestimates of alignment accuracy in general, with the extent of overestimation depending on the method used for multiple sequence alignment. 10.1093/molbev/msn103</description>
    <dc:title>How Well Does the HoT Score Reflect Sequence Alignment Accuracy?</dc:title>

    <dc:creator>Barry Hall</dc:creator>
    <dc:identifier>doi:10.1093/molbev/msn103</dc:identifier>
    <dc:source>Mol Biol Evol, Vol. 25, No. 8. (1 August 2008), pp. 1576-1580.</dc:source>
    <dc:date>2008-07-15T15:36:04-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>Mol Biol Evol</prism:publicationName>
    <prism:volume>25</prism:volume>
    <prism:number>8</prism:number>
    <prism:startingPage>1576</prism:startingPage>
    <prism:endingPage>1580</prism:endingPage>
    <prism:category>alignment</prism:category>
    <prism:category>bioinformatics</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>molecular_evolution</prism:category>
    <prism:category>statistics</prism:category>
</item>



<!-- Ninth entry of the channel's rdf:Seq. The description text ends with the bare
     DOI string "10.1126/science.1154449", which is also present (with a "doi:"
     prefix) in dc:identifier. No prism:issn is given for this record. -->
<item rdf:about="http://www.citeulike.org/user/stajich/article/2961447">
    <title>Phylogenetic Signal in the Eukaryotic Tree of Life</title>
    <link>http://www.citeulike.org/user/stajich/article/2961447</link>
    <description>&lt;i&gt;Science, Vol. 321, No. 5885. (4 July 2008), pp. 121-123.&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Molecular sequence data have been sampled from 10% of all species known to science. Although it is not yet feasible to assemble these data into a single phylogenetic tree of life, it is possible to quantify how much phylogenetic signal is present. Analysis of 14,289 phylogenies built from 2.6 million sequences in GenBank suggests that signal is strong in vertebrates and specific groups of nonvertebrate model organisms. Across eukaryotes, however, although phylogenetic evidence is very broadly distributed, for the average species in the database it is equivalent to less than one well-supported gene tree. This analysis shows that a stronger sampling effort aimed at genomic depth, in addition to taxonomic breadth, will be required to build high-resolution phylogenetic trees at this scale. 10.1126/science.1154449</description>
    <dc:title>Phylogenetic Signal in the Eukaryotic Tree of Life</dc:title>

    <dc:creator>Michael Sanderson</dc:creator>
    <dc:identifier>doi:10.1126/science.1154449</dc:identifier>
    <dc:source>Science, Vol. 321, No. 5885. (4 July 2008), pp. 121-123.</dc:source>
    <dc:date>2008-07-03T22:12:58-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>Science</prism:publicationName>
    <prism:volume>321</prism:volume>
    <prism:number>5885</prism:number>
    <prism:startingPage>121</prism:startingPage>
    <prism:endingPage>123</prism:endingPage>
    <prism:category>methods</prism:category>
    <prism:category>phylogenetic_marker_selection</prism:category>
    <prism:category>phylogenetics</prism:category>
</item>



<!-- Tenth entry of the channel's rdf:Seq. This record has no dc:identifier and no
     page elements. The abstract contains a literal non-ASCII character in an
     author name, which relies on the UTF-8 encoding named in the XML declaration. -->
<item rdf:about="http://www.citeulike.org/user/stajich/article/2968186">
    <title>The multiple gene duplication problem revisited.</title>
    <link>http://www.citeulike.org/user/stajich/article/2968186</link>
    <description>&lt;i&gt;Bioinformatics (Oxford, England), Vol. 24, No. 13. (1 July 2008)&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;MOTIVATION: Deciphering the location of gene duplications and multiple gene duplication episodes on the Tree of Life is fundamental to understanding the way gene families and genomes evolve. The multiple gene duplication problem provides a framework for placing gene duplication events onto nodes of a given species tree, and detecting episodes of multiple gene duplication. One version of the multiple gene duplication problem was defined by Guigó et al. in 1996. Several heuristic solutions have since been proposed for this problem, but no exact algorithms were known. RESULTS: In this article we solve this longstanding open problem by providing the first exact and efficient solution. We also demonstrate the improvement offered by our algorithm over the best heuristic approaches, by applying it to several simulated as well as empirical datasets.</description>
    <dc:title>The multiple gene duplication problem revisited.</dc:title>

    <dc:creator>MS Bansal</dc:creator>
    <dc:creator>O Eulenstein</dc:creator>
    <dc:source>Bioinformatics (Oxford, England), Vol. 24, No. 13. (1 July 2008)</dc:source>
    <dc:date>2008-07-07T02:37:12-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>Bioinformatics (Oxford, England)</prism:publicationName>
    <prism:issn>1460-2059</prism:issn>
    <prism:volume>24</prism:volume>
    <prism:number>13</prism:number>
    <prism:category>bioinformatics</prism:category>
    <prism:category>gene_duplication</prism:category>
    <prism:category>genome</prism:category>
    <prism:category>methods</prism:category>
</item>



<!-- Eleventh entry of the channel's rdf:Seq. This record has no dc:identifier and
     no page elements.
     NOTE(review): "available at:http://..." in the abstract has no space after the
     colon; presumably inherited from the upstream text, so it is left as recorded. -->
<item rdf:about="http://www.citeulike.org/user/stajich/article/2966909">
    <title>A max-margin model for efficient simultaneous alignment and folding of RNA sequences.</title>
    <link>http://www.citeulike.org/user/stajich/article/2966909</link>
    <description>&lt;i&gt;Bioinformatics (Oxford, England), Vol. 24, No. 13. (1 July 2008)&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;MOTIVATION: The need for accurate and efficient tools for computational RNA structure analysis has become increasingly apparent over the last several years: RNA folding algorithms underlie numerous applications in bioinformatics, ranging from microarray probe selection to de novo non-coding RNA gene prediction. In this work, we present RAF (RNA Alignment and Folding), an efficient algorithm for simultaneous alignment and consensus folding of unaligned RNA sequences. Algorithmically, RAF exploits sparsity in the set of likely pairing and alignment candidates for each nucleotide (as identified by the CONTRAfold or CONTRAlign programs) to achieve an effectively quadratic running time for simultaneous pairwise alignment and folding. RAF's fast sparse dynamic programming, in turn, serves as the inference engine within a discriminative machine learning algorithm for parameter estimation. RESULTS: In cross-validated benchmark tests, RAF achieves accuracies equaling or surpassing the current best approaches for RNA multiple sequence secondary structure prediction. However, RAF requires nearly an order of magnitude less time than other simultaneous folding and alignment methods, thus making it especially appropriate for high-throughput studies. AVAILABILITY: Source code for RAF is available at:http://contra.stanford.edu/contrafold/.</description>
    <dc:title>A max-margin model for efficient simultaneous alignment and folding of RNA sequences.</dc:title>

    <dc:creator>CB Do</dc:creator>
    <dc:creator>CS Foo</dc:creator>
    <dc:creator>S Batzoglou</dc:creator>
    <dc:source>Bioinformatics (Oxford, England), Vol. 24, No. 13. (1 July 2008)</dc:source>
    <dc:date>2008-07-06T06:33:22-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>Bioinformatics (Oxford, England)</prism:publicationName>
    <prism:issn>1460-2059</prism:issn>
    <prism:volume>24</prism:volume>
    <prism:number>13</prism:number>
    <prism:category>alignment</prism:category>
    <prism:category>bioinformatics</prism:category>
    <prism:category>folding</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>rna</prism:category>
</item>



<!-- Twelfth entry of the channel's rdf:Seq. The description's text content spans
     two physical lines: there is a line break between the end of the abstract and
     the trailing bare DOI string, and that newline is part of the element's text.
     Do not re-wrap or re-indent it. -->
<item rdf:about="http://www.citeulike.org/user/stajich/article/2894296">
    <title>An Improved General Amino Acid Replacement Matrix</title>
    <link>http://www.citeulike.org/user/stajich/article/2894296</link>
    <description>&lt;i&gt;Mol Biol Evol, Vol. 25, No. 7. (1 July 2008), pp. 1307-1320.&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Amino acid replacement matrices are an essential basis of protein phylogenetics. They are used to compute substitution probabilities along phylogeny branches and thus the likelihood of the data. They are also essential in protein alignment. A number of replacement matrices and methods to estimate these matrices from protein alignments have been proposed since the seminal work of Dayhoff et al. (1972). An important advance was achieved by Whelan and Goldman (2001) and their WAG matrix, thanks to an efficient maximum likelihood estimation approach that accounts for the phylogenies of sequences within each training alignment. We further refine this method by incorporating the variability of evolutionary rates across sites in the matrix estimation and using a much larger and diverse database than BRKALN, which was used to estimate WAG. To estimate our new matrix (called LG after the authors), we use an adaptation of the XRATE software and 3,912 alignments from Pfam, comprising [~]50,000 sequences and [~]6.5 million residues overall. To evaluate the LG performance, we use an independent sample consisting of 59 alignments from TreeBase and randomly divide Pfam alignments into 3,412 training and 500 test alignments. The comparison with WAG and JTT shows a clear likelihood improvement. With TreeBase, we find that 1) the average Akaike information criterion gain per site is 0.25 and 0.42, when compared with WAG and JTT, respectively; 2) LG is significantly better than WAG for 38 alignments (among 59), and significantly worse with 2 alignments only; and 3) tree topologies inferred with LG, WAG, and JTT frequently differ, indicating that using LG impacts not only the likelihood value but also the output tree. Results with the test alignments from Pfam are analogous. LG and a PHYML implementation can be downloaded from http://atgc.lirmm.fr/LG. 
10.1093/molbev/msn067</description>
    <dc:title>An Improved General Amino Acid Replacement Matrix</dc:title>

    <dc:creator>Si Le</dc:creator>
    <dc:creator>Olivier Gascuel</dc:creator>
    <dc:identifier>doi:10.1093/molbev/msn067</dc:identifier>
    <dc:source>Mol Biol Evol, Vol. 25, No. 7. (1 July 2008), pp. 1307-1320.</dc:source>
    <dc:date>2008-06-14T06:07:27-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>Mol Biol Evol</prism:publicationName>
    <prism:volume>25</prism:volume>
    <prism:number>7</prism:number>
    <prism:startingPage>1307</prism:startingPage>
    <prism:endingPage>1320</prism:endingPage>
    <prism:category>bioinformatics</prism:category>
    <prism:category>evolution</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>molecular_evolution</prism:category>
    <prism:category>phylogenetics</prism:category>
    <prism:category>protein_evolution</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/2890233">
    <title>Endogenous siRNA and miRNA targets identified by sequencing of the Arabidopsis degradome.</title>
    <link>http://www.citeulike.org/user/stajich/article/2890233</link>
    <description>&lt;i&gt;Current biology : CB, Vol. 18, No. 10. (20 May 2008), pp. 758-762.&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;MicroRNAs (miRNAs) regulate the expression of target mRNAs in plants and animals [1]. Plant miRNA targets have been predicted on the basis of their extensive and often conserved complementarity to the miRNAs [2-4], as well as on miRNA overexpression experiments [5]; many of these target predictions have been confirmed by isolation of the products of miRNA-directed cleavage. Here, we present a transcriptome-wide experimental method, called &#34;degradome sequencing,&#34; to directly detect cleaved miRNA targets without relying on predictions or overexpression. The 5' ends of polyadenylated, uncapped mRNAs from Arabidopsis were directly sampled, resulting in an empirical snapshot of the degradome. miRNA-mediated-cleavage products were easily discerned from an extensive background of degraded mRNAs, which collectively covered the majority of the annotated transcriptome. Many previously known Arabidopsis miRNA targets were confirmed, and several novel targets were also discovered. Quantification of cleavage fragments revealed that those derived from TAS transcripts, which are unusual in their production of abundant secondary small interfering RNAs (siRNAs), accumulated to very high levels. A subset of secondary siRNAs are also known to direct cleavage of targets in trans[6]; degradome sequencing revealed many cleaved targets of these trans-acting siRNAs (ta-siRNAs). This empirical method is broadly applicable to the discovery and quantification of cleaved targets of small RNAs without a priori predictions.</description>
    <dc:title>Endogenous siRNA and miRNA targets identified by sequencing of the Arabidopsis degradome.</dc:title>

    <dc:creator>C Addo-Quaye</dc:creator>
    <dc:creator>TW Eshoo</dc:creator>
    <dc:creator>DP Bartel</dc:creator>
    <dc:creator>MJ Axtell</dc:creator>
    <dc:identifier>doi:10.1016/j.cub.2008.04.042</dc:identifier>
    <dc:source>Current biology : CB, Vol. 18, No. 10. (20 May 2008), pp. 758-762.</dc:source>
    <dc:date>2008-06-13T00:19:07-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>Current biology : CB</prism:publicationName>
    <prism:issn>0960-9822</prism:issn>
    <prism:volume>18</prism:volume>
    <prism:number>10</prism:number>
    <prism:startingPage>758</prism:startingPage>
    <prism:endingPage>762</prism:endingPage>
    <prism:category>bioinformatics</prism:category>
    <prism:category>illumina</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>mirna</prism:category>
    <prism:category>sequencing</prism:category>
    <prism:category>sirna</prism:category>
    <prism:category>smallrna</prism:category>
    <prism:category>solexa</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/2890226">
    <title>Global identification of microRNA-target RNA pairs by parallel analysis of RNA ends.</title>
    <link>http://www.citeulike.org/user/stajich/article/2890226</link>
    <description>&lt;i&gt;Nature biotechnology (9 June 2008)&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;MicroRNAs (miRNAs) are important regulatory molecules in most eukaryotes and identification of their target mRNAs is essential for their functional analysis. Whereas conventional methods rely on computational prediction and subsequent experimental validation of target RNAs, we directly sequenced &#62;28,000,000 signatures from the 5' ends of polyadenylated products of miRNA-mediated mRNA decay, isolated from inflorescence tissue of Arabidopsis thaliana, to discover novel miRNA-target RNA pairs. Within the set of approximately 27,000 transcripts included in the 8,000,000 nonredundant signatures, several previously predicted but nonvalidated targets of miRNAs were found. Like validated targets, most showed a single abundant signature at the miRNA cleavage site, particularly in libraries from a mutant deficient in the 5'-to-3' exonuclease AtXRN4. Although miRNAs in Arabidopsis have been extensively investigated, working in reverse from the cleaved targets resulted in the identification and validation of novel miRNAs. This versatile approach will affect the study of other aspects of RNA processing beyond miRNA-target RNA pairs.</description>
    <dc:title>Global identification of microRNA-target RNA pairs by parallel analysis of RNA ends.</dc:title>

    <dc:creator>Marcelo A German</dc:creator>
    <dc:creator>Manoj Pillay</dc:creator>
    <dc:creator>Dong-Hoon Jeong</dc:creator>
    <dc:creator>Amit Hetawal</dc:creator>
    <dc:creator>Shujun Luo</dc:creator>
    <dc:creator>Prakash Janardhanan</dc:creator>
    <dc:creator>Vimal Kannan</dc:creator>
    <dc:creator>Linda A Rymarquis</dc:creator>
    <dc:creator>Kan Nobuta</dc:creator>
    <dc:creator>Rana German</dc:creator>
    <dc:creator>Emanuele De Paoli</dc:creator>
    <dc:creator>Cheng Lu</dc:creator>
    <dc:creator>Gary Schroth</dc:creator>
    <dc:creator>Blake C Meyers</dc:creator>
    <dc:creator>Pamela J Green</dc:creator>
    <dc:identifier>doi:10.1038/nbt1417</dc:identifier>
    <dc:source>Nature biotechnology (9 June 2008)</dc:source>
    <dc:date>2008-06-13T00:11:02-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>Nature biotechnology</prism:publicationName>
    <prism:issn>1546-1696</prism:issn>
    <prism:category>bioinformatics</prism:category>
    <prism:category>htg</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>mirna</prism:category>
    <prism:category>sequencing</prism:category>
    <prism:category>smallrna</prism:category>
    <prism:category>solexa</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/2883810">
    <title>RNA-seq: An assessment of technical reproducibility and comparison with gene expression arrays</title>
    <link>http://www.citeulike.org/user/stajich/article/2883810</link>
    <description>&lt;i&gt;Genome Res. (11 June 2008), gr.079558.108.&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Ultra high-throughput sequencing is emerging as an attractive alternative to microarrays for genotyping, analysis of methylation patterns and identification of transcription factor binding sites. Here, we describe an application of the Illumina sequencing platform to study mRNA expression levels. Our goals were to estimate technical variance associated with Illumina sequencing in this context and to compare its ability to identify differentially expressed genes with existing array technologies. To do so, we estimated gene expression differences between liver and kidney RNA samples using multiple sequencing replicates, and compared the sequencing data to results obtained from Affymetrix arrays using the same RNA samples. We find that the Illumina sequencing data are highly replicable, with relatively little technical variation, and so, for many purposes, it may suffice to sequence each mRNA sample only once (i.e., using one lane). The information in a single lane of Illumina sequencing data appears comparable to that in a single array in enabling identification of differentially expressed genes, while allowing for additional analyses such as detection of low-expressed genes, alternative splice variants, and novel transcripts. Based on our observations, we propose an empirical protocol and a statistical framework for the analysis of gene expression using ultra high-throughput sequencing technology. 10.1101/gr.079558.108</description>
    <dc:title>RNA-seq: An assessment of technical reproducibility and comparison with gene expression arrays</dc:title>

    <dc:creator>John Marioni</dc:creator>
    <dc:creator>Cristopher Mason</dc:creator>
    <dc:creator>Shrikant Mane</dc:creator>
    <dc:creator>Matthew Stephens</dc:creator>
    <dc:creator>Yoav Gilad</dc:creator>
    <dc:identifier>doi:10.1101/gr.079558.108</dc:identifier>
    <dc:source>Genome Res. (11 June 2008), gr.079558.108.</dc:source>
    <dc:date>2008-06-11T20:56:53-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>Genome Res.</prism:publicationName>
    <prism:startingPage>gr.079558.108</prism:startingPage>
    <prism:category>bioinformatics</prism:category>
    <prism:category>gene_expression</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>rnaseq</prism:category>
    <prism:category>sequencing</prism:category>
    <prism:category>solexa</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/2800017">
    <title>Malin: maximum likelihood analysis of intron evolution in eukaryotes.</title>
    <link>http://www.citeulike.org/user/stajich/article/2800017</link>
    <description>&lt;i&gt;Bioinformatics (Oxford, England) (12 May 2008)&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;SUMMARY: Malin is a software package for the analysis of eukaryotic gene structure evolution. It provides a graphical user interface for various tasks commonly used to infer the evolution of exon-intron structure in protein-coding orthologs. Implemented tasks include the identification of conserved homologous intron sites in protein alignments, as well as the estimation of ancestral intron content, lineage-specific intron losses and gains. Estimates are computed either with parsimony, or with a probabilistic model that incorporates rate variation across lineages and intron sites. AVAILABILITY: Malin is available as a stand-alone Java application, as well as an application bundle for MacOS X, at the website http://www.iro.umontreal.ca/~csuros/introns/malin/. The software is distributed under a BSD-style license. CONTACT: csuros@iro.umontreal.ca.</description>
    <dc:title>Malin: maximum likelihood analysis of intron evolution in eukaryotes.</dc:title>

    <dc:creator>Miklós Csűrös</dc:creator>
    <dc:source>Bioinformatics (Oxford, England) (12 May 2008)</dc:source>
    <dc:date>2008-05-14T21:09:39-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>Bioinformatics (Oxford, England)</prism:publicationName>
    <prism:issn>1460-2059</prism:issn>
    <prism:category>intron_evolution</prism:category>
    <prism:category>intron_gain</prism:category>
    <prism:category>intron_loss</prism:category>
    <prism:category>methods</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/2800014">
    <title>DupTree: A program for large-scale phylogenetic analyses using gene tree parsimony.</title>
    <link>http://www.citeulike.org/user/stajich/article/2800014</link>
    <description>&lt;i&gt;Bioinformatics (Oxford, England) (12 May 2008)&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;SUMMARY: DupTree is a new software program for inferring rooted species trees from collections of gene trees using the gene tree parsimony approach. The program implements a novel algorithm that significantly improves upon the run time of standard search heuristics for gene tree parsimony, and enables the first truly genome-scale phylogenetic analyses. In addition, DupTree allows users to examine alternate rootings and to weight the reconciliation costs for gene trees. DupTree is an open source project written in C++. AVAILABILITY: DupTree for Mac OS X, Windows, and Linux along with a sample dataset and an on-line manual are available at http://genome.cs.iastate.edu/CBL/DupTree CONTACT: oeulenst@cs.iastate.edu.</description>
    <dc:title>DupTree: A program for large-scale phylogenetic analyses using gene tree parsimony.</dc:title>

    <dc:creator>André Wehe</dc:creator>
    <dc:creator>Mukul S Bansal</dc:creator>
    <dc:creator>J Gordon Burleigh</dc:creator>
    <dc:creator>Oliver Eulenstein</dc:creator>
    <dc:source>Bioinformatics (Oxford, England) (12 May 2008)</dc:source>
    <dc:date>2008-05-14T21:08:59-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>Bioinformatics (Oxford, England)</prism:publicationName>
    <prism:issn>1460-2059</prism:issn>
    <prism:category>duplication</prism:category>
    <prism:category>gene_duplication</prism:category>
    <prism:category>gene_tree</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>phylogenetics</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/2768977">
    <title>An endogenous small interfering RNA pathway in Drosophila</title>
    <link>http://www.citeulike.org/user/stajich/article/2768977</link>
    <description>&lt;i&gt;Nature (07 May 2008)&lt;/i&gt;</description>
    <dc:title>An endogenous small interfering RNA pathway in Drosophila</dc:title>

    <dc:creator>Benjamin Czech</dc:creator>
    <dc:creator>Colin Malone</dc:creator>
    <dc:creator>Rui Zhou</dc:creator>
    <dc:creator>Alexander Stark</dc:creator>
    <dc:creator>Catherine Schlingeheyde</dc:creator>
    <dc:creator>Monica Dus</dc:creator>
    <dc:creator>Norbert Perrimon</dc:creator>
    <dc:creator>Manolis Kellis</dc:creator>
    <dc:creator>James Wohlschlegel</dc:creator>
    <dc:creator>Ravi Sachidanandam</dc:creator>
    <dc:creator>Gregory Hannon</dc:creator>
    <dc:creator>Julius Brennecke</dc:creator>
    <dc:identifier>doi:10.1038/nature07007</dc:identifier>
    <dc:source>Nature (07 May 2008)</dc:source>
    <dc:date>2008-05-08T08:03:27-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>Nature</prism:publicationName>
    <prism:issn>0028-0836</prism:issn>
    <prism:publisher>Nature Publishing Group</prism:publisher>
    <prism:category>drosophila</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>mirna</prism:category>
    <prism:category>smallrna</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/2781969">
    <title>The McDonald-Kreitman Test and Slightly Deleterious Mutations</title>
    <link>http://www.citeulike.org/user/stajich/article/2781969</link>
    <description>&lt;i&gt;Mol Biol Evol, Vol. 25, No. 6. (1 June 2008), pp. 1007-1015.&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;It is possible to estimate the proportion of substitutions that are due to adaptive evolution using the numbers of silent and nonsilent polymorphisms and substitutions in a McDonald and Kreitman-type analysis. Unfortunately, this estimate of adaptive evolution is biased downward by the segregation of slightly deleterious mutations. It has been suggested that 1 way to cope with the effects of these slightly deleterious mutations is to remove low-frequency polymorphisms from the analysis. We investigate the performance of this method theoretically. We show that although removing low-frequency polymorphisms does indeed reduce the bias in the estimate of adaptive evolution, the estimate is always downwardly biased, often to the extent that one would not be able to detect adaptive evolution, even if it existed. The method is reasonably satisfactory, only if the rate of adaptive evolution is high and the distribution of fitness effects for slightly deleterious mutations is very leptokurtic. Our analysis suggests that adaptive evolution could be quite prevalent in humans (&#62;8%) and still not be detectable using current methodologies. Our analysis also suggests that the level of adaptive evolution has probably been underestimated, possibly substantially, in both bacteria and Drosophila. 10.1093/molbev/msn005</description>
    <dc:title>The McDonald-Kreitman Test and Slightly Deleterious Mutations</dc:title>

    <dc:creator>Jane Charlesworth</dc:creator>
    <dc:creator>Adam Eyre-Walker</dc:creator>
    <dc:identifier>doi:10.1093/molbev/msn005</dc:identifier>
    <dc:source>Mol Biol Evol, Vol. 25, No. 6. (1 June 2008), pp. 1007-1015.</dc:source>
    <dc:date>2008-05-09T23:25:21-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>Mol Biol Evol</prism:publicationName>
    <prism:volume>25</prism:volume>
    <prism:number>6</prism:number>
    <prism:startingPage>1007</prism:startingPage>
    <prism:endingPage>1015</prism:endingPage>
    <prism:category>adaptation</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>molecular_evolution</prism:category>
    <prism:category>population_genetics</prism:category>
    <prism:category>population_genomics</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/2739713">
    <title>GENOMEPOP: A program to simulate genomes in populations</title>
    <link>http://www.citeulike.org/user/stajich/article/2739713</link>
    <description>&lt;i&gt;BMC Bioinformatics, Vol. 9, No. 1. (2008)&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;BACKGROUND: There are several situations in population biology research where simulating DNA sequences is useful. Simulation of biological populations under different evolutionary genetic models can be undertaken using backward or forward strategies. Backward simulations, also called coalescent-based simulations, are computationally efficient. The reason is that they are based on the history of lineages with surviving offspring in the current population. On the contrary, forward simulations are less efficient because the entire population is simulated from past to present. However, the coalescent framework imposes some limitations that forward simulation does not. Hence, there is an increasing interest in forward population genetic simulation and efficient new tools have been developed recently. Software tools that allow efficient simulation of large DNA fragments under complex evolutionary models will be very helpful when trying to better understand the trace left on the DNA by the different interacting evolutionary forces. Here I will introduce GenomePop, a forward simulation program that fulfills the above requirements. The use of the program is demonstrated by studying the impact of intracodon recombination on global and site-specific dN/dS estimation. RESULTS: I have developed algorithms and written software to efficiently simulate, forward in time, different Markovian nucleotide or codon models of DNA mutation. Such models can be combined with recombination, at inter and intra codon levels, fitness-based selection and complex demographic scenarios. CONCLUSIONS: GenomePop has many interesting characteristics for simulating SNPs or DNA sequences under complex evolutionary and demographic models. These features make it unique with respect to other simulation tools. 
Namely, the possibility of forward simulation under General Time Reversible (GTR) mutation or GTRxMG94 codon models with intra-codon recombination, arbitrary, user-defined, migration patterns, diploid or haploid models, constant or variable population sizes, etc. It also allows simulation of fitness-based selection under different distributions of mutational effects. Under the 2-allele model it allows the simulation of recombination hot-spots, the definition of different frequencies in different populations, etc. GenomePop can also manage large DNA fragments. In addition, it has a scaling option to save computation time when simulating large sequences and population sizes under complex demographic and evolutionary situations. These and many other features are detailed in its web page [1].</description>
    <dc:title>GENOMEPOP: A program to simulate genomes in populations</dc:title>

    <dc:creator>Antonio Rodriguez</dc:creator>
    <dc:identifier>doi:10.1186/1471-2105-9-223</dc:identifier>
    <dc:source>BMC Bioinformatics, Vol. 9, No. 1. (2008)</dc:source>
    <dc:date>2008-04-30T18:51:36-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>BMC Bioinformatics</prism:publicationName>
    <prism:volume>9</prism:volume>
    <prism:number>1</prism:number>
    <prism:category>bioinformatics</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>population_genomics</prism:category>
    <prism:category>simulation</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/2640090">
    <title>Discovering microRNAs from deep sequencing data using miRDeep</title>
    <link>http://www.citeulike.org/user/stajich/article/2640090</link>
    <description>&lt;i&gt;Nature Biotechnology, Vol. 26, No. 4., pp. 407-415.&lt;/i&gt;</description>
    <dc:title>Discovering microRNAs from deep sequencing data using miRDeep</dc:title>

    <dc:creator>Marc Friedländer</dc:creator>
    <dc:creator>Wei Chen</dc:creator>
    <dc:creator>Catherine Adamidi</dc:creator>
    <dc:creator>Jonas Maaskola</dc:creator>
    <dc:creator>Ralf Einspanier</dc:creator>
    <dc:creator>Signe Knespel</dc:creator>
    <dc:creator>Nikolaus Rajewsky</dc:creator>
    <dc:identifier>doi:10.1038/nbt1394</dc:identifier>
    <dc:source>Nature Biotechnology, Vol. 26, No. 4., pp. 407-415.</dc:source>
    <dc:date>2008-04-08T04:45:36-00:00</dc:date>
    <prism:publicationName>Nature Biotechnology</prism:publicationName>
    <prism:issn>1087-0156</prism:issn>
    <prism:volume>26</prism:volume>
    <prism:number>4</prism:number>
    <prism:startingPage>407</prism:startingPage>
    <prism:endingPage>415</prism:endingPage>
    <prism:publisher>Nature Publishing Group</prism:publisher>
    <prism:category>evolution</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>mirna</prism:category>
    <prism:category>smallrna</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/2410420">
    <title>The birth and death of microRNA genes in Drosophila.</title>
    <link>http://www.citeulike.org/user/stajich/article/2410420</link>
    <description>&lt;i&gt;Nat Genet (17 February 2008)&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;MicroRNAs (miRNAs) are small, endogenously expressed RNAs that regulate mRNAs post-transcriptionally. The class of miRNA genes, like other gene classes, should experience birth, death and persistence of its members. We carried out deep sequencing of miRNAs from three species of Drosophila, and obtained 107,000 sequences that map to no fewer than 300 loci that were not previously known. We observe a large class of miRNA genes that are evolutionarily young, with a rate of birth of 12 new genes per million years (Myr). Most of these new miRNAs originated from non-miRNA sequences. Among the new genes, we estimate that 96% disappeared quickly in the course of evolution; only 4% of new miRNA genes were retained by natural selection. Furthermore, only 60% of these retained genes became integrated into the transcriptome in the long run (60 Myr). This small fraction (2.5%) of surviving miRNAs may later on become moderately or highly expressed. Our results suggest that there is a high birth rate of new miRNA genes, accompanied by a comparably high death rate. The estimated net gain of long-lived miRNA genes, which is not strongly affected by either the depth or the breadth (number of tissues) of sequencing, is 0.3 genes per Myr in Drosophila.</description>
    <dc:title>The birth and death of microRNA genes in Drosophila.</dc:title>

    <dc:creator>Jian Lu</dc:creator>
    <dc:creator>Yang Shen</dc:creator>
    <dc:creator>Qingfa Wu</dc:creator>
    <dc:creator>Supriya Kumar</dc:creator>
    <dc:creator>Bin He</dc:creator>
    <dc:creator>Suhua Shi</dc:creator>
    <dc:creator>Richard W Carthew</dc:creator>
    <dc:creator>San Ming Wang</dc:creator>
    <dc:creator>Chung-I Wu</dc:creator>
    <dc:identifier>doi:10.1038/ng.73</dc:identifier>
    <dc:source>Nat Genet (17 February 2008)</dc:source>
    <dc:date>2008-02-22T08:12:45-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>Nat Genet</prism:publicationName>
    <prism:issn>1546-1718</prism:issn>
    <prism:category>drosophila</prism:category>
    <prism:category>evolution</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>mirna</prism:category>
    <prism:category>smallrna</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/2341756">
    <title>Statistical issues in the analysis of Illumina data</title>
    <link>http://www.citeulike.org/user/stajich/article/2341756</link>
    <description>&lt;i&gt;BMC Bioinformatics, Vol. 9, No. 1. (2008)&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;BACKGROUND: Illumina bead-based arrays are becoming increasingly popular due to their high degree of replication and reported high data quality. However, little attention has been paid to the pre-processing of Illumina data. In this paper, we present our experience of analysing the raw data from an Illumina spike-in experiment and offer guidelines for those wishing to analyse expression data or develop new methodologies for this technology. RESULTS: We find that the local background estimated by Illumina is consistently low, and subtracting this background is beneficial for detecting differential expression (DE). Illumina's summary method performs well at removing outliers; producing estimates which are less biased and are less variable than other robust summary methods. However, quality assessment on summarised data may miss spatial artefacts present in the raw data. Also, we find that the background normalisation method used in Illumina's proprietary software (BeadStudio) can cause problems with a standard DE analysis. We demonstrate that variances calculated from the raw data can be used as inverse weights in the DE analysis to improve power. Finally, variability in both expression levels and DE statistics can be attributed to differences in probe composition. These differences are not accounted for by current analysis methods and require further investigation. CONCLUSIONS: Analysing Illumina expression data using BeadStudio is reasonable because of the conservative estimates of summary values produced by the software. Improvements can however be made by not using background normalisation. Access to the raw data allows for a more detailed quality assessment and flexible analyses. In the case of a gene expression study, data can be analysed on an appropriate scale using established tools. Similar improvements can be expected for other Illumina assays.</description>
    <dc:title>Statistical issues in the analysis of Illumina data</dc:title>

    <dc:creator>Mark Dunning</dc:creator>
    <dc:creator>Nuno Morais</dc:creator>
    <dc:creator>Andy Lynch</dc:creator>
    <dc:creator>Simon Tavare</dc:creator>
    <dc:creator>Matthew Ritchie</dc:creator>
    <dc:identifier>doi:10.1186/1471-2105-9-85</dc:identifier>
    <dc:source>BMC Bioinformatics, Vol. 9, No. 1. (2008)</dc:source>
    <dc:date>2008-02-06T14:10:49-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>BMC Bioinformatics</prism:publicationName>
    <prism:volume>9</prism:volume>
    <prism:number>1</prism:number>
    <prism:category>array</prism:category>
    <prism:category>illumina</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>shortread</prism:category>
    <prism:category>statistics</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/967505">
    <title>Selection of conserved blocks from multiple alignments for their use in phylogenetic analysis.</title>
    <link>http://www.citeulike.org/user/stajich/article/967505</link>
    <description>&lt;i&gt;Mol Biol Evol, Vol. 17, No. 4. (April 2000), pp. 540-552.&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;The use of some multiple-sequence alignments in phylogenetic analysis, particularly those that are not very well conserved, requires the elimination of poorly aligned positions and divergent regions, since they may not be homologous or may have been saturated by multiple substitutions. A computerized method that eliminates such positions and at the same time tries to minimize the loss of informative sites is presented here. The method is based on the selection of blocks of positions that fulfill a simple set of requirements with respect to the number of contiguous conserved positions, lack of gaps, and high conservation of flanking positions, making the final alignment more suitable for phylogenetic analysis. To illustrate the efficiency of this method, alignments of 10 mitochondrial proteins from several completely sequenced mitochondrial genomes belonging to diverse eukaryotes were used as examples. The percentages of removed positions were higher in the most divergent alignments. After removing divergent segments, the amino acid composition of the different sequences was more uniform, and pairwise distances became much smaller. Phylogenetic trees show that topologies can be different after removing conserved blocks, particularly when there are several poorly resolved nodes. Strong support was found for the grouping of animals and fungi but not for the position of more basal eukaryotes. The use of a computerized method such as the one presented here reduces to a certain extent the necessity of manually editing multiple alignments, makes the automation of phylogenetic analysis of large data sets feasible, and facilitates the reproduction of the final alignment by other researchers.</description>
    <dc:title>Selection of conserved blocks from multiple alignments for their use in phylogenetic analysis.</dc:title>

    <dc:creator>J Castresana</dc:creator>
    <dc:source>Mol Biol Evol, Vol. 17, No. 4. (April 2000), pp. 540-552.</dc:source>
    <dc:date>2006-11-29T23:03:56-00:00</dc:date>
    <prism:publicationYear>2000</prism:publicationYear>
    <prism:publicationName>Mol Biol Evol</prism:publicationName>
    <prism:issn>0737-4038</prism:issn>
    <prism:volume>17</prism:volume>
    <prism:number>4</prism:number>
    <prism:startingPage>540</prism:startingPage>
    <prism:endingPage>552</prism:endingPage>
    <prism:category>alignment</prism:category>
    <prism:category>coccipaper</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>phylogenetics</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/236100">
    <title>ProbCons: Probabilistic consistency-based multiple sequence alignment.</title>
    <link>http://www.citeulike.org/user/stajich/article/236100</link>
    <description>&lt;i&gt;Genome Res, Vol. 15, No. 2. (February 2005), pp. 330-340.&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;To study gene evolution across a wide range of organisms, biologists need accurate tools for multiple sequence alignment of protein families. Obtaining accurate alignments, however, is a difficult computational problem because of not only the high computational cost but also the lack of proper objective functions for measuring alignment quality. In this paper, we introduce probabilistic consistency, a novel scoring function for multiple sequence comparisons. We present ProbCons, a practical tool for progressive protein multiple sequence alignment based on probabilistic consistency, and evaluate its performance on several standard alignment benchmark data sets. On the BAliBASE, SABmark, and PREFAB benchmark alignment databases, ProbCons achieves statistically significant improvement over other leading methods while maintaining practical speed. ProbCons is publicly available as a Web resource.</description>
    <dc:title>ProbCons: Probabilistic consistency-based multiple sequence alignment.</dc:title>

    <dc:creator>CB Do</dc:creator>
    <dc:creator>MS Mahabhashyam</dc:creator>
    <dc:creator>M Brudno</dc:creator>
    <dc:creator>S Batzoglou</dc:creator>
    <dc:identifier>doi:10.1101/gr.2821705</dc:identifier>
    <dc:source>Genome Res, Vol. 15, No. 2. (February 2005), pp. 330-340.</dc:source>
    <dc:date>2005-06-24T12:35:53-00:00</dc:date>
    <prism:publicationYear>2005</prism:publicationYear>
    <prism:publicationName>Genome Res</prism:publicationName>
    <prism:issn>1088-9051</prism:issn>
    <prism:volume>15</prism:volume>
    <prism:number>2</prism:number>
    <prism:startingPage>330</prism:startingPage>
    <prism:endingPage>340</prism:endingPage>
    <prism:category>alignment</prism:category>
    <prism:category>coccipaper</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>multiple_sequence_alignment</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/408246">
    <title>PAML: a program package for phylogenetic analysis by maximum likelihood.</title>
    <link>http://www.citeulike.org/user/stajich/article/408246</link>
    <description>&lt;i&gt;Comput Appl Biosci, Vol. 13, No. 5. (October 1997), pp. 555-556.&lt;/i&gt;</description>
    <dc:title>PAML: a program package for phylogenetic analysis by maximum likelihood.</dc:title>

    <dc:creator>Z Yang</dc:creator>
    <dc:source>Comput Appl Biosci, Vol. 13, No. 5. (October 1997), pp. 555-556.</dc:source>
    <dc:date>2005-11-25T12:56:07-00:00</dc:date>
    <prism:publicationYear>1997</prism:publicationYear>
    <prism:publicationName>Comput Appl Biosci</prism:publicationName>
    <prism:issn>0266-7061</prism:issn>
    <prism:volume>13</prism:volume>
    <prism:number>5</prism:number>
    <prism:startingPage>555</prism:startingPage>
    <prism:endingPage>556</prism:endingPage>
    <prism:category>coccipaper</prism:category>
    <prism:category>codon_model</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>molecular_evolution</prism:category>
    <prism:category>paml</prism:category>
    <prism:category>phylogenetics</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/2607945">
    <title>Levers and fulcrums: progress in cis-regulatory motif models</title>
    <link>http://www.citeulike.org/user/stajich/article/2607945</link>
    <description>&lt;i&gt;Nature Methods, Vol. 5, No. 4., pp. 297-298.&lt;/i&gt;</description>
    <dc:title>Levers and fulcrums: progress in cis-regulatory motif models</dc:title>

    <dc:creator>Ewan Birney</dc:creator>
    <dc:identifier>doi:10.1038/nmeth0408-297</dc:identifier>
    <dc:source>Nature Methods, Vol. 5, No. 4., pp. 297-298.</dc:source>
    <dc:date>2008-03-28T16:35:47-00:00</dc:date>
    <prism:publicationName>Nature Methods</prism:publicationName>
    <prism:issn>1548-7091</prism:issn>
    <prism:volume>5</prism:volume>
    <prism:number>4</prism:number>
    <prism:startingPage>297</prism:startingPage>
    <prism:endingPage>298</prism:endingPage>
    <prism:publisher>Nature Publishing Group</prism:publisher>
    <prism:category>cis-regulatory</prism:category>
    <prism:category>comparative_genomics</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>motif</prism:category>
    <prism:category>prediction</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/2600818">
    <title>Using ESTs for phylogenomics: can one accurately infer a phylogenetic tree from a gappy alignment?</title>
    <link>http://www.citeulike.org/user/stajich/article/2600818</link>
    <description>&lt;i&gt;BMC Evolutionary Biology, Vol. 8 (26 March 2008), 95.&lt;/i&gt;</description>
    <dc:title>Using ESTs for phylogenomics: can one accurately infer a phylogenetic tree from a gappy alignment?</dc:title>

    <dc:creator>Stefanie Hartmann</dc:creator>
    <dc:creator>Todd Vision</dc:creator>
    <dc:identifier>doi:10.1186/1471-2148-8-95</dc:identifier>
    <dc:source>BMC Evolutionary Biology, Vol. 8 (26 March 2008), 95.</dc:source>
    <dc:date>2008-03-27T06:01:33-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>BMC Evolutionary Biology</prism:publicationName>
    <prism:issn>1471-2148</prism:issn>
    <prism:volume>8</prism:volume>
    <prism:startingPage>95</prism:startingPage>
    <prism:category>methods</prism:category>
    <prism:category>phylogenomics</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/2534847">
    <title>Testing Congruence in Phylogenomic Analysis</title>
    <link>http://www.citeulike.org/user/stajich/article/2534847</link>
    <description>&lt;i&gt;Systematic Biology, Vol. 57, No. 1. (2008), pp. 104-115.&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Phylogenomic analyses of large sets of genes or proteins have the potential to revolutionize our understanding of the tree of life. However, problems arise because estimated phylogenies from individual loci often differ because of different histories, systematic bias, or stochastic error. We have developed Concaterpillar, a hierarchical clustering method based on likelihood-ratio testing that identifies congruent loci for phylogenomic analysis. Concaterpillar also includes a test for shared relative evolutionary rates between genes indicating whether they should be analyzed separately or by concatenation. In simulation studies, the performance of this method is excellent when a multiple comparison correction is applied. We analyzed a phylogenomic data set of 60 translational protein sequences from the major supergroups of eukaryotes and identified three congruent subsets of proteins. Analysis of the largest set indicates improved congruence relative to the full data set and produced a phylogeny with stronger support for five eukaryote supergroups including the Opisthokonts, the Plantae, the stramenopiles + Apicomplexa (chromalveolates), the Amoebozoa, and the Excavata. In contrast, the phylogeny of the second largest set indicates a close relationship between stramenopiles and red algae, to the exclusion of alveolates, suggesting gene transfer from the red algal secondary symbiont to the ancestral stramenopile host nucleus during the origin of their chloroplast. Investigating phylogenomic data sets for conflicting signals has the potential to both improve phylogenetic accuracy and inform our understanding of genome evolution.</description>
    <dc:title>Testing Congruence in Phylogenomic Analysis</dc:title>

    <dc:creator>Jessica Leigh</dc:creator>
    <dc:creator>Edward Susko</dc:creator>
    <dc:creator>Manuela Baumgartner</dc:creator>
    <dc:creator>Andrew Roger</dc:creator>
    <dc:identifier>doi:10.1080/10635150801910436</dc:identifier>
    <dc:source>Systematic Biology, Vol. 57, No. 1. (2008), pp. 104-115.</dc:source>
    <dc:date>2008-03-14T21:08:13-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>Systematic Biology</prism:publicationName>
    <prism:volume>57</prism:volume>
    <prism:number>1</prism:number>
    <prism:startingPage>104</prism:startingPage>
    <prism:endingPage>115</prism:endingPage>
    <prism:publisher>Taylor &#38; Francis</prism:publisher>
    <prism:category>methods</prism:category>
    <prism:category>phylogenetics</prism:category>
    <prism:category>phylogenomics</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/2230351">
    <title>A phylogenomic investigation into the origin of Metazoa</title>
    <link>http://www.citeulike.org/user/stajich/article/2230351</link>
    <description>&lt;i&gt;Mol Biol Evol (9 January 2008), msn006.&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;The evolution of multicellular animals (Metazoa) from their unicellular ancestors was a key transition that was accompanied by the emergence and diversification of gene families associated with multicellularity. To clarify the timing and order of specific events in this transition, we conducted expressed sequence tag (EST) surveys on four putative protistan relatives of Metazoa including the choanoflagellate Monosiga ovata, the ichthyosporeans Sphaeroforma arctica and Amoebidium parasiticum and the amoeba Capsaspora owczarzaki, and two members of Amoebozoa, Acanthamoeba castellanii and Mastigamoeba balamuthi. We find that homologs of genes involved in metazoan multicellularity exist in several of these unicellular organisms, including one encoding a membrane-associated guanylate kinase (MAGI) in Capsaspora. In Metazoa, MAGI regulates tight junctions involved in cell-cell communication. By phylogenomic analyses of genes encoded in nuclear and mitochondrial genomes we show that the choanoflagellates are the closest relatives of the Metazoa, followed by the Capsaspora and Icthyosporea lineages, although the branching order between the latter two groups remains unclear. Understanding the function of metazoan-specific' proteins we have identified in these protists will clarify the evolutionary steps that led to the emergence of the Metazoa. 10.1093/molbev/msn006</description>
    <dc:title>A phylogenomic investigation into the origin of Metazoa</dc:title>

    <dc:creator>Inaki Ruiz-Trillo</dc:creator>
    <dc:creator>Andrew Roger</dc:creator>
    <dc:creator>Gertraud Burger</dc:creator>
    <dc:creator>Michael Gray</dc:creator>
    <dc:creator>Franz Lang</dc:creator>
    <dc:identifier>doi:10.1093/molbev/msn006</dc:identifier>
    <dc:source>Mol Biol Evol (9 January 2008), msn006.</dc:source>
    <dc:date>2008-01-14T12:46:54-00:00</dc:date>
    <prism:publicationYear>2008</prism:publicationYear>
    <prism:publicationName>Mol Biol Evol</prism:publicationName>
    <prism:startingPage>msn006</prism:startingPage>
    <prism:category>evodevo</prism:category>
    <prism:category>evolution</prism:category>
    <prism:category>metazoa</prism:category>
    <prism:category>metazoan</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>multicellular</prism:category>
    <prism:category>phylogenomics</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/2060806">
    <title>Searching for statistically significant regulatory modules.</title>
    <link>http://www.citeulike.org/user/stajich/article/2060806</link>
    <description>&lt;i&gt;Bioinformatics, Vol. 19 Suppl 2 (October 2003)&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;MOTIVATION: The regulatory machinery controlling gene expression is complex, frequently requiring multiple, simultaneous DNA-protein interactions. The rate at which a gene is transcribed may depend upon the presence or absence of a collection of transcription factors bound to the DNA near the gene. Locating transcription factor binding sites in genomic DNA is difficult because the individual sites are small and tend to occur frequently by chance. True binding sites may be identified by their tendency to occur in clusters, sometimes known as regulatory modules. RESULTS: We describe an algorithm for detecting occurrences of regulatory modules in genomic DNA. The algorithm, called mcast, takes as input a DNA database and a collection of binding site motifs that are known to operate in concert. mcast uses a motif-based hidden Markov model with several novel features. The model incorporates motif-specific p-values, thereby allowing scores from motifs of different widths and specificities to be compared directly. The p-value scoring also allows mcast to only accept motif occurrences with significance below a user-specified threshold, while still assigning better scores to motif occurrences with lower p-values. mcast can search long DNA sequences, modeling length distributions between motifs within a regulatory module, but ignoring length distributions between modules. The algorithm produces a list of predicted regulatory modules, ranked by E-value. We validate the algorithm using simulated data as well as real data sets from fruitfly and human. AVAILABILITY: http://meme.sdsc.edu/MCAST/paper</description>
    <dc:title>Searching for statistically significant regulatory modules.</dc:title>

    <dc:creator>TL Bailey</dc:creator>
    <dc:creator>WS Noble</dc:creator>
    <dc:source>Bioinformatics, Vol. 19 Suppl 2 (October 2003)</dc:source>
    <dc:date>2007-12-05T13:00:32-00:00</dc:date>
    <prism:publicationYear>2003</prism:publicationYear>
    <prism:publicationName>Bioinformatics</prism:publicationName>
    <prism:issn>1460-2059</prism:issn>
    <prism:volume>19 Suppl 2</prism:volume>
    <prism:category>cis-regulatory</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>module</prism:category>
    <prism:category>motif</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/2017324">
    <title>New Approaches to Phylogenetic Tree Search and Their Application to Large Numbers of Protein Alignments</title>
    <link>http://www.citeulike.org/user/stajich/article/2017324</link>
    <description>&lt;i&gt;Systematic Biology, Vol. 56, No. 5. (2007), pp. 727-740.&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Phylogenetic tree estimation plays a critical role in a wide variety of molecular studies, including molecular systematics, phylogenetics, and comparative genomics. Finding the optimal tree relating a set of sequences using score-based (optimality criterion) methods, such as maximum likelihood and maximum parsimony, may require all possible trees to be considered, which is not feasible even for modest numbers of sequences. In practice, trees are estimated using heuristics that represent a trade-off between topological accuracy and speed. I present a series of novel algorithms suitable for score-based phylogenetic tree reconstruction that demonstrably improve the accuracy of tree estimates while maintaining high computational speeds. The heuristics function by allowing the efficient exploration of large numbers of trees through novel hill-climbing and resampling strategies. These heuristics, and other computational approximations, are implemented for maximum likelihood estimation of trees in the program Leaphy, and its performance is compared to other popular phylogenetic programs. Trees are estimated from 4059 different protein alignments using a selection of phylogenetic programs and the likelihoods of the tree estimates are compared. Trees estimated using Leaphy are found to have equal to or better likelihoods than trees estimated using other phylogenetic programs in 4004 (98.6%) families and provide a unique best tree that no other program found in 1102 (27.1%) families. The improvement is particularly marked for larger families (80 to 100 sequences), where Leaphy finds a unique best tree in 81.7% of families.</description>
    <dc:title>New Approaches to Phylogenetic Tree Search and Their Application to Large Numbers of Protein Alignments</dc:title>

    <dc:creator>Simon Whelan</dc:creator>
    <dc:identifier>doi:10.1080/10635150701611134</dc:identifier>
    <dc:source>Systematic Biology, Vol. 56, No. 5. (2007), pp. 727-740.</dc:source>
    <dc:date>2007-11-29T17:39:41-00:00</dc:date>
    <prism:publicationYear>2007</prism:publicationYear>
    <prism:publicationName>Systematic Biology</prism:publicationName>
    <prism:volume>56</prism:volume>
    <prism:number>5</prism:number>
    <prism:startingPage>727</prism:startingPage>
    <prism:endingPage>740</prism:endingPage>
    <prism:publisher>Taylor &#38; Francis</prism:publisher>
    <prism:category>methods</prism:category>
    <prism:category>phylogenetics</prism:category>
    <prism:category>phylogenomics</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/1928496">
    <title>An ant colony optimization algorithm for phylogenetic estimation under the minimum evolution principle</title>
    <link>http://www.citeulike.org/user/stajich/article/1928496</link>
    <description>&lt;i&gt;BMC Evolutionary Biology, Vol. 7 (15 November 2007), 228.&lt;/i&gt;</description>
    <dc:title>An ant colony optimization algorithm for phylogenetic estimation under the minimum evolution principle</dc:title>

    <dc:creator>Daniele Catanzaro</dc:creator>
    <dc:creator>Raffaele Pesenti</dc:creator>
    <dc:creator>Michel Milinkovitch</dc:creator>
    <dc:identifier>doi:10.1186/1471-2148-7-228</dc:identifier>
    <dc:source>BMC Evolutionary Biology, Vol. 7 (15 November 2007), 228.</dc:source>
    <dc:date>2007-11-16T21:55:31-00:00</dc:date>
    <prism:publicationYear>2007</prism:publicationYear>
    <prism:publicationName>BMC Evolutionary Biology</prism:publicationName>
    <prism:issn>1471-2148</prism:issn>
    <prism:volume>7</prism:volume>
    <prism:startingPage>228</prism:startingPage>
    <prism:category>methods</prism:category>
    <prism:category>phylogenetics</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/1939394">
    <title>MAKER: An easy-to-use annotation pipeline designed for emerging model organism genomes</title>
    <link>http://www.citeulike.org/user/stajich/article/1939394</link>
    <description>&lt;i&gt;Genome Res. (19 November 2007), gr.6743907.&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;We have developed a portable and easily configurable genome annotation pipeline called MAKER. Its purpose is to allow investigators to independently annotate eukaryotic genomes and create genome databases. MAKER identifies repeats, aligns ESTs and proteins to a genome, produces ab initio gene predictions, and automatically synthesizes these data into gene annotations having evidence-based quality indices. MAKER is also easily trainable: Outputs of preliminary runs are used to automatically retrain its gene-prediction algorithm, producing higher-quality gene-models on subsequent runs. MAKER's inputs are minimal, and its outputs can be used to create a GMOD database. Its outputs can also be viewed in the Apollo Genome browser; this feature of MAKER provides an easy means to annotate, view, and edit individual contigs and BACs without the overhead of a database. As proof of principle, we have used MAKER to annotate the genome of the planarian Schmidtea mediterranea and to create a new genome database, SmedGD. We have also compared MAKER's performance to other published annotation pipelines. Our results demonstrate that MAKER provides a simple and effective means to convert a genome sequence into a community-accessible genome database. MAKER should prove especially useful for emerging model organism genome projects for which extensive bioinformatics resources may not be readily available. 10.1101/gr.6743907</description>
    <dc:title>MAKER: An easy-to-use annotation pipeline designed for emerging model organism genomes</dc:title>

    <dc:creator>Brandi Cantarel</dc:creator>
    <dc:creator>Ian Korf</dc:creator>
    <dc:creator>Sofia Robb</dc:creator>
    <dc:creator>Genis Parra</dc:creator>
    <dc:creator>Eric Ross</dc:creator>
    <dc:creator>Barry Moore</dc:creator>
    <dc:creator>Carson Holt</dc:creator>
    <dc:creator>Sanchez</dc:creator>
    <dc:creator>Mark Yandell</dc:creator>
    <dc:identifier>doi:10.1101/gr.6743907</dc:identifier>
    <dc:source>Genome Res. (19 November 2007), gr.6743907.</dc:source>
    <dc:date>2007-11-19T21:02:36-00:00</dc:date>
    <prism:publicationYear>2007</prism:publicationYear>
    <prism:publicationName>Genome Res.</prism:publicationName>
    <prism:startingPage>gr.6743907</prism:startingPage>
    <prism:category>gene_finding</prism:category>
    <prism:category>genome_annotation</prism:category>
    <prism:category>methods</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/1702751">
    <title>Detecting and overcoming systematic errors in genome-scale phylogenies.</title>
    <link>http://www.citeulike.org/user/stajich/article/1702751</link>
    <description>&lt;i&gt;Syst Biol, Vol. 56, No. 3. (June 2007), pp. 389-399.&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Genome-scale data sets result in an enhanced resolution of the phylogenetic inference by reducing stochastic errors. However, there is also an increase of systematic errors due to model violations, which can lead to erroneous phylogenies. Here, we explore the impact of systematic errors on the resolution of the eukaryotic phylogeny using a data set of 143 nuclear-encoded proteins from 37 species. The initial observation was that, despite the impressive amount of data, some branches had no significant statistical support. To demonstrate that this lack of resolution is due to a mutual annihilation of phylogenetic and nonphylogenetic signals, we created a series of data sets with slightly different taxon sampling. As expected, these data sets yielded strongly supported but mutually exclusive trees, thus confirming the presence of conflicting phylogenetic and nonphylogenetic signals in the original data set. To decide on the correct tree, we applied several methods expected to reduce the impact of some kinds of systematic error. Briefly, we show that (i) removing fast-evolving positions, (ii) recoding amino acids into functional categories, and (iii) using a site-heterogeneous mixture model (CAT) are three effective means of increasing the ratio of phylogenetic to nonphylogenetic signal. Finally, our results allow us to formulate guidelines for detecting and overcoming phylogenetic artefacts in genome-scale phylogenetic analyses.</description>
    <dc:title>Detecting and overcoming systematic errors in genome-scale phylogenies.</dc:title>

    <dc:creator>N Rodríguez-Ezpeleta</dc:creator>
    <dc:creator>H Brinkmann</dc:creator>
    <dc:creator>B Roure</dc:creator>
    <dc:creator>N Lartillot</dc:creator>
    <dc:creator>BF Lang</dc:creator>
    <dc:creator>H Philippe</dc:creator>
    <dc:identifier>doi:10.1080/10635150701397643</dc:identifier>
    <dc:source>Syst Biol, Vol. 56, No. 3. (June 2007), pp. 389-399.</dc:source>
    <dc:date>2007-09-27T20:07:19-00:00</dc:date>
    <prism:publicationYear>2007</prism:publicationYear>
    <prism:publicationName>Syst Biol</prism:publicationName>
    <prism:issn>1063-5157</prism:issn>
    <prism:volume>56</prism:volume>
    <prism:number>3</prism:number>
    <prism:startingPage>389</prism:startingPage>
    <prism:endingPage>399</prism:endingPage>
    <prism:category>methods</prism:category>
    <prism:category>phylogenetics</prism:category>
    <prism:category>phylogenomics</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/1853098">
    <title>Evaluation of the models handling heterotachy in phylogenetic inference</title>
    <link>http://www.citeulike.org/user/stajich/article/1853098</link>
    <description>&lt;i&gt;BMC Evolutionary Biology, Vol. 7 (01 November 2007), 206.&lt;/i&gt;</description>
    <dc:title>Evaluation of the models handling heterotachy in phylogenetic inference</dc:title>

    <dc:creator>Yan Zhou</dc:creator>
    <dc:creator>Nicolas Rodrigue</dc:creator>
    <dc:creator>Nicolas Lartillot</dc:creator>
    <dc:creator>Herve Philippe</dc:creator>
    <dc:identifier>doi:10.1186/1471-2148-7-206</dc:identifier>
    <dc:source>BMC Evolutionary Biology, Vol. 7 (01 November 2007), 206.</dc:source>
    <dc:date>2007-11-01T21:37:10-00:00</dc:date>
    <prism:publicationYear>2007</prism:publicationYear>
    <prism:publicationName>BMC Evolutionary Biology</prism:publicationName>
    <prism:issn>1471-2148</prism:issn>
    <prism:volume>7</prism:volume>
    <prism:startingPage>206</prism:startingPage>
    <prism:category>heterotachy</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>phylogenetics</prism:category>
    <prism:category>phylogenomics</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/614084">
    <title>Improved Consensus Network Techniques for Genome-Scale Phylogeny</title>
    <link>http://www.citeulike.org/user/stajich/article/614084</link>
    <description>&lt;i&gt;Molecular Biology and Evolution, Vol. 23, No. 5. (15 May 2006), pp. 848-855.&lt;/i&gt;</description>
    <dc:title>Improved Consensus Network Techniques for Genome-Scale Phylogeny</dc:title>

    <dc:creator>Barbara Holland</dc:creator>
    <dc:creator>Lars Jermiin</dc:creator>
    <dc:creator>Vincent Moulton</dc:creator>
    <dc:identifier>doi:10.1093/molbev/msj061</dc:identifier>
    <dc:source>Molecular Biology and Evolution, Vol. 23, No. 5. (15 May 2006), pp. 848-855.</dc:source>
    <dc:date>2006-05-05T05:46:17-00:00</dc:date>
    <prism:publicationYear>2006</prism:publicationYear>
    <prism:publicationName>Molecular Biology and Evolution</prism:publicationName>
    <prism:issn>0737-4038</prism:issn>
    <prism:volume>23</prism:volume>
    <prism:number>5</prism:number>
    <prism:startingPage>848</prism:startingPage>
    <prism:endingPage>855</prism:endingPage>
    <prism:publisher>Oxford University Press</prism:publisher>
    <prism:category>genome_evolution</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>phylogenetics</prism:category>
    <prism:category>phylogenomics</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/1341319">
    <title>Heads or Tails: A Simple Reliability Check for Multiple Sequence Alignments</title>
    <link>http://www.citeulike.org/user/stajich/article/1341319</link>
    <description>&lt;i&gt;Mol Biol Evol, Vol. 24, No. 6. (1 June 2007), pp. 1380-1383.&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;The question of multiple sequence alignment quality has received much attention from developers of alignment methods. Less forthcoming, however, are practical measures for addressing alignment quality issues in real life settings. Here, we present a simple methodology to help identify and quantify the uncertainties in multiple sequence alignments and their effects on subsequent analyses. The proposed methodology is based upon the a priori expectation that sequence alignment results should be independent of the orientation of the input sequences. Thus, for totally unambiguous cases, reversing residue order prior to alignment should yield an exact reversed alignment of that obtained by using the unreversed sequences. Such &#34;ideal&#34; alignments, however, are the exception in real life settings, and the two alignments, which we term the heads and tails alignments, are usually different to a greater or lesser degree. The degree of agreement or discrepancy between these two alignments may be used to assess the reliability of the sequence alignment. Furthermore, any alignment dependent sequence analysis protocol can be carried out separately for each of the two alignments, and the two sets of results may be compared with each other, providing us with valuable information regarding the robustness of the whole analytical process. The heads-or-tails (HoT) methodology can be easily implemented for any choice of alignment method and for any subsequent analytical protocol. We demonstrate the utility of HoT for phylogenetic reconstruction for the case of 130 sequences belonging to the chemoreceptor superfamily in Drosophila melanogaster, and by analysis of the BaliBASE alignment database. Surprisingly, Neighbor-Joining methods of phylogenetic reconstruction turned out to be less affected by alignment errors than maximum likelihood and Bayesian methods. 
10.1093/molbev/msm060</description>
    <dc:title>Heads or Tails: A Simple Reliability Check for Multiple Sequence Alignments</dc:title>

    <dc:creator>Giddy Landan</dc:creator>
    <dc:creator>Dan Graur</dc:creator>
    <dc:identifier>doi:10.1093/molbev/msm060</dc:identifier>
    <dc:source>Mol Biol Evol, Vol. 24, No. 6. (1 June 2007), pp. 1380-1383.</dc:source>
    <dc:date>2007-05-29T15:18:23-00:00</dc:date>
    <prism:publicationYear>2007</prism:publicationYear>
    <prism:publicationName>Mol Biol Evol</prism:publicationName>
    <prism:volume>24</prism:volume>
    <prism:number>6</prism:number>
    <prism:startingPage>1380</prism:startingPage>
    <prism:endingPage>1383</prism:endingPage>
    <prism:category>alignment</prism:category>
    <prism:category>methods</prism:category>
    <prism:category>multiple_sequence_alignment</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/177005">
    <title>Complex early genes.</title>
    <link>http://www.citeulike.org/user/stajich/article/177005</link>
    <description>&lt;i&gt;Proc Natl Acad Sci U S A, Vol. 102, No. 6. (8 February 2005), pp. 1986-1991.&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;We use the pattern of intron conservation in 684 groups of orthologs from seven fully sequenced eukaryotic genomes to provide maximum likelihood estimates of the number of introns present in the same orthologs in various eukaryotic ancestors. We find: (i) intron density in the plant-animal ancestor was high, perhaps two-thirds that of humans and three times that of Drosophila; and (ii) intron density in the ancestral bilateran was also high, equaling that of humans and four times that of Drosophila. We further find that modern introns are generally very old, with two-thirds of modern bilateran introns dating to the ancestral bilateran and two-fifths of modern plant, animal, and fungus introns dating to the plant-animal ancestor. Intron losses outnumber gains over a large range of eukaryotic lineages. These results show that early eukaryotic gene structures were very complex, and that simplification, not embellishment, has dominated subsequent evolution.</description>
    <dc:title>Complex early genes.</dc:title>

    <dc:creator>SW Roy</dc:creator>
    <dc:creator>W Gilbert</dc:creator>
    <dc:identifier>doi:10.1073/pnas.0408355101</dc:identifier>
    <dc:source>Proc Natl Acad Sci U S A, Vol. 102, No. 6. (8 February 2005), pp. 1986-1991.</dc:source>
    <dc:date>2005-05-03T07:17:47-00:00</dc:date>
    <prism:publicationYear>2005</prism:publicationYear>
    <prism:publicationName>Proc Natl Acad Sci U S A</prism:publicationName>
    <prism:issn>0027-8424</prism:issn>
    <prism:volume>102</prism:volume>
    <prism:number>6</prism:number>
    <prism:startingPage>1986</prism:startingPage>
    <prism:endingPage>1991</prism:endingPage>
    <prism:category>evolution</prism:category>
    <prism:category>intron</prism:category>
    <prism:category>intron_loss</prism:category>
    <prism:category>methods</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/1115595">
    <title>The pattern of intron loss</title>
    <link>http://www.citeulike.org/user/stajich/article/1115595</link>
    <description>&lt;i&gt;PNAS, Vol. 102, No. 3. (18 January 2005), pp. 713-718.&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;We studied intron loss in 684 groups of orthologous genes from seven fully sequenced eukaryotic genomes. We found that introns closer to the 3' ends of genes are preferentially lost, as predicted if introns are lost through gene conversion with a reverse transcriptase product of a spliced mRNA. Adjacent introns tend to be lost in concert, as expected if such events span multiple intron positions. Directly contrary to the expectations of some, introns that do not interrupt codons (phase zero) are more, not less, likely to be lost, an intriguing and previously unappreciated result. Adjacent introns with matching phases are not more likely to be retained, as would be expected if they enjoyed a relative selective advantage. The findings of 3' and phase zero intron loss biases are in direct contradiction to an extremely recent study of fungi intron evolution. All patterns are less pronounced in the lineage leading to Caenorhabditis elegans, suggesting that the process of intron loss may be qualitatively different in nematodes. Our results support a reverse transcriptase-mediated model of intron loss. 10.1073/pnas.0408274102</description>
    <dc:title>The pattern of intron loss</dc:title>

    <dc:creator>Scott Roy</dc:creator>
    <dc:creator>Walter Gilbert</dc:creator>
    <dc:identifier>doi:10.1073/pnas.0408274102</dc:identifier>
    <dc:source>PNAS, Vol. 102, No. 3. (18 January 2005), pp. 713-718.</dc:source>
    <dc:date>2007-02-21T04:15:10-00:00</dc:date>
    <prism:publicationYear>2005</prism:publicationYear>
    <prism:publicationName>PNAS</prism:publicationName>
    <prism:volume>102</prism:volume>
    <prism:number>3</prism:number>
    <prism:startingPage>713</prism:startingPage>
    <prism:endingPage>718</prism:endingPage>
    <prism:category>evolution</prism:category>
    <prism:category>intron</prism:category>
    <prism:category>intron_loss</prism:category>
    <prism:category>methods</prism:category>
</item>



<item rdf:about="http://www.citeulike.org/user/stajich/article/1269642">
    <title>Lower Bounds on Multiple Sequence Alignment using Exact 3-way Alignment</title>
    <link>http://www.citeulike.org/user/stajich/article/1269642</link>
    <description>&lt;i&gt;BMC Bioinformatics, Vol. 8 (30 April 2007), 140.&lt;/i&gt;</description>
    <dc:title>Lower Bounds on Multiple Sequence Alignment using Exact 3-way Alignment</dc:title>

    <dc:creator>Charles Colbourn</dc:creator>
    <dc:creator>Sudhir Kumar</dc:creator>
    <dc:identifier>doi:10.1186/1471-2105-8-140</dc:identifier>
    <dc:source>BMC Bioinformatics, Vol. 8 (30 April 2007), 140.</dc:source>
    <dc:date>2007-05-01T04:38:36-00:00</dc:date>
    <prism:publicationYear>2007</prism:publicationYear>
    <prism:publicationName>BMC Bioinformatics</prism:publicationName>
    <prism:issn>1471-2105</prism:issn>
    <prism:volume>8</prism:volume>
    <prism:startingPage>140</prism:startingPage>
    <prism:category>alignment</prism:category>
    <prism:category>methods</prism:category>
</item>



</rdf:RDF>

