AMOS WIKI - User contributions [en]

Bambus 2.0/goBambus-perl

2011-03-02T11:23:04Z

Dmb000006: meh

:<small>< [[Bambus 2.0]]</small>
<PRE>
#!/usr/bin/perl -w

## Very simple perl script to drive the Bambus 2.0 pipeline (as I
## currently understand it).

use strict;
use Getopt::Long;

## Configure defaults
##
## Each value below is the default used when the matching command line
## option (see the GetOptions call further down) is not given.

## Verbosity level. It starts at 1 and the 'verbose+' option spec only
## increments it, so nothing in this script can lower it below 1.
my $verbose = 1;

## Input files. All three must exist and be non-empty; this is checked
## right after option parsing.
my $contig_file = '';
my $fasta_file = '';
my $mates_file = '';

## Prefix from which the names of the output files are built
my $output_prefix = 'out';

## The threshold used to accept or reject a link between contigs
my $link_redundancy = 1;

## Whether or not to run the (crappy) 'repeat filter' code
my $filter_repeats = 0;

## Not running dot saves time on 'big' runs
my $run_dot = 1;

## Process command line arguments
##
## Option spec suffixes: '=s' takes a string, '=i' takes an integer,
## '!' makes the flag negatable (e.g. --nodot) and '+' increments the
## variable each time the flag is given. 'r', 'x' and 'd' are explicit
## one-letter aliases.

GetOptions
("contig_file=s" => \$contig_file,
"fasta_file=s" => \$fasta_file,
"mates_file=s" => \$mates_file,
"output_prefix=s" => \$output_prefix,

"link_redundancy|r=i" => \$link_redundancy,
"repeat_filter|x" => \$filter_repeats,
"dot|d!" => \$run_dot,
"verbose+" => \$verbose,
)
or die "failure to communicate\n";

## '-s' is true only for a file that exists and has non-zero size, so
## a missing option (empty string) and an empty file are both rejected.
## NOTE(review): the -c / -f / -m spellings in the messages are not
## declared as aliases above; they presumably rely on Getopt::Long's
## default unique-prefix abbreviation -- confirm.
die "-c contig file plz!\n" unless -s $contig_file;
die "-f fasta file plz!\n" unless -s $fasta_file;
die "-m mates file plz!\n" unless -s $mates_file;

## An empty prefix would produce output names that start with a dot
die "are you crazy?\n"
if $output_prefix eq '';

## Echo the effective settings on STDERR before starting
warn
join("\n",
"contig file : $contig_file",
"fasta file : $fasta_file",
"mates file : $mates_file",
"output prefix : $output_prefix",
"link redundancy : $link_redundancy",
"repeat filter : $filter_repeats",
"run dot? : $run_dot",
"verbose : $verbose",
), "\n"
if $verbose > 0;
## Uncomment to stop here and only check the option handling
#exit;

## Run the pipeline
##
## Every step is handed to run() (defined at the end of this script),
## which turns the multi-line command into a single line, executes it
## and dies if the exit status is non-zero -- so a failing step stops
## everything that follows. Nothing may be added inside the qq/.../
## blocks except command text: a '#' there would be part of the command.

## Get data into bank format
## (the fasta, contig and mates inputs go into one file,
## $output_prefix.afg)

run(qq/
toAmos
-s $fasta_file
-c $contig_file
-m $mates_file
-o $output_prefix.afg
/);

## Debugging mates file
#exit;

## Load the .afg file into the bank $output_prefix.bnk
run(qq/
bank-transact -cf
-m $output_prefix.afg
-b $output_prefix.bnk
/);

## Run the new Bambus pipeline
## (clk and Bundler are given nothing but the bank)

run(qq/
clk
-b $output_prefix.bnk
/);

run(qq/
Bundler
-b $output_prefix.bnk
/);

## Repeat filtering?
## When enabled, the STDOUT of MarkRepeats is redirected to
## $output_prefix.repeats and that file is passed to OrientContigs via
## -repeats. When disabled, the option string stays empty and
## OrientContigs is called without it.
my $filter_repeats_option_string = '';
if($filter_repeats){
run(qq/
MarkRepeats
-noCoverageRepeats
-b $output_prefix.bnk
-redundancy $link_redundancy
> $output_prefix.repeats
/);

$filter_repeats_option_string =
"-repeats $output_prefix.repeats";
}

run(qq/
OrientContigs
-noreduce
$filter_repeats_option_string
-b $output_prefix.bnk
-redundancy $link_redundancy
-prefix $output_prefix
/);

## Generate some additional TEXT output

## output a fasta sequence for the contigs from the bank (passed to
## printScaff with -f)
## (currently disabled, together with the matching -f option below)

#run(qq/
# bank2fasta -iid
# -b $output_prefix.bnk
# > $output_prefix.contig.fasta
#/);

## Generates the useful .details, .oo, .sum and .stats files
## NOTE(review): the .evidence.xml, .out.xml and .library inputs are
## presumably written by the OrientContigs step above -- confirm.
# -f $output_prefix.contig.fasta
run(qq/
printScaff -detail -oo -sum
-e $output_prefix.evidence.xml
-s $output_prefix.out.xml
-l $output_prefix.library
-o $output_prefix
/);

## Keep a non-empty printScaff.error under this run's prefix
## (presumably so the second printScaff call below cannot overwrite it)
run(qq/
mv -f
printScaff.error
$output_prefix.printScaff.error
/)
if -s 'printScaff.error';

## UNTANGLE
## Reads the scaffolds in .out.xml and writes .out.untangle.xml

run(qq/
untangle
-e $output_prefix.evidence.xml
-s $output_prefix.out.xml
-o $output_prefix.out.untangle.xml
/);

## Same treatment for a non-empty untangle.error
run(qq/
mv -f
untangle.error
$output_prefix.untangle.error
/)
if -s 'untangle.error';

## Generates the useful .details, .oo, .sum and .stats files
## Second printScaff pass, this time on the untangled scaffolds, with
## -dot added and with $output_prefix.untangle as output prefix.
# -f $output_prefix.contig.fasta
run(qq/
printScaff -detail -oo -sum -dot
-e $output_prefix.evidence.xml
-s $output_prefix.out.untangle.xml
-l $output_prefix.library
-o $output_prefix.untangle
/);

run(qq/
mv -f
printScaff.error
$output_prefix.untangle.printScaff.error
/)
if -s 'printScaff.error';

## FINALLY, DOT
## Render both graphs as PostScript unless dot was switched off.

if($run_dot){
# output before untangle
# NOTE(review): the first printScaff call above runs without -dot, so
# $output_prefix.dot presumably comes from OrientContigs -- confirm.
run(qq/
dot -Tps
$output_prefix.dot
> $output_prefix.ps
/);

# output after untangle
run(qq/
dot -Tps
$output_prefix.untangle.dot
> $output_prefix.untangle.ps
/);
}

## Only reached when no step died
warn "OK\n";

## Yup

## Run one pipeline step and abort the script if it fails.
##
## Argument: $cmd - the command line; embedded newlines are replaced by
##           spaces, so callers may lay a command out over several lines.
## Returns:  1 when the command exits with status 0.
## Dies:     with the command line and the reason (could not be
##           started, killed by a signal, or non-zero exit code).
sub run{
    my $cmd = shift;

    $cmd =~ s/\n/ /g;

    print "\n\n\nRUN: $cmd\n\n"
        if $verbose > 0;

    ## The command is passed as one string so that the shell handles
    ## the '>' redirections some steps use. The captured STDOUT is not
    ## used.
    `$cmd`;

    ## Save both right away: any later system call may overwrite them.
    my ($status, $os_error) = ($?, $!);

    return 1 if $status == 0;

    ## -1 means the command could not be started at all
    die "FAILED (could not run command: $os_error): $cmd\n"
        if $status == -1;

    ## The low 7 bits hold the signal number, if a signal killed it
    die sprintf("FAILED (killed by signal %d): %s\n", $status & 127, $cmd)
        if $status & 127;

    ## Otherwise the exit code is in the high byte
    die sprintf("FAILED (exit code %d): %s\n", $status >> 8, $cmd);
}
</PRE>

Bambus 2.0/goBambus-perl

2011-03-02T11:05:22Z

Dmb000006: Yup

<PRE>
#!/usr/bin/perl -w

## Very simple perl script to drive the Bambus 2.0 pipeline (as I
## currently understand it).

use strict;
use Getopt::Long;

## Configure defaults
##
## Each value below is the default used when the matching command line
## option (see the GetOptions call further down) is not given.

## Verbosity level. It starts at 1 and the 'verbose+' option spec only
## increments it, so nothing in this script can lower it below 1.
my $verbose = 1;

## Input files. All three must exist and be non-empty; this is checked
## right after option parsing.
my $contig_file = '';
my $fasta_file = '';
my $mates_file = '';

## Prefix from which the names of the output files are built
my $output_prefix = 'out';

## The threshold used to accept or reject a link between contigs
my $link_redundancy = 1;

## Whether or not to run the (crappy) 'repeat filter' code
my $filter_repeats = 0;

## Not running dot saves time on 'big' runs
my $run_dot = 1;

## Process command line arguments
##
## Option spec suffixes: '=s' takes a string, '=i' takes an integer,
## '!' makes the flag negatable (e.g. --nodot) and '+' increments the
## variable each time the flag is given. 'r', 'x' and 'd' are explicit
## one-letter aliases.

GetOptions
("contig_file=s" => \$contig_file,
"fasta_file=s" => \$fasta_file,
"mates_file=s" => \$mates_file,
"output_prefix=s" => \$output_prefix,

"link_redundancy|r=i" => \$link_redundancy,
"repeat_filter|x" => \$filter_repeats,
"dot|d!" => \$run_dot,
"verbose+" => \$verbose,
)
or die "failure to communicate\n";

## '-s' is true only for a file that exists and has non-zero size, so
## a missing option (empty string) and an empty file are both rejected.
## NOTE(review): the -c / -f / -m spellings in the messages are not
## declared as aliases above; they presumably rely on Getopt::Long's
## default unique-prefix abbreviation -- confirm.
die "-c contig file plz!\n" unless -s $contig_file;
die "-f fasta file plz!\n" unless -s $fasta_file;
die "-m mates file plz!\n" unless -s $mates_file;

## An empty prefix would produce output names that start with a dot
die "are you crazy?\n"
if $output_prefix eq '';

## Echo the effective settings on STDERR before starting
warn
join("\n",
"contig file : $contig_file",
"fasta file : $fasta_file",
"mates file : $mates_file",
"output prefix : $output_prefix",
"link redundancy : $link_redundancy",
"repeat filter : $filter_repeats",
"run dot? : $run_dot",
"verbose : $verbose",
), "\n"
if $verbose > 0;
## Uncomment to stop here and only check the option handling
#exit;

## Run the pipeline
##
## Every step is handed to run() (defined at the end of this script),
## which turns the multi-line command into a single line, executes it
## and dies if the exit status is non-zero -- so a failing step stops
## everything that follows. Nothing may be added inside the qq/.../
## blocks except command text: a '#' there would be part of the command.

## Get data into bank format
## (the fasta, contig and mates inputs go into one file,
## $output_prefix.afg)

run(qq/
toAmos
-s $fasta_file
-c $contig_file
-m $mates_file
-o $output_prefix.afg
/);

## Debugging mates file
#exit;

## Load the .afg file into the bank $output_prefix.bnk
run(qq/
bank-transact -cf
-m $output_prefix.afg
-b $output_prefix.bnk
/);

## Run the new Bambus pipeline
## (clk and Bundler are given nothing but the bank)

run(qq/
clk
-b $output_prefix.bnk
/);

run(qq/
Bundler
-b $output_prefix.bnk
/);

## Repeat filtering?
## When enabled, the STDOUT of MarkRepeats is redirected to
## $output_prefix.repeats and that file is passed to OrientContigs via
## -repeats. When disabled, the option string stays empty and
## OrientContigs is called without it.
my $filter_repeats_option_string = '';
if($filter_repeats){
run(qq/
MarkRepeats
-noCoverageRepeats
-b $output_prefix.bnk
-redundancy $link_redundancy
> $output_prefix.repeats
/);

$filter_repeats_option_string =
"-repeats $output_prefix.repeats";
}

run(qq/
OrientContigs
-noreduce
$filter_repeats_option_string
-b $output_prefix.bnk
-redundancy $link_redundancy
-prefix $output_prefix
/);

## Generate some additional TEXT output

## output a fasta sequence for the contigs from the bank (passed to
## printScaff with -f)
## (currently disabled, together with the matching -f option below)

#run(qq/
# bank2fasta -iid
# -b $output_prefix.bnk
# > $output_prefix.contig.fasta
#/);

## Generates the useful .details, .oo, .sum and .stats files
## NOTE(review): the .evidence.xml, .out.xml and .library inputs are
## presumably written by the OrientContigs step above -- confirm.
# -f $output_prefix.contig.fasta
run(qq/
printScaff -detail -oo -sum
-e $output_prefix.evidence.xml
-s $output_prefix.out.xml
-l $output_prefix.library
-o $output_prefix
/);

## Keep a non-empty printScaff.error under this run's prefix
## (presumably so the second printScaff call below cannot overwrite it)
run(qq/
mv -f
printScaff.error
$output_prefix.printScaff.error
/)
if -s 'printScaff.error';

## UNTANGLE
## Reads the scaffolds in .out.xml and writes .out.untangle.xml

run(qq/
untangle
-e $output_prefix.evidence.xml
-s $output_prefix.out.xml
-o $output_prefix.out.untangle.xml
/);

## Same treatment for a non-empty untangle.error
run(qq/
mv -f
untangle.error
$output_prefix.untangle.error
/)
if -s 'untangle.error';

## Generates the useful .details, .oo, .sum and .stats files
## Second printScaff pass, this time on the untangled scaffolds, with
## -dot added and with $output_prefix.untangle as output prefix.
# -f $output_prefix.contig.fasta
run(qq/
printScaff -detail -oo -sum -dot
-e $output_prefix.evidence.xml
-s $output_prefix.out.untangle.xml
-l $output_prefix.library
-o $output_prefix.untangle
/);

run(qq/
mv -f
printScaff.error
$output_prefix.untangle.printScaff.error
/)
if -s 'printScaff.error';

## FINALLY, DOT
## Render both graphs as PostScript unless dot was switched off.

if($run_dot){
# output before untangle
# NOTE(review): the first printScaff call above runs without -dot, so
# $output_prefix.dot presumably comes from OrientContigs -- confirm.
run(qq/
dot -Tps
$output_prefix.dot
> $output_prefix.ps
/);

# output after untangle
run(qq/
dot -Tps
$output_prefix.untangle.dot
> $output_prefix.untangle.ps
/);
}

## Only reached when no step died
warn "OK\n";

## Yup

## Run one pipeline step and abort the script if it fails.
##
## Argument: $cmd - the command line; embedded newlines are replaced by
##           spaces, so callers may lay a command out over several lines.
## Returns:  1 when the command exits with status 0.
## Dies:     with the command line and the reason (could not be
##           started, killed by a signal, or non-zero exit code).
sub run{
    my $cmd = shift;

    $cmd =~ s/\n/ /g;

    print "\n\n\nRUN: $cmd\n\n"
        if $verbose > 0;

    ## The command is passed as one string so that the shell handles
    ## the '>' redirections some steps use. The captured STDOUT is not
    ## used.
    `$cmd`;

    ## Save both right away: any later system call may overwrite them.
    my ($status, $os_error) = ($?, $!);

    return 1 if $status == 0;

    ## -1 means the command could not be started at all
    die "FAILED (could not run command: $os_error): $cmd\n"
        if $status == -1;

    ## The low 7 bits hold the signal number, if a signal killed it
    die sprintf("FAILED (killed by signal %d): %s\n", $status & 127, $cmd)
        if $status & 127;

    ## Otherwise the exit code is in the high byte
    die sprintf("FAILED (exit code %d): %s\n", $status >> 8, $cmd);
}
</PRE>

Bambus2

2011-03-02T11:04:09Z

Dmb000006:

[http://www.cs.umd.edu/~sergek/ Sergey Koren] and
[http://www.cbcb.umd.edu/~mpop/ Mihai Pop]

Scaffolding represents the task of ordering and orienting contigs by incorporating additional information about their relative placement along the genome. The original Bambus package was the first general purpose scaffolder made available as an open source package. We are happy to announce the arrival of Bambus 2.0, the second generation Bambus scaffolder available as an open source package. While most other scaffolders are closely tied to a specific assembly program, Bambus accepts the output from most current assemblers and provides the user with great flexibility in choosing the scaffolding parameters. In particular, Bambus is able to accept contig linking data other than specified by mate-pairs. Such sources of information include alignment to a reference genome (Bambus can directly use the output of MUMmer), physical mapping data, or information about gene synteny.

Getting data into Bambus 2 requires you convert your assembly to AMOS format. Here is my recipe:

[[toAmos]] \
-s my.fa \
-c my.contig \
-m my.mates \
-o my.afg

You need the .fa to list the contigs within the GDE-like contig file (annoying but true). You don't need accurate sequences in the .fa, you just need something to make the format valid. The .contig and .mates are as expected for [[Bambus]].

The resulting .afg is then 'banked' with:

[[bank-transact]] -c \
-b my.bnk \
-m my.afg

For more details, see the info here: http://www.cbcb.umd.edu/software/bambus/, which I have reproduced here [[Bambus 2.0/quick start guide]].

A Perl flavour Bambus 2.0 'driver' script can be found here: [[Bambus 2.0/goBambus-perl]].

Bambus

2011-03-02T11:03:11Z

Dmb000006: See it!

{| align="right"
| __TOC__
|}

{| align="left"
| [[Image:BambusLogo.jpg]]
|}

Bambus is the first general purpose scaffolder that is publicly available as an open source package. While most other scaffolders are closely tied to a specific assembly program, Bambus accepts the output from most current assemblers and provides the user with great flexibility in choosing the scaffolding parameters. In particular, Bambus is able to accept contig linking data other than specified by mate-pairs. Such sources of information include alignment to a reference genome (Bambus can directly use the output of MUMmer), physical mapping data, or information about gene synteny.

Note: Bambus is undergoing a transition in order to be integrated with the AMOS package. Please stay tuned for a new and improved release! See '''[[Bambus2]]'''.

== Documentation ==
The distribution includes detailed documentation of all the file formats used. Also see the [[Bambus Manual]].

In addition to the simple test data provided in the source package you can download a more complex example from:
[ftp://ftp.cbcb.umd.edu/pub/data/assembly/bambus-data.tar.gz ftp://ftp.cbcb.umd.edu/pub/data/assembly/bambus-data.tar.gz].

== Publication ==
"Hierarchical scaffolding with Bambus." Pop M, Kosack DS, Salzberg SL, Genome Research, 2004. 14(1):149-59.

== Acknowledgements ==
The development of BAMBUS was supported by the National Science Foundation under grant KDI-9980088.

[[Category:Bambus]]

ToAmos

2011-02-28T16:10:44Z

Dmb000006: /* Errors */

toAmos: converter from various types of inputs to AMOS messages

== Overview ==

toAmos is primarily designed for converting the output of an assembly program into the AMOS format so that it can be stored in an AMOS bank. toAmos can be used as a replacement for tarchive2amos however the latter is more flexible when converting from Trace Archive or simple .seq and .qual inputs.

== Synopsis ==

toAmos -o out_file
(-s fasta_reads (-q qual_file) (-gq good_qual) (-bq bad_qual))
(-c tigr_contig | -a celera_asm [-S][-utg] | -ta tigr_asm | -ace phrap_ace [-phd])
(-m bambus_mates | -x trace_xml | -f celera_frg [-acc])
(-arachne arachne_links | -scaff bambus_scaff)
(-i insert_file | -map dst_map)
(-pos pos_file)
(-id min_id)

toAmos reads the inputs specified on the command line and converts the information into AMOS message format. The following types of information can be provided to toAmos:

* Sequence and quality data (options -f, -s, -q, -gq, or -bq)
* Library and mate-pair data (options -m, -x, -f, -i, or -map)
* Contig data (options -c, -a, -ta, or -ace)
* Scaffold data (option -a)

== Options ==
{| class="somecssclass" border="1"
|-
| -o <out_file> || output filename ('-' for standard output)
|-
| -s <fasta_reads> || sequence data file in FASTA format (read names ending in .1 or /1 are taken as mate pairs)
|-
| -q <qual_file> || sequence quality score file in QUAL format
|-
| -gq <good_qual> || minimum quality score for high-quality bases (default: 30) - if no quality file provided bases within clear range are assigned this quality value
|-
| -bq <bad_qual> || maximum quality score for low-quality bases (default: 10) - if no quality file provided bases outside the clear range are assigned this quality value
|-
| -c <tigr_contig> || provide TIGR .contig file [http://www.cbcb.umd.edu/research/contig_representation.shtml in GDE-like format]
|-
| -a <celera_asm> || use Celera Assembler .asm contig file (contig and scaffold information)
|-
| -S || include the surrogate unitigs in the .asm file as AMOS contigs
|-
| -utg || include all UTG unitig messages in the .asm file as AMOS contigs
|-
| -ta <tigr_asm> || contig file in TIGR Assembler format (.tasm)
|-
| -ace <phrap_ace> || contig file in Phrap ACE format (can be accompanied by -q)
|-
| -phd || read the content of PHD file referenced in ACE files
|-
| -m <bambus_mates> || library and mate-pair information file in Bambus format
|-
| -x <trace_xml> || ancillary data file (library, mate-pair, clear range) in Trace Archive XML format
|-
| -f <celera_frg> || library, mate-pair, sequence, quality, and clear range data file in Celera Assembler format
|-
| -acc || use accession numbers in FRG files
|-
| -arachne <arachne_links> || scaffold file in Arachne .links format
|-
| -scaff <bambus_scaff> || scaffold file in Bambus .scaff format
|-
| -map <dst_map> || read map information - mapping from internal library ID to external library ID useful in conjunction with the -f option. This file consists of space-separated records providing a mapping from the "acc:" field in "DST" records within the .frg file to an externally recognizable name for each library.
|-
| -pos <pos_file> || TIGR-style .pos position file
|-
| -id <min_id> || start numbering contigs at this number
|-
|}

== TIGR specific options (not too useful outside TIGR) ==

* -i <insert file> - use mapping from internal library ID to external library ID provided in a .insert file produced by pullfrag.

== Known issues ==

The -ta (TIGR Assembler input) and -ace (ACE formatted input) options have not been thoroughly tested and likely do not work properly. Contact us if either of these options is important to you.

== Errors ==
toAmos -c my.test.contig -m my.test.mates -o my.test.afg
Cannot find ID for sequence lid05.f

This problem is caused by forgetting to pass a fasta file (with -s) for the read sequences in the contig file. This is a bit weird as the reads are already in the .contig file.

ABBA

2011-01-21T09:40:51Z

Dmb000006: /* Acknowledgements */ hyperlinked grant id's to look for outcomes.

ABBA: Assembly Boosted By Amino acid sequences

== Overview ==

Assembly Boosted By Amino acid sequence is a comparative gene assembler, which uses amino acid sequences from predicted proteins to help build a better assembly. See the journal paper.

For additional information on short read assembly check the following University of Maryland CBCB web sites:

* [http://www.cbcb.umd.edu/research/SR-assembly.shtml Genome Assembly with Short Reads]
* [http://www.cbcb.umd.edu/research/SR-assembly-tutorial.shtml Genome Assembly with Short Reads Tutorial]

== Documentation ==

Documentation on running ABBA is included with the distribution in the /docs subdirectory of AMOS.

== References ==

[http://www.ploscompbiol.org/article/info:doi/10.1371/journal.pcbi.1000186 Gene-Boosted Assembly of a Novel Bacterial Genome from Very Short Reads].

Salzberg SL, Sommer DD, Puiu D, Lee VT 2008 PLoS Computational Biology 4(9): e1000186 doi:10.1371/journal.pcbi.1000186

== Acknowledgements ==
The development of ABBA was supported by the National Institutes of Health under grants [http://www.google.com/search?q=R01-LM06845 R01-LM06845] and [http://www.google.com/search?q=R01-LM007938 R01-LM007938] to SLS.

PrintScaff

2011-01-07T15:02:46Z

Dmb000006: Redirected page to Bambus/printScaff

#REDIRECT [[Bambus/printScaff]]

Bambus Manual

2011-01-07T15:02:20Z

Dmb000006: /* Untangling scaffolds */

{| align="right"
| __TOC__
|}

The output of most shotgun sequence assembly programs (such as TIGR Assembler, phrap, or CAP3) consists of a set of unrelated contigs, whose order and orientation along the chromosome is unknown. Scaffolding represents the task of ordering and orienting these contigs by using additional information about their relative placement. Traditionally such information was identified from the pairing of reads from the opposite ends of an insert in double-barrelled shotgun experiments (see figure).

[[Image:scaffold image.gif]]

Besides clone mate information, homology data, physical maps, or gene synteny information can be used to derive relationships between contigs. BAMBUS is a program written to handle all such types of data in a generic fashion in order to build contig scaffolds. In order to account for the varied quality of linking information BAMBUS allows the user to specify a hierarchy that will consider the most reliable linking data first, then expand the scaffolds using less reliable data. This approach minimizes the effect of errors inherent to experimentally-derived linking information.

=== Algorithm overview ===

BAMBUS has two main modes of operation: hierarchical - when the links are considered in the order of their priorities, and standard - when all the links are considered at the same priority level. In the latter case, all links connecting two contigs are bundled together to form a "gap", or "edge" if you look at the problem from a graph theoretical point of view. Each link is checked for validity before being added to a bundle. Thus, a link is considered invalid due to a length constraint if it forces the two contigs to overlap. In other words, if we assume the largest possible size for the insert corresponding to the link, if the coordinates of the mate-pair within the contigs force the contigs to overlap, we discard the link assuming it is due to a misassembly (Note: this behaviour can be specified on a link-by-link basis). Links that pass this test are further checked for consistency. If they do not all agree in the relative orientations of the contigs they imply, a majority rule is used to retain only those links that agree both in orientation and length. Furthermore, the bundle, or edge, is retained if it contains at least two links (this parameter, redundancy, can actually be tuned by the user). After this step is complete, the resulting graph gets traversed twice, once to assign consistent orientations to all the contigs, and the second time to determine the order of the contigs along the chromosome. Note that inconsistencies in order are currently allowed since they can provide finishing teams with useful information. In a future version of BAMBUS we will add an option to allow the generation of unambiguous scaffolds as well. Currently you can generate such unambiguous scaffolds using the [[#Untangling scaffolds|untangle]] program.

== Installation ==

=== Prerequisites ===

* Install AMOS
* Perl 5.6 or later
* [http://www.cpan.org/ XML::Parser] perl module
* [http://www.cpan.org/ Config::IniFiles] perl module
* [http://www.research.att.com/sw/tools/graphviz/ GraphViz] package

=== Obtaining BAMBUS ===

BAMBUS 2.3 is available free of charge under the open-source Artistic License.

The Bambus source is freely available for download from the File Release Section of our SourceForge project page.

To receive information regarding new releases and developments, please subscribe to our moderated, low-traffic users' mailing list:

amos-users(at)lists(dot)sourceforge(dot)net

=== Actual installation ===

1. Find a directory where you want the software installed. For example /users/home/CoolUser

2. Place the tar file bambus-2.33.tar.gz in this directory

3. Unpack the distribution
% tar xvzf bambus-2.33.tar.gz

4. Go into the newly created directory
% cd bambus-2.33

5. Carefully read the documentation

6. Edit the file Makefile and change the value of BASEDIR to the correct installation path, for example:
BASEDIR = /users/home/CoolUser

7. Also check if the path for perl (in variable PERL) matches the one on your system. Several common options are:
PERL = /usr/local/bin/perl
PERL = /usr/bin/perl

8. Type 'gmake install' to have all the software installed. The executables will be in /users/home/CoolUser/bin/ and the documentation in /users/home/CoolUser/doc.

9. To get started you can try out the small test dataset provided in data/test_run by typing:

goBambus -c test.contig -m test.mates -o test-bambus

== Running BAMBUS ==

The input to Bambus consists of a set of links between contigs. These links can be inferred from the pairing of reads belonging to the same insert, from physical map data, or from alignment to another genome. To accommodate these types of sources of linking information, and to allow for further extensions, the input to Bambus is presented in a general purpose XML format.

The following sections describe how the XML file can be generated for each type of linking data. For more information see [[#.evidence.xml|.evidence.xml]].

Linking information is grouped into "libraries". All links within a library have similar parameters. In the case of mate-pair linking information the libraries match the standard definition. For other linking data, for example MUMmer links, the library grouping reflects specific characteristics of the linking data. For example all MUMmer links are grouped within a library called "MUMmer".

=== Command line options ===

Bambus consists of a collection of programs controlled by a script called goBambus. The execution is controlled by the following sets of parameters. To obtain a list of all parameters and basic help information you need to type:

goBambus -h
or
goBambus -help

==== Configuration options ====

* -C <conf_file> . This parameter specifies the configuration file used by Bambus. A configuration file is not required, however it is useful if you want to change the default parameters. See [[#The configuration file|the configuration file]] for a detailed description of the configuration information.

Whenever you run Bambus it will generate a default configuration file called default.conf. If you want to modify the parameters, you will need to rename this file and then edit it.

==== Input Options ====

* -a <asm_file> . Use a TIGR Assembler .asm file as source of linking information.
* -c <contig_file> . Use a GDE formatted .contig assembly file as source of linking information. The .contig files can be obtained either from TIGR Assembler output (by concatenating the contents of the .align directory) or by converting .ace files (created by phrap or Consed) using the [[ace2contig]] package.
* -x <xml_file>. Use additional XML formatted linking information. You can use the -x option multiple times. See [[#.evidence.xml|.evidence.xml]] for the XML file format.
* -mx <make_file>. Use a gmake compatible make file to specify how the additional XML information is generated. Each of the files created must end in ".xml". Moreover, only the final XML targets are allowed to end in ".xml" to prevent goBambus from getting confused. See [[#Using Makefiles|using Makefiles]] for a more detailed description on using makefiles.

==== Mate-pair Information ====

Assembly output (such as .asm and .contig files) does not generally contain any information about the pairing of reads coming from opposite ends of the same insert, nor about the membership of inserts to libraries. The following parameters specify how to obtain this mate-pair information.

* -m <mates_file> . The mates file allows you to specify both library information and mate pairing information, using a system based on Perl regular expressions. See [[#The .mates file|the .mates file]].
* -D <database> . At TIGR, instead of a mates file you can opt to use a database to obtain library and mate information. Note that the -m and -D options are complementary. The following three parameters specify database options:
* -S <server> . Select a specific database server
* -U <user> . Log in with the specified user name
* -P <passwd> . Log in with the specified password

==== Output Options ====

* -o <output_prefix> . All the output file names will be generated from the <output_prefix> prefix.

==== Repeat Screening ====

Misassembled repeats can confuse Bambus, therefore it makes sense to screen the known repeats that may be misassembled. Bambus can use two types of repeat screening information:

* -r . Screen against the repeats listed in the database (option -D must be provided)
* -r <repfile> . Screen against repeats described in [[#Repeat files|repeat files]].

==== Flow-control ====

* -start <num>
* -end <num>. The execution of Bambus consists of a set of steps. The user can choose to start or end at a specific step, for example in the case when she modifies the configuration file and wants to re-run just the final steps of the scaffolder. This feature should be used with caution.

=== Typical use cases ===
==== Using assembler output ====

For convenience, Bambus can use the output of an assembly program and automatically generate the required linking information. Two types of information are required:

* the tiling of reads in the assembly
* information about the pairing of reads

The first type of information can be obtained from the .asm or .contig output formats of TIGR Assembler. For more information about these file formats please refer to the [http://www.jcvi.org/cms/publications/listing/abstract/article/tigr-assembler-a-new-tool-for-assembling-large-shotgun-sequencing-projects/ TIGR Assembler documentation]. Note that the .contig file may be stripped of all sequence data as the only information used is that contained in the lines starting with #. In case you are using an assembler other than TIGR Assembler you may need to use the ta2ace package to convert the output into the .contig format.

The second type of information can be obtained from a [[#The .mates file|.mates file]], or from the database in case you are at TIGR.

A typical command line for someone outside of TIGR is:

goBambus -c test.contig -m test.mates -o test

and the execution will generate the following files:

* [[#.stats file|test.stats]] - statistics on scaffolds and libraries
* [[#.details file|test.details]] - detailed information about each contig pair (also called "gap").
* [[#.dot file|test.dot]] - GraphViz formatted description of the contig linkage information.
* [[#.evidence.xml|test.evidence.xml]] - XML representation of all the linking evidence provided.
* [[#.out.xml|test.out.xml]] - XML representation of the scaffolds.
* [[#.lib file|test.lib]] - list of the codes associated with each input library. These codes are reported on the links in the output file. 
* [[#.sum file|test.sum]] - one-line summaries of all scaffolds.
* [[#.oo file|test.oo]] - order and orientation information for all the contigs.

Additionally, the program generates some "working" files that are currently not removed as they provide useful debugging information.

* test.detective.xml - linking information derived from assembler input alone. It's the same as test.evidence.xml unless additional XML files are provided in the input.
* test.inp - input to the core scaffolding engine.
* test.grommit.conf - configuration information for the scaffolding engine

To view the graphical output you need to convert it to Postscript with the command:

dot -Tps -o test.ps test.dot

Then you can view the postscript file with the command:

gv test.ps

At TIGR, instead of specifying a .mates file you can use the database:

goBambus -c test.contig -D gbx -U access -P access -o test

==== Getting more (or less) information from the output ====

You can modify the information you get in the output by directly calling the [[Bambus/printScaff|printScaff]] command.

==== Adding additional XML linking information ====

You can easily add additional linking information as long as it's in a format similar to the [[#.evidence.xml file|.evidence.xml file]]. Assuming we have two such files "link1.xml" and "link2.xml" you can call Bambus as follows:

goBambus -c test.contig -m test.mates -x link1.xml -x link2.xml -o test

The result being the same as if you concatenated the XML file inferred from the .contig file together with the two additional XML files.

==== Using MUMmer links ====

In case you are sequencing a genome for which a close relative has been completed, you can use [[MUMmer]] to infer links between contigs. Assume you have the reference genome in a file "ref.fasta" and all the contigs from the genome you are assembling in a file called "test.fasta", you can run MUMmer to align the test genome to the reference:

nucmer -maxmatch ref.fasta test.fasta

The output will be placed in a file called out.delta. You can then convert the alignment information into an XML file suitable for Bambus as follows:

show-tiling -x out.delta > test.mum.xml

And finally you can provide this information to Bambus:

goBambus -c test.contig -m test.mates -x test.mum.xml -o test

==== Using Makefiles ====

If you have multiple XML files, it becomes tedious to generate each file separately and then provide them to Bambus with multiple -x options. You can create a Makefile compatible with GNU make that generates each of the XML files. You must be careful that the targets for all the XML files end in ".xml", and no other targets (such as intermediate targets) have the same suffix. Bambus will assume that all .xml files produced by the Makefile must be added to the input.

An example of a simple Makefile that can generate the mummer information described above is:

test.mum.xml: ref.fasta test.fasta
nucmer -maxmatch ref.fasta test.fasta
show-tiling -x out.delta > test.mum.xml

==== Untangling scaffolds ====

By default, the scaffolds resulting from Bambus are potentially ambiguous as two or more contigs may occupy the same place in the genome. Such situations occur either due to misassembled repeats, or when assembling different haplotypes. Bambus contains a utility that does a best effort attempt to disambiguate the scaffolds, by breaking them into a set of non-ambiguous scaffolds. The algorithm is greedy and does not guarantee that an optimal solution (e.g. a minimum number of longest scaffolds) is produced.

To run the untangler type:

untangle -e test.evidence.xml -s test.out.xml -o test.untangle.xml

The test.untangle.xml file has the same format as the output file ([[#.out.xml|.out.xml]]) and for each scaffold (e.g. scaff_1) contains one or more unambiguous scaffolds called "scaff_1_1, scaff_1_2, etc". You can use the [[printScaff]] command to process the untangled file the same way you would process a normal Bambus output.

== File formats ==

=== The configuration file ===

Scaffolding parameters can be specified in a configuration file. Whenever you run Bambus a default configuration file is generated as default.conf. To create your own file you should rename this file then edit it. The following types of information can be specified:

* Comments. Lines starting with # are considered comments and are ignored by the software.

* Priority information. Specifies an order in which the libraries are considered. For example, the following line specifies priority 2 for all MUMmer links. The libraries are considered in the increasing order of priorities.

priority MUMmer 2

If no configuration file is provided, all libraries are assumed to have the same priority. If a configuration file is provided, only those libraries that have a priority record are used in scaffolding. In other words, if a library has no priority record associated with it, it will not be used in scaffolding.

A priority can also be specified for a specific redundancy parameter. For example, the following line specifies that those links between contigs supported by 4 or more sources of linking data should be processed at priority level 1.

priority redundancy 4 1

Note, however, that you need to also specify a set of libraries to be used at this priority level. A simple shortcut is specifying that all link classes should be processed at this redundancy.

priority ALL 1

If you are uncertain of which libraries your project uses, run Bambus without a configuration file. The libraries will be listed in a file ending in "[[#.lib file|.lib]]".

* Redundancy information. Specifies the minimum number of links required for a valid inter-contig link. You can specify this parameter as a global value:

redundancy 2

that is, all contigs must be linked by at least two links. You can also specify a per-library value like below which says a single MUMmer link is required to link two contigs.

redundancy MUMmer 1

* Minimum scaffold size. Specifies a scaffold size cutoff. For example, the following line implies that only scaffolds that contain more than 10kbp are present in the output:

mingroupsize 10000

* Link size error. For some link types, determining the exact link length can only be done within a certain error. This parameter specifies, as a percentage, the estimated error in size determination. For example, since MUMmer links make an assumption of evolutionary closeness between two genomes, the error associated with them should be proportional to the estimated evolutionary distance (as nucleotide % similarity) between the two genomes. For an error of 5% use this parameter:

error MUMmer 0.05

* Overlapping contigs allowed. For each library you can control the link validation process by specifying whether adjacent contigs are allowed to overlap or not. By default mate-pair links assume adjacent contigs do not overlap, while MUMmer links allow contigs to overlap. This is equivalent to the configuration parameters:

overlaps lib_1 N
overlaps MUMmer Y

An example is provided in [[bambus.conf|data/sample_files/bambus.conf]].

=== The .mates file ===

The .mates file provides two types of information: library data, and mate-pair relationships between reads.

Library data can be described in two formats:
library <name> <min_size> <max_size>

and
library <name> <min_size> <max_size> <regexp>

Both formats require you to name each library and to provide a size range for the inserts belonging to it. The second format allows you to also provide a Perl regular expression that describes the naming convention for reads belonging to the library. The part of the sequence name that represents the library name must be placed within parentheses. As an example, the regular expression for TIGR sequences (where the library is specified by the first 4 characters) is:
(....).*

Mate-pair relationships can also be described in two ways:
pair <regexp_forw> <regexp_rev>

or
<seq_forw_1> <seq_rev_1> <library_name>
<seq_forw_2> <seq_rev_2> <library_name>
<seq_forw_3> <seq_rev_3> <library_name>
...

The first format requires two regular expressions corresponding to the forward and reverse mates of an insert. Just like the library record, the portion of the name corresponding to the insert name must be placed within parentheses. Two reads that match the two regular expressions, and have the exact same section matched within the parentheses will be considered mates. The library will be determined from the regular expression associated with a "library" record. As an example, at TIGR the first 7 characters represent the insert, then are followed by an optional T or P and the primer name (F or R for Forward or Reverse). The corresponding regular expression is:
pair (.......)[TP]?F (.......)[TP]?R

The second format is simply a list of sequence name pairs each followed by the name of the library they belong to. The corresponding library records must be listed in the file before the pairing data.

'''Note that fields in the .mates file must be separated by TAB characters otherwise the program will report an error.'''

An example of a .mates file using the phred/phrap naming convention is provided in [[bambus.mates|data/sample_files/bambus.mates]].

=== Repeat files ===

The repeat files used in screening linking data conform to the output standards of the repeatFinder program. The file contains 5 TAB-delimited values: contig ID, repeat name, left and right coordinates within the contig, and contig class. Only the first 4 values are essential as far as Bambus is concerned. Here is an example of repeat records:

AC009139.7.3 RPT1A 14554 96820 1
AC026498.3.2 RPT1B 124208 206475 1

=== .evidence.xml ===

This provides an XML representation of the input file. Below is a description of the data presented in this file (see also an [[bambus.evidence.xml|example]]):

The overall flow of the XML file is:

<EVIDENCE>
<LIBRARY>
<INSERT>
<SEQUENCE/>
<SEQUENCE/>
</INSERT>
...
</LIBRARY>
...
<CONTIG>
<SEQUENCE/>
...
</CONTIG>
...
<LINK>
<CONTIG/>
<CONTIG/>
</LINK>
...
</EVIDENCE>

<EVIDENCE this is the parent tag - the whole document occurs within <EVIDENCE> and </EVIDENCE>
ID = "1" some identifier for the file
DATE = "12/15/03" date when file was created
PROJECT = "MyProject" verbose description of the project
PARAMETERS = "" parameters used in creating this file
>

<LIBRARY for each shotgun library you need a separate library tag.
ID = "lib_1" some identifier for the library
NAME = "short" verbose name for the library
MIN = "1200" minimum size of inserts in this library
MAX = "3500" maximum size of inserts in this library
>

<INSERT the insert concept links together the two reads obtained from opposite ends of a clone insert
ID="ins_1" generic identifier for the insert
NAME="GALBZ92" verbose name for the insert
>

<SEQUENCE each insert contains two sequence reads obtained from the opposite ends
ID="seq_1" generic identifier for the sequence
NAME="GALBZ92TF" verbose name for the sequence
>

<CONTIG Each contig in the assembly needs to be represented in the file
ID="contig_1" contig identifier
NAME= "1" contig name (usually the same as the identifier)
LEN="12352" contig length in basepairs
>

<SEQUENCE each sequence in the contig is listed together with information about the position within the contig.
ID="seq_1" sequence identifier (must match one described in the library section)
ORI="BE" orientation of sequence: BE - forward, EB - reverse
ASM_LEND="0" coordinate of sequence's left end within the contig
ASM_REND="525" coordinate of sequence's right end within the contig
>

<LINK this is a generic link between two contigs (i.e. not inferred from mate-pair data)
ID = "link_1" link identifier
SIZE = "-800" size of gap between the two contigs
TYPE = "MUMmer" link type: links with the same type get grouped into a virtual "library" used in specifying priorities
>

<CONTIG each link contains two contigs
ID = "contig_1" contig identifier: must match one described in the contig section above
ORI = "EB" contig orientation: BE - forward, EB - reverse
> each contig record may contain free-form data specifying the evidence for linking: e.g. alignment data

=== .out.xml ===

Represents, together with the evidence file, a description of the layout of the contigs. For each scaffold, the order (given as a coordinate along a chromosome) and the orientation of each contig are presented, together with a list of all the links used to generate this layout. Some links are deemed invalid, being given a code of "LEN" in case the length was deemed incorrect, or "ORI" in case the link orientation was considered incorrect. The format of this file is specified by the [[bambus.dtd|DTD]]; see also an example ([[bambus.out.xml|data/sample_files/bambus.out.xml]]).

=== .stats file ===

Finally, BAMBUS outputs a summary of the scaffolds generated. This file is pretty much self-explanatory, as evidenced by this example ([[bambus.stats|data/sample_files/bambus.stats]]). Note that in the .stats file, the N50 sizes are computed with respect to the total span of the scaffolds unless a different genome size is specified in a file called genome.size.

=== .details file ===

The .details file contains detailed information about the linking information between adjacent contigs. The contigs' orientation, size, and coordinates are listed and then all the linking data grouped by validity and library. An example is provided in [[bambus.details|data/sample_files/bambus.details]].

=== .dot file ===

Contains a graphical representation of the scaffolds in GraphViz format. Please see the [http://www.research.att.com/sw/tools/graphviz/ AT&T GraphViz] website for more information on this file format.

=== .lib file ===
Lists the libraries and their internal IDs.

=== .ps file ===

A postscript image generated from the GraphViz-formatted file. It can be obtained from the .dot file with the command:
dot -Tps -o prefix.ps prefix.dot

An example is given in the picture below.
[[Image:bambus-display.jpg]]

Each scaffold is placed in a box, labeled with some statistics on the scaffold size (number of contigs, number of bases and span). All the "gaps" - linking relationships between the contigs are represented as edges. Each edge is decorated with the number of links contributing to it, the number of links from each library type, and the number of links invalidated due to incorrect length (L) or orientation (O). Each contig's ID, size, and coordinates within the scaffold are also listed.

=== .oo file ===

This file contains a summary of the order and orientation of all contigs present in the data. Each scaffold starts with a FASTA-like header containing the identifier for the scaffold followed by the number of contigs, size and span. Within each scaffold, all contigs are listed in the scaffold order, followed by the string BE for those in the forward orientation and EB for those in the reverse orientation. An example file is shown in: [[bambus.oo|data/sample_files/bambus.oo]].

=== .sum file ===

This file contains one line for each scaffold. Each line contains the scaffold ID, number of contigs, scaffold size, and span. The four values are separated by TAB characters. See an example in [[bambus.sum|data/sample_files/bambus.sum]].

== Known problems ==
There is a small "off-by-one" error in computing contig coordinates. It should not really affect the usefulness of the output.

== Contact information ==

BAMBUS is currently provided AS-IS, in other words we do not provide any support for the software. We would, however, like to hear your comments and suggestions. For Bambus bug reports, support requests, or any other inquiries please browse our SourceForge project page or Email us at:

amos-help (at) lists (dot) sourceforge (dot) net

BAMBUS was written by Mihai Pop and Dan Kosack.

[[Category:Bambus]]

Category:Bambus

2011-01-07T15:01:19Z

Dmb000006: Created page with 'Pages about Bambus'

Pages about [[Bambus]]

Bambus

2011-01-07T15:00:57Z

Dmb000006:

{| align="right"
| __TOC__
|}

{| align="left"
| [[Image:BambusLogo.jpg]]
|}

Bambus is the first general purpose scaffolder that is publicly available as an open source package. While most other scaffolders are closely tied to a specific assembly program, Bambus accepts the output from most current assemblers and provides the user with great flexibility in choosing the scaffolding parameters. In particular, Bambus is able to accept contig linking data other than specified by mate-pairs. Such sources of information include alignment to a reference genome (Bambus can directly use the output of MUMmer), physical mapping data, or information about gene synteny.

Note: Bambus is undergoing a transition in order to be integrated with the AMOS package. Please stay tuned for a new and improved release!

== Documentation ==
The distribution includes detailed documentation of all the file formats used. Also see the [[Bambus Manual]].

In addition to the simple test data provided in the source package you can download a more complex example from:
[ftp://ftp.cbcb.umd.edu/pub/data/assembly/bambus-data.tar.gz ftp://ftp.cbcb.umd.edu/pub/data/assembly/bambus-data.tar.gz].

== Publication ==
"Hierarchical scaffolding with Bambus." Pop M, Kosack DS, Salzberg SL, Genome Research, 2004. 14(1):149-59.

== Acknowledgements ==
The development of BAMBUS was supported by the National Science Foundation under grant KDI-9980088.

[[Category:Bambus]]

Bambus Manual

2011-01-07T14:58:00Z

Dmb000006:

{| align="right"
| __TOC__
|}

The output of most shotgun sequence assembly programs (such as TIGR Assembler, phrap, or CAP3) consists of a set of unrelated contigs, whose order and orientation along the chromosome are unknown. Scaffolding represents the task of ordering and orienting these contigs by using additional information about their relative placement. Traditionally such information was identified from the pairing of reads from the opposite ends of an insert in double-barreled shotgun experiments (see figure).

[[Image:scaffold image.gif]]

Besides clone mate information, homology data, physical maps, or gene synteny information can be used to derive relationships between contigs. BAMBUS is a program written to handle all such types of data in a generic fashion in order to build contig scaffolds. In order to account for the varied quality of linking information BAMBUS allows the user to specify a hierarchy that will consider the most reliable linking data first, then expand the scaffolds using less reliable data. This approach minimizes the effect of errors inherent to experimentally-derived linking information.

=== Algorithm overview ===

BAMBUS has two main modes of operation: hierarchical - when the links are considered in the order of their priorities, and standard - when all the links are considered at the same priority level. In the latter case, all links connecting two contigs are bundled together to form a "gap", or "edge" if you look at the problem from a graph theoretical point of view. Each link is checked for validity before being added to a bundle. Thus, a link is considered invalid due to a length constraint if it forces the two contigs to overlap. In other words, if we assume the largest possible size for the insert corresponding to the link, if the coordinates of the mate-pair within the contigs force the contigs to overlap, we discard the link assuming it is due to a misassembly (Note: this behaviour can be specified on a link-by-link basis). Links that pass this test are further checked for consistency. If they do not all agree in the relative orientations of the contigs they imply, a majority rule is used to retain only those links that agree both in orientation and length. Furthermore, the bundle, or edge, is retained if it contains at least two links (this parameter, redundancy, can actually be tuned by the user). After this step is complete, the resulting graph gets traversed twice, once to assign consistent orientations to all the contigs, and the second time to determine the order of the contigs along the chromosome. Note that inconsistencies in order are currently allowed since they can provide finishing teams with useful information. In a future version of BAMBUS we will add an option to allow the generation of unambiguous scaffolds as well. Currently you can generate such unambiguous scaffolds using the [[#Untangling scaffolds|untangle]] program.

== Installation ==

=== Prerequisites ===

* Install AMOS
* Perl 5.6 or later
* [http://www.cpan.org/ XML::Parser] perl module
* [http://www.cpan.org/ Config::IniFiles] perl module
* [http://www.research.att.com/sw/tools/graphviz/ GraphViz] package

=== Obtaining BAMBUS ===

BAMBUS 2.3 is available free of charge under the open-source Artistic License.

The Bambus source is freely available for download from the File Release Section of our SourceForge project page.

To receive information regarding new releases and developments, please subscribe to our moderated, low-traffic users' mailing list:

amos-users(at)lists(dot)sourceforge(dot)net

=== Actual installation ===

1. Find a directory where you want the software installed. For example /users/home/CoolUser

2. Place the tar file bambus-2.33.tar.gz in this directory

3. Unpack the distribution
% tar xvzf bambus-2.33.tar.gz

4. Go into the newly created directory
% cd bambus-2.33

5. Carefully read the documentation

6. Edit the file Makefile and change the value of BASEDIR to the correct installation path, for example:
BASEDIR = /users/home/CoolUser

7. Also check if the path for perl (in variable PERL) matches the one on your system. Several common options are:
PERL = /usr/local/bin/perl
PERL = /usr/bin/perl

8. Type 'gmake install' to have all the software installed. The executables will be in /users/home/CoolUser/bin/ and the documentation in /users/home/CoolUser/doc.

9. To get started you can try out the small test dataset provided in data/test_run by typing:

goBambus -c test.contig -m test.mates -o test-bambus

== Running BAMBUS ==

The input to Bambus consists of a set of links between contigs. These links can be inferred from the pairing of reads belonging to the same insert, from physical map data, or from alignment to another genome. To accommodate these types of sources of linking information, and to allow for further extensions, the input to Bambus is presented in a general purpose XML format.

The following sections describe how the XML file can be generated for each type of linking data. For more information see [[#.evidence.xml|.evidence.xml]].

Linking information is grouped into "libraries". All links within a library have similar parameters. In the case of mate-pair linking information the libraries match the standard definition. For other linking data, for example MUMmer links, the library grouping reflects specific characteristics of the linking data. For example all MUMmer links are grouped within a library called "MUMmer".

=== Command line options ===

Bambus consists of a collection of programs controlled by a script called goBambus. The execution is controlled by the following sets of parameters. To obtain a list of all parameters and basic help information you need to type:

goBambus -h
or
goBambus -help

==== Configuration options ====

* -C <conf_file> . This parameter specifies the configuration file used by Bambus. A configuration file is not required, however it is useful if you want to change the default parameters. See [[#The configuration file|the configuration file]] for a detailed description of the configuration information.

Whenever you run Bambus it will generate a default configuration file called default.conf. If you want to modify the parameters, you will need to rename this file and then edit it.

==== Input Options ====

* -a <asm_file> . Use a TIGR Assembler .asm file as source of linking information.
* -c <contig_file> . Use a GDE formatted .contig assembly file as source of linking information. The .contig files can be obtained either from TIGR Assembler output (by concatenating the contents of the .align directory) or by converting .ace files (created by phrap or Consed) using the [[ace2contig]] package.
* -x <xml_file>. Use additional XML formatted linking information. You can use the -x option multiple times. See [[#.evidence.xml|.evidence.xml]] for the XML file format.
* -mx <make_file>. Use a gmake compatible make file to specify how the additional XML information is generated. Each of the files created must end in ".xml". Moreover, only the final XML targets are allowed to end in ".xml" to prevent goBambus from getting confused. See [[#Using Makefiles|using Makefiles]] for a more detailed description on using makefiles.

==== Mate-pair Information ====

Assembly output (such as .asm and .contig files) does not generally contain any information about the pairing of reads coming from opposite ends of the same insert, nor about the membership of inserts to libraries. The following parameters specify how to obtain this mate-pair information.

* -m <mates_file> . The mates file allows you to specify both library information and mate pairing information, using a system based on Perl regular expressions. See [[#The .mates file|the .mates file]].
* -D <database> . At TIGR, instead of a mates file you can opt to use a database to obtain library and mate information. Note that the -m and -D options are complementary. The following three parameters specify database options:
* -S <server> . Select a specific database server
* -U <user> . Log in with the specified user name
* -P <passwd> . Log in with the specified password

==== Output Options ====

* -o <output_prefix> . All the output file names will be generated from the <output_prefix> prefix.

==== Repeat Screening ====

Misassembled repeats can confuse Bambus, therefore it makes sense to screen the known repeats that may be misassembled. Bambus can use two types of repeat screening information:

* -r . Screen against the repeats listed in the database (option -D must be provided)
* -r <repfile> . Screen against repeats described in [[#Repeat files|repeat files]].

==== Flow-control ====

* -start <num>
* -end <num>. The execution of Bambus consists of a set of steps. The user can choose to start or end at a specific step, for example, in the case when she modifies the configuration file and wants to re-run just the final steps of the scaffolder. This feature should be used with caution.

=== Typical use cases ===
==== Using assembler output ====

For convenience, Bambus can use the output of an assembly program and automatically generate the required linking information. Two types of information are required:

* the tiling of reads in the assembly
* information about the pairing of reads

The first type of information can be obtained from the .asm or .contig output formats of TIGR Assembler. For more information about these file formats please refer to the [http://www.jcvi.org/cms/publications/listing/abstract/article/tigr-assembler-a-new-tool-for-assembling-large-shotgun-sequencing-projects/ TIGR Assembler documentation]. Note that the .contig file may be stripped of all sequence data as the only information used is that contained in the lines starting with #. In case you are using an assembler other than TIGR Assembler you may need to use the ta2ace package to convert the output into the .contig format.

The second type of information can be obtained from a [[#The .mates file|.mates file]], or from the database in case you are at TIGR.

A typical command line for someone outside of TIGR is:

goBambus -c test.contig -m test.mates -o test

and the execution will generate the following files:

* [[#.stats file|test.stats]] - statistics on scaffolds and libraries
* [[#.details file|test.details]] - detailed information about each contig pair (also called "gap").
* [[#.dot file|test.dot]] - GraphViz formatted description of the contig linkage information.
* [[#.evidence.xml|test.evidence.xml]] - XML representation of all the linking evidence provided.
* [[#.out.xml|test.out.xml]] - XML representation of the scaffolds.
* [[#.lib file|test.lib]] - list of the codes associated with each input library. These codes are reported on the links in the output file. 
* [[#.sum file|test.sum]] - one-line summaries of all scaffolds.
* [[#.oo file|test.oo]] - order and orientation information for all the contigs.

Additionally, the program generates some "working" files that are currently not removed as they provide useful debugging information.

* test.detective.xml - linking information derived from assembler input alone. It's the same as test.evidence.xml unless additional XML files are provided in the input.
* test.inp - input to the core scaffolding engine.
* test.grommit.conf - configuration information for the scaffolding engine

To view the graphical output you need to convert it to Postscript with the command:

dot -Tps -o test.ps test.dot

Then you can view the postscript file with the command:

gv test.ps

At TIGR, instead of specifying a .mates file you can use the database:

goBambus -c test.contig -D gbx -U access -P access -o test

==== Getting more (or less) information from the output ====

You can modify the information you get in the output by directly calling the [[Bambus/printScaff|printScaff]] command.

==== Adding additional XML linking information ====

You can easily add additional linking information as long as it's in a format similar to the [[#.evidence.xml|.evidence.xml file]]. Assuming we have two such files "link1.xml" and "link2.xml" you can call Bambus as follows:

goBambus -c test.contig -m test.mates -x link1.xml -x link2.xml -o test

The result is the same as if you concatenated the XML file inferred from the .contig file together with the two additional XML files.

==== Using MUMmer links ====

In case you are sequencing a genome for which a close relative has been completed, you can use [[MUMmer]] to infer links between contigs. Assuming you have the reference genome in a file "ref.fasta" and all the contigs from the genome you are assembling in a file called "test.fasta", you can run MUMmer to align the test genome to the reference:

nucmer -maxmatch ref.fasta test.fasta

The output will be placed in a file called out.delta. You can then convert the alignment information into an XML file suitable for Bambus as follows:

show-tiling -x out.delta > test.mum.xml

And finally you can provide this information to Bambus:

goBambus -c test.contig -m test.mates -x test.mum.xml -o test

==== Using Makefiles ====

If you have multiple XML files, it becomes tedious to generate each file separately and then provide them to Bambus with multiple -x options. You can create a Makefile compatible with GNU make that generates each of the XML files. You must be careful that the targets for all the XML files end in ".xml", and no other targets (such as intermediate targets) have the same suffix. Bambus will assume that all .xml files produced by the Makefile must be added to the input.

An example of a simple Makefile that can generate the mummer information described above is:

test.mum.xml: ref.fasta test.fasta
nucmer -maxmatch ref.fasta test.fasta
show-tiling -x out.delta > test.mum.xml

==== Untangling scaffolds ====

By default, the scaffolds resulting from Bambus are potentially ambiguous as two or more contigs may occupy the same place in the genome. Such situations occur either due to misassembled repeats, or when assembling different haplotypes. Bambus contains a utility that does a best effort attempt to disambiguate the scaffolds, by breaking them into a set of non-ambiguous scaffolds. The algorithm is greedy and does not guarantee that an optimal solution (e.g. a minimum number of longest scaffolds) is produced.

To run the untangler type:

untangle -e test.evidence.xml -s test.out.xml -o test.untangle.xml

The test.untangle.xml file has the same format as the output file ([[#.out.xml|.out.xml]]) and for each scaffold (e.g. scaff_1) contains one or more unambiguous scaffolds called "scaff_1_1, scaff_1_2, etc". You can use the [[#Getting more (or less) information from the output|printScaff]] command to process the untangled file the same way you would process a normal Bambus output.

== File formats ==

=== The configuration file ===

Scaffolding parameters can be specified in a configuration file. Whenever you run Bambus a default configuration file is generated as default.conf. To create your own file you should rename this file then edit it. The following types of information can be specified:

* Comments. Lines starting with # are considered comments and are ignored by the software.

* Priority information. Specifies an order in which the libraries are considered. For example, the following line specifies priority 2 for all MUMmer links. The libraries are considered in the increasing order of priorities.

priority MUMmer 2

If no configuration file is provided, all libraries are assumed to have the same priority. If a configuration file is provided, only those libraries that have a priority record are used in scaffolding. In other words, if a library has no priority record associated with it, it will not be used in scaffolding.

A priority can also be specified for a specific redundancy parameter. For example, the following line specifies that those links between contigs supported by 4 or more sources of linking data should be processed at priority level 1.

priority redundancy 4 1

Note, however, that you need to also specify a set of libraries to be used at this priority level. A simple shortcut is specifying that all link classes should be processed at this redundancy.

priority ALL 1

If you are uncertain of which libraries your project uses, run Bambus without a configuration file. The libraries will be listed in a file ending in "[[#.lib file|.lib]]".

* Redundancy information. Specifies the minimum number of links required for a valid inter-contig link. You can specify this parameter as a global value:

redundancy 2

that is, all contigs must be linked by at least two links. You can also specify a per-library value like below which says a single MUMmer link is required to link two contigs.

redundancy MUMmer 1

* Minimum scaffold size. Specifies a scaffold size cutoff. For example, the following line implies that only scaffolds that contain more than 10kbp are present in the output:

mingroupsize 10000

* Link size error. For some link types, determining the exact link length can only be done within a certain error. This parameter specifies, as a percentage, the estimated error in size determination. For example, since MUMmer links make an assumption of evolutionary closeness between two genomes, the error associated with them should be proportional to the estimated evolutionary distance (as nucleotide % similarity) between the two genomes. For an error of 5% use this parameter:

error MUMmer 0.05

* Overlapping contigs allowed. For each library you can control the link validation process by specifying whether adjacent contigs are allowed to overlap or not. By default mate-pair links assume adjacent contigs do not overlap, while MUMmer links allow contigs to overlap. This is equivalent to the configuration parameters:

overlaps lib_1 N
overlaps MUMmer Y

An example is provided in [[bambus.conf|data/sample_files/bambus.conf]].

=== The .mates file ===

The .mates file provides two types of information: library data, and mate-pair relationships between reads.

Library data can be described in two formats:
library <name> <min_size> <max_size>

and
library <name> <min_size> <max_size> <regexp>

Both formats require you to name each library and to provide a size range for the inserts belonging to it. The second format allows you to also provide a Perl regular expression that describes the naming convention for reads belonging to the library. The part of the sequence name that represents the library name must be placed within parentheses. As an example, the regular expression for TIGR sequences (where the library is specified by the first 4 characters) is:
(....).*

Mate-pair relationships can also be described in two ways:
pair <regexp_forw> <regexp_rev>

or
<seq_forw_1> <seq_rev_1> <library_name>
<seq_forw_2> <seq_rev_2> <library_name>
<seq_forw_3> <seq_rev_3> <library_name>
...

The first format requires two regular expressions corresponding to the forward and reverse mates of an insert. Just like the library record, the portion of the name corresponding to the insert name must be placed within parentheses. Two reads that match the two regular expressions, and have the exact same section matched within the parentheses will be considered mates. The library will be determined from the regular expression associated with a "library" record. As an example, at TIGR the first 7 characters represent the insert, then are followed by an optional T or P and the primer name (F or R for Forward or Reverse). The corresponding regular expression is:
pair (.......)[TP]?F (.......)[TP]?R

The second format is simply a list of sequence name pairs each followed by the name of the library they belong to. The corresponding library records must be listed in the file before the pairing data.

'''Note that fields in the .mates file must be separated by TAB characters otherwise the program will report an error.'''

An example of a .mates file using the phred/phrap naming convention is provided in [[bambus.mates|data/sample_files/bambus.mates]].

=== Repeat files ===

The repeat files used in screening linking data conform to the output standards of the repeatFinder program. The file contains 5 TAB-delimited values: contig ID, repeat name, left and right coordinates within the contig, and contig class. Only the first 4 values are essential as far as Bambus is concerned. Here is an example of repeat records:

AC009139.7.3 RPT1A 14554 96820 1
AC026498.3.2 RPT1B 124208 206475 1

=== .evidence.xml ===

This provides an XML representation of the input file. Below is a description of the data presented in this file (see also an [[bambus.evidence.xml|example]]):

The overall flow of the XML file is:

<EVIDENCE>
<LIBRARY>
<INSERT>
<SEQUENCE/>
<SEQUENCE/>
</INSERT>
...
</LIBRARY>
...
<CONTIG>
<SEQUENCE/>
...
</CONTIG>
...
<LINK>
<CONTIG/>
<CONTIG/>
</LINK>
...
</EVIDENCE>

<EVIDENCE this is the parent tag - the whole document occurs within <EVIDENCE> and </EVIDENCE>
ID = "1" some identifier for the file
DATE = "12/15/03" date when file was created
PROJECT = "MyProject" verbose description of the project
PARAMETERS = "" parameters used in creating this file
>

<LIBRARY for each shotgun library you need a separate library tag.
ID = "lib_1" some identifier for the library
NAME = "short" verbose name for the library
MIN = "1200" minimum size of inserts in this library
MAX = "3500" maximum size of inserts in this library
>

<INSERT the insert concept links together the two reads obtained from opposite ends of a clone insert
ID="ins_1" generic identifier for the insert
NAME="GALBZ92" verbose name for the insert
>

<SEQUENCE each insert contains two sequence reads obtained from the opposite ends
ID="seq_1" generic identifier for the sequence
NAME="GALBZ92TF" verbose name for the sequence
>

<CONTIG Each contig in the assembly needs to be represented in the file
ID="contig_1" contig identifier
NAME= "1" contig name (usually the same as the identifier)
LEN="12352" contig length in basepairs
>

<SEQUENCE each sequence in the contig is listed together with information about the position within the contig.
ID="seq_1" sequence identifier (must match one described in the library section)
ORI="BE" orientation of sequence: BE - forward, EB - reverse
ASM_LEND="0" coordinate of sequence's left end within the contig
ASM_REND="525" coordinate of sequence's right end within the contig
>

<LINK this is a generic link between two contigs (i.e. not inferred from mate-pair data)
ID = "link_1" link identifier
SIZE = "-800" size of gap between the two contigs
TYPE = "MUMmer" link type: links with the same type get grouped into a virtual "library" used in specifying priorities
>

<CONTIG each link contains two contigs
ID = "contig_1" contig identifier: must match one described in the contig section above
ORI = "EB" contig orientation: BE - forward, EB - reverse
> each contig record may contain free-form data specifying the evidence for linking: e.g. alignment data

=== .out.xml ===

Represents, together with the evidence file, a description of the layout of the contigs. For each scaffold, the order (given as a coordinate along a chromosome) and the orientation of each contig is presented, together with a list of all the links used to generate this layout. Some links are deemed invalid, being given a code of "LEN" in case the length was deemed incorrect, or "ORI" in case the link orientation was considered incorrect. This file is also specified by the [[bambus.dtd|DTD]] and an example ([[bambus.out.xml|data/sample_files/bambus.out.xml]]).

=== .stats file ===

Finally, BAMBUS outputs a summary of the scaffolds generated. This file is pretty much self-explanatory, as evidenced by this example ([[bambus.stats|data/sample_files/bambus.stats]]). Note that in the .stats file, the N50 sizes are computed with respect to the total span of the scaffolds unless a different genome size is specified in a file called genome.size.

=== .details file ===

The .details file contains detailed information about the linking information between adjacent contigs. The contigs' orientation, size, and coordinates are listed and then all the linking data grouped by validity and library. An example is provided in [[bambus.details|data/sample_files/bambus.details]].

=== .dot file ===

Represents a graphical representation of the scaffolds in GraphViz format. Please see the [http://www.research.att.com/sw/tools/graphviz/ AT&T GraphViz] website for more information on this file format.

=== .lib file ===
Lists the libraries and their internal IDs.

=== .ps file ===

A postscript image generated from the GraphViz-formatted file. It can be obtained from the .dot file with the command:
dot -Tps -o prefix.ps prefix.dot

An example is given in the picture below.
[[Image:bambus-display.jpg]]

Each scaffold is placed in a box, labeled with some statistics on the scaffold size (number of contigs, number of bases and span). All the "gaps" - linking relationships between the contigs are represented as edges. Each edge is decorated with the number of links contributing to it, the number of links from each library type, and the number of links invalidated due to incorrect length (L) or orientation (O). Each contig's ID, size, and coordinates within the scaffold are also listed.

=== .oo file ===

This file contains a summary of the order and orientation of all contigs present in the data. Each scaffold starts with a FASTA-like header containing the identifier for the scaffold followed by the number of contigs, size and span. Within each scaffold, all contigs are listed in the scaffold order, followed by the string BE for those in the forward orientation and EB for those in the reverse orientation. An example file is shown in: [[bambus.oo|data/sample_files/bambus.oo]].

=== .sum file ===

This file contains one line for each scaffold. Each line contains the scaffold ID, number of contigs, scaffold size, and span. The four values are separated by TAB characters. See an example in [[bambus.sum|data/sample_files/bambus.sum]].

== Known problems ==
There is a small "off-by-one" error in computing contig coordinates. It should not really affect the usefulness of the output.

== Contact information ==

BAMBUS is currently provided AS-IS, in other words we do not provide any support for the software. We would, however, like to hear your comments and suggestions. For Bambus bug reports, support requests, or any other inquiries please browse our SourceForge project page or Email us at:

amos-help (at) lists (dot) sourceforge (dot) net

BAMBUS was written by Mihai Pop and Dan Kosack.

[[Category:Bambus]]

Bambus/printScaff

2011-01-07T14:57:34Z

Dmb000006:

The '''printScaff''' command is a part of the [[Bambus Manual|Bambus]] pipeline.

PrintScaff requires as inputs the [[Bambus Manual#.evidence.xml|.evidence.xml]] file, the [[Bambus Manual#.out.xml|.out.xml]] file and the [[Bambus Manual#.lib file|.lib]] files produced by Bambus, the minimal invocation being:

printScaff -e test.evidence.xml -s test.out.xml -l test.lib

Optionally you may specify the output prefix with option -o (just like in the [[Bambus Manual|Bambus invocation]]). PrintScaff has parameters that control what gets reported, and in what format.

In the first category you have:
* -dot - produce a .dot file
* -detail - produce a .details file
* -oo - produce a [[Bambus Manual#.oo file|.oo]] file listing all the contigs in each scaffold
* -sum - produce a [[Bambus Manual#.sum file|.sum]] tab delimited list of scaffold stats (#contigs, size, and span)
* -f <fasta_file> - generate a pseudo-molecule for each scaffold using the contig sequences listed in <fasta_file>. Note that the contigs in the scaffold files are named "contig_<id>" while the contigs in the <fasta_file> must simply be called "<id>". Note that this option together with the default -merge option (see below) is meaningless unless you've [[Bambus Manual#Untangling scaffolds|untangled]] the scaffold.
* -phys - create a .phys file listing all "gaps" spanned only by the specified libraries:

printScaff -e test.evidence.xml -s test.out.xml -l test.lib -phys MUMmer

will generate a file called test.phys that contains all contig pairs linked by nothing but MUMmer links.

In the second category you have:
* -page - produce .dot file formatted for printing on 8.5x11" paper
* -plot - produce .dot file formatted for printing on a plotter (36x48" paper)
* -unused - draw edges corresponding to unused links
* -merge - (default) when the -f option is given, produce a pseudo-molecule for each scaffold by adding 60 N characters between the contigs in the scaffold
* -nomerge - when the -f option is given, create a fasta file containing all contigs in each scaffold in the correct orientation

[[Category:Bambus]]

Bambus/printScaff

2011-01-07T14:54:28Z

Dmb000006: Fixing 'internal links' to point back at the bambus manual

The '''printScaff''' command is a part of the [[Bambus]] pipeline.

PrintScaff requires as inputs the [[Bambus Manual#.evidence.xml|.evidence.xml]] file, the [[Bambus Manual#.out.xml|.out.xml]] file and the [[Bambus Manual#.lib file|.lib]] files produced by Bambus, the minimal invocation being:

printScaff -e test.evidence.xml -s test.out.xml -l test.lib

Optionally you may specify the output prefix with option -o (just like in the [[Bambus Manual|Bambus invocation]]). PrintScaff has parameters that control what gets reported, and in what format.

In the first category you have:
* -dot - produce a .dot file
* -detail - produce a .details file
* -oo - produce a [[Bambus Manual#.oo file|.oo]] file listing all the contigs in each scaffold
* -sum - produce a [[Bambus Manual#.sum file|.sum]] tab delimited list of scaffold stats (#contigs, size, and span)
* -f <fasta_file> - generate a pseudo-molecule for each scaffold using the contig sequences listed in <fasta_file>. Note that the contigs in the scaffold files are named "contig_<id>" while the contigs in the <fasta_file> must simply be called "<id>". Note that this option together with the default -merge option (see below) is meaningless unless you've [[Bambus Manual#Untangling scaffolds|untangled]] the scaffold.
* -phys - create a .phys file listing all "gaps" spanned only by the specified libraries:

printScaff -e test.evidence.xml -s test.out.xml -l test.lib -phys MUMmer

will generate a file called test.phys that contains all contig pairs linked by nothing but MUMmer links.

In the second category you have:
* -page - produce .dot file formatted for printing on 8.5x11" paper
* -plot - produce .dot file formatted for printing on a plotter (36x48" paper)
* -unused - draw edges corresponding to unused links
* -merge - (default) when the -f option is given, produce a pseudo-molecule for each scaffold by adding 60 N characters between the contigs in the scaffold
* -nomerge - when the -f option is given, create a fasta file containing all contigs in each scaffold in the correct orientation

Bambus Manual

2011-01-07T14:49:54Z

Dmb000006: Making an entry for the .lib file and correcting a within page link to that entry.

{| align="right"
| __TOC__
|}

The output of most shotgun sequence assembly programs (such as TIGR Assembler, phrap, or CAP3) consists of a set of unrelated contigs, whose order and orientation along the chromosome are unknown. Scaffolding represents the task of ordering and orienting these contigs by using additional information about their relative placement. Traditionally such information was identified from the pairing of reads from the opposite ends of an insert in double-barrelled shotgun experiments (see figure).

[[Image:scaffold image.gif]]

Besides clone mate information, homology data, physical maps, or gene synteny information can be used to derive relationships between contigs. BAMBUS is a program written to handle all such types of data in a generic fashion in order to build contig scaffolds. In order to account for the varied quality of linking information BAMBUS allows the user to specify a hierarchy that will consider the most reliable linking data first, then expand the scaffolds using less reliable data. This approach minimizes the effect of errors inherent to experimentally-derived linking information.

=== Algorithm overview ===

BAMBUS has two main modes of operation: hierarchical - when the links are considered in the order of their priorities, and standard - when all the links are considered at the same priority level. In the latter case, all links connecting two contigs are bundled together to form a "gap", or "edge" if you look at the problem from a graph theoretical point of view. Each link is checked for validity before being added to a bundle. Thus, a link is considered invalid due to a length constraint if it forces the two contigs to overlap. In other words, if we assume the largest possible size for the insert corresponding to the link, if the coordinates of the mate-pair within the contigs force the contigs to overlap, we discard the link assuming it is due to a misassembly (Note: this behaviour can be specified on a link-by-link basis). Links that pass this test are further checked for consistency. If they do not all agree in the relative orientations of the contigs they imply, a majority rule is used to retain only those links that agree both in orientation and length. Furthermore, the bundle, or edge, is retained if it contains at least two links (this parameter, redundancy, can actually be tuned by the user). After this step is complete, the resulting graph gets traversed twice, once to assign consistent orientations to all the contigs, and the second time to determine the order of the contigs along the chromosome. Note that inconsistencies in order are currently allowed since they can provide finishing teams with useful information. In a future version of BAMBUS we will add an option to allow the generation of unambiguous scaffolds as well. Currently you can generate such unambiguous scaffolds using the [[#Untangling scaffolds|untangle]] program.

== Installation ==

=== Prerequisites ===

* Install AMOS
* Perl 5.6 or later
* [http://www.cpan.org/ XML::Parser] perl module
* [http://www.cpan.org/ Config::IniFiles] perl module
* [http://www.research.att.com/sw/tools/graphviz/ GraphViz] package

=== Obtaining BAMBUS ===

BAMBUS 2.3 is available free of charge under the open-source Artistic License.

The Bambus source is freely available for download from the File Release Section of our SourceForge project page.

To receive information regarding new releases and developments, please subscribe to our moderated, low-traffic users' mailing list:

amos-users(at)lists(dot)sourceforge(dot)net

=== Actual installation ===

1. Find a directory where you want the software installed. For example /users/home/CoolUser

2. Place the tar file bambus-2.33.tar.gz in this directory

3. Unpack the distribution
% tar xvzf bambus-2.33.tar.gz

4. Go into the newly created directory
% cd bambus-2.33

5. Carefully read the documentation

6. Edit the file Makefile and change the value of BASEDIR to the correct installation path, for example:
BASEDIR = /users/home/CoolUser

7. Also check if the path for perl (in variable PERL) matches the one on your system. Several common options are:
PERL = /usr/local/bin/perl
PERL = /usr/bin/perl

8. Type 'gmake install' to have all the software installed. The executables will be in /users/home/CoolUser/bin/ and the documentation in /users/home/CoolUser/doc.

9. To get started you can try out the small test dataset provided in data/test_run by typing:

goBambus -c test.contig -m test.mates -o test-bambus

== Running BAMBUS ==

The input to Bambus consists of a set of links between contigs. These links can be inferred from the pairing of reads belonging to the same insert, from physical map data, or from alignment to another genome. To accommodate these types of sources of linking information, and to allow for further extensions, the input to Bambus is presented in a general purpose XML format.

The following sections describe how the XML file can be generated for each type of linking data. For more information see [[#.evidence.xml|.evidence.xml]].

Linking information is grouped into "libraries". All links within a library have similar parameters. In the case of mate-pair linking information the libraries match the standard definition. For other linking data, for example MUMmer links, the library grouping reflects specific characteristics of the linking data. For example all MUMmer links are grouped within a library called "MUMmer".

=== Command line options ===

Bambus consists in a collection of programs controlled by a script called goBambus. The execution is controlled by the following sets of parameters. To obtain a list of all parameters and basic help information you need to type:

goBambus -h
or
goBambus -help

==== Configuration options ====

* -C <conf_file> . This parameter specifies the configuration file used by Bambus. A configuration file is not required, however it is useful if you want to change the default parameters. See [[#The configuration file|the configuration file]] for a detailed description of the configuration information.

Whenever you run Bambus it will generate a default configuration file called default.conf. If you want to modify the parameters, you will need to rename this file and then edit it.

==== Input Options ====

* -a <asm_file> . Use a TIGR Assembler .asm file as source of linking information.
* -c <contig_file> . Use a GDE formatted .contig assembly file as source of linking information. The .contig files can be obtained either from TIGR Assembler output (by concatenating the contents of the .align directory) or by converting .ace files (created by phrap or Consed) using the [[ace2contig]] package.
* -x <xml_file>. Use additional XML formatted linking information. You can use the -x option multiple times. See [[#.evidence.xml|.evidence.xml]] for the XML file format.
* -mx <make_file>. Use a gmake compatible make file to specify how the additional XML information is generated. Each of the files created must end in ".xml". Moreover, only the final XML targets are allowed to end in ".xml" to prevent goBambus from getting confused. See [[#Using Makefiles|using Makefiles]] for a more detailed description on using makefiles.

==== Mate-pair Information ====

Assembly output (such as .asm and .contig files) does not generally contain any information about the pairing of reads coming from opposite ends of the same insert, nor about the membership of inserts to libraries. The following parameters specify how to obtain this mate-pair information.

* -m <mates_file> . The mates file allows you to specify both library information and mate pairing information, using a system based on Perl regular expressions. See [[#The .mates file|the .mates file]].
* -D <database> . At TIGR, instead of a mates file you can opt to use a database to obtain library and mate information. Note that the -m and -D options are complementary. The following three parameters specify database options:
* -S <server> . Select a specific database server
* -U <user> . Log in with the specified user name
* -P <passwd> . Log in with the specified password

==== Output Options ====

* -o <output_prefix> . All the output file names will be generated from the <output_prefix> prefix.

==== Repeat Screening ====

Misassembled repeats can confuse Bambus, therefore it makes sense to screen the known repeats that may be misassembled. Bambus can use two types of repeat screening information:

* -r . Screen against the repeats listed in the database (option -D must be provided)
* -r <repfile> . Screen against repeats described in [[#Repeat files|repeat files]].

==== Flow-control ====

* -start <num>
* -end <num>. The execution of Bambus consists in a set of steps. The user can choose to start or end at a specific step for example in the case when she modifies the configuration file and wants to re-run just the final steps of the scaffolders. This feature should be used with caution.

=== Typical use cases ===
==== Using assembler output ====

For convenience, Bambus can use the output of an assembly program and automatically generate the required linking information. Two types of information are required:

* the tiling of reads in the assembly
* information about the pairing of reads

The first type of information can be obtained from the .asm or .contig output formats of TIGR Assembler. For more information about these file formats please refer to the [http://www.jcvi.org/cms/publications/listing/abstract/article/tigr-assembler-a-new-tool-for-assembling-large-shotgun-sequencing-projects/ TIGR Assembler documentation]. Note that the .contig file may be stripped of all sequence data as the only information used is that contained in the lines starting with #. In case you are using an assembler other than TIGR Assembler you may need to use the ta2ace package to convert the output into the .contig format.

The second type of information can be obtained from a [[#The .mates file|.mates file]], or from the database in case you are at TIGR.

A typical command line for someone outside of TIGR is:

goBambus -c test.contig -m test.mates -o test

and the execution will generate the following files:

* [[#.stats file|test.stats]] - statistics on scaffolds and libraries
* [[#.details file|test.details]] - detailed information about each contig pair (also called "gap").
* [[#.dot file|test.dot]] - GraphViz formatted description of the contig linkage information.
* [[#.evidence.xml|test.evidence.xml]] - XML representation of all the linking evidence provided.
* [[#.out.xml|test.out.xml]] - XML representation of the scaffolds.
* [[#.lib file|test.lib]] - list of the codes associated with each input library. These codes are reported on the links in the output file. 
* [[#.sum file|test.sum]] - one-line summaries of all scaffolds.
* [[#.oo file|test.oo]] - order and orientation information for all the contigs.

Additionally, the program generates some "working" files that are currently not removed as they provide useful debugging information.

* test.detective.xml - linking information derived from assembler input alone. It's the same as test.evidence.xml unless additional XML files are provided in the input.
* test.inp - input to the core scaffolding engine.
* test.grommit.conf - configuration information for the scaffolding engine

To view the graphical output you need to convert it to Postscript with the command:

dot -Tps -o test.ps test.dot

Then you can view the postscript file with the command:

gv test.ps

At TIGR, instead of specifying a .mates file you can use the database:

goBambus -c test.contig -D gbx -U access -P access -o test

==== Getting more (or less) information from the output ====

You can modify the information you get in the output by directly calling the [[Bambus/printScaff|printScaff]] command.

==== Adding additional XML linking information ====

You can easily add additional linking information as long as it's in a format similar to the [[#.evidence.xml|.evidence.xml file]]. Assuming we have two such files "link1.xml" and "link2.xml" you can call Bambus as follows:

goBambus -c test.contig -m test.mates -x link1.xml -x link2.xml -o test

The result being the same as if you concatenated the XML file inferred from the .contig file together with the two additional XML files.

==== Using MUMmer links ====

In case you are sequencing a genome for which a close relative has been completed, you can use [[MUMmer]] to infer links between contigs. Assume you have the reference genome in a file "ref.fasta" and all the contigs from the genome you are assembling in a file called "test.fasta", you can run MUMmer to align the test genome to the reference:

nucmer -maxmatch ref.fasta test.fasta

The output will be placed in a file called out.delta. You can then convert the alignment information into an XML file suitable for Bambus as follows:

show-tiling -x out.delta > test.mum.xml

And finally you can provide this information to Bambus:

goBambus -c test.contig -m test.mates -x test.mum.xml -o test

==== Using Makefiles ====

If you have multiple XML files, it becomes tedious to generate each file separately and then provide them to Bambus with multiple -x options. You can create a Makefile compatible with GNU make that generates each of the XML files. You must be careful that the targets for all the XML files end in ".xml", and no other targets (such as intermediate targets) have the same suffix. Bambus will assume that all .xml files produced by the Makefile must be added to the input.

An example of a simple Makefile that can generate the mummer information described above is:

test.mum.xml: ref.fasta test.fasta
nucmer -maxmatch ref.fasta test.fasta
show-tiling -x out.delta > test.mum.xml

==== Untangling scaffolds ====

By default, the scaffolds resulting from Bambus are potentially ambiguous as two or more contigs may occupy the same place in the genome. Such situations occur either due to misassembled repeats, or when assembling different haplotypes. Bambus contains a utility that does a best effort attempt to disambiguate the scaffolds, by breaking them into a set of non-ambiguous scaffolds. The algorithm is greedy and does not guarantee that an optimal solution (e.g. a minimum number of longest scaffolds) is produced.

To run the untangler type:

untangle -e test.evidence.xml -s test.out.xml -o test.untangle.xml

The test.untangle.xml file has the same format as the output file ([[#.out.xml|.out.xml]]) and for each scaffold (e.g. scaff_1) contains one or more unambiguous scaffolds called "scaff_1_1, scaff_1_2, etc". You can use the [[#Getting more (or less) information from the output|printScaff]] command to process the untangled file the same way you would process a normal Bambus output.

== File formats ==

=== The configuration file ===

Scaffolding parameters can be specified in a configuration file. Whenever you run Bambus a default configuration file is generated as default.conf. To create your own file you should rename this file then edit it. The following types of information can be specified:

* Comments. Lines starting with # are considered comments and are ignored by the software.

* Priority information. Specifies an order in which the libraries are considered. For example, the following line specifies priority 2 for all MUMmer links. The libraries are considered in the increasing order of priorities.

priority MUMmer 2

If no configuration file is provided, all libraries are assumed to have the same priority. If a configuration file is provided, only those libraries that have a priority record are used in scaffolding. In other words, if a library has no priority record associated with it, it will not be used in scaffolding.

A priority can also be specified for a specific redundancy parameter. For example, the following line specifies that those links between contigs supported by 4 or more sources of linking data should be processed at priority level 1.

priority redundancy 4 1

Note, however, that you need to also specify a set of libraries to be used at this priority level. A simple shortcut is specifying that all link classes should be processed at this redundancy.

priority ALL 1

If you are uncertain of which libraries your project uses, run Bambus without a configuration file. The libraries will be listed in a file ending in "[[#.lib file|.lib]]".

* Redundancy information. Specifies the minimum number of links required for a valid inter-contig link. You can specify this parameter as a global value:

redundancy 2

that is, all contigs must be linked by at least two links. You can also specify a per-library value like below which says a single MUMmer link is required to link two contigs.

redundancy MUMmer 1

* Minimum scaffold size. Specifies a scaffold size cutoff. For example, the following line implies that only scaffolds that contain more than 10 kbp are present in the output:

mingroupsize 10000

* Link size error. For some link types, determining the exact link length can only be done within a certain error. This parameter specifies, as a percentage, the estimated error in size determination. For example, since MUMmer links make an assumption of evolutionary closeness between two genomes, the error associated with them should be proportional to the estimated evolutionary distance (as nucleotide % similarity) between the two genomes. For an error of 5% use this parameter:

error MUMmer 0.05

* Overlapping contigs allowed. For each library you can control the link validation process by specifying whether adjacent contigs are allowed to overlap or not. By default mate-pair links assume adjacent contigs do not overlap, while MUMmer links allow contigs to overlap. This is equivalent to the configuration parameters:

overlaps lib_1 N
overlaps MUMmer Y

An example is provided in [[bambus.conf|data/sample_files/bambus.conf]].

=== The .mates file ===

The .mates file provides two types of information: library data, and mate-pair relationships between reads.

Library data can be described in two formats:
library <name> <min_size> <max_size>

and
library <name> <min_size> <max_size> <regexp>

Both formats require you to name each library and to provide a size range for the inserts belonging to it. The second format allows you to also provide a Perl regular expression that describes the naming convention for reads belonging to the library. The part of the sequence name that represents the library name must be placed within parentheses. As an example, the regular expression for TIGR sequences (where the library is specified by the first 4 characters) is:
(....).*

Mate-pair relationships can also be described in two ways:
pair <regexp_forw> <regexp_rev>

or
<seq_forw_1> <seq_rev_1> <library_name>
<seq_forw_2> <seq_rev_2> <library_name>
<seq_forw_3> <seq_rev_3> <library_name>
...

The first format requires two regular expressions corresponding to the forward and reverse mates of an insert. Just like the library record, the portion of the name corresponding to the insert name must be placed within parentheses. Two reads that match the two regular expressions, and have the exact same section matched within the parentheses will be considered mates. The library will be determined from the regular expression associated with a "library" record. As an example, at TIGR the first 7 characters represent the insert, then are followed by an optional T or P and the primer name (F or R for Forward or Reverse). The corresponding regular expression is:
pair (.......)[TP]?F (.......)[TP]?R

The second format is simply a list of sequence name pairs each followed by the name of the library they belong to. The corresponding library records must be listed in the file before the pairing data.

'''Note that fields in the .mates file must be separated by TAB characters otherwise the program will report an error.'''

An example of a .mates file using the phred/phrap naming convention is provided in [[bambus.mates|data/sample_files/bambus.mates]].

=== Repeat files ===

The repeat files used in screening linking data conform to the output standards of the repeatFinder program. The file contains 5 TAB-delimited values: contig ID, repeat name, left and right coordinates within the contig, and contig class. Only the first 4 values are essential as far as Bambus is concerned. Here is an example of repeat records:

AC009139.7.3 RPT1A 14554 96820 1
AC026498.3.2 RPT1B 124208 206475 1

=== .evidence.xml ===

This provides an XML representation of the input file. Below is a description of the data presented in this file (see also an [[bambus.evidence.xml|example]]):

The overall flow of the XML file is:

<EVIDENCE>
<LIBRARY>
<INSERT>
<SEQUENCE/>
<SEQUENCE/>
</INSERT>
...
</LIBRARY>
...
<CONTIG>
<SEQUENCE/>
...
</CONTIG>
...
<LINK>
<CONTIG/>
<CONTIG/>
</LINK>
...
</EVIDENCE>

<EVIDENCE this is the parent tag - the whole document occurs within <EVIDENCE> and </EVIDENCE>
ID = "1" some identifier for the file
DATE = "12/15/03" date when file was created
PROJECT = "MyProject" verbose description of the project
PARAMETERS = "" parameters used in creating this file
>

<LIBRARY for each shotgun library you need a separate library tag.
ID = "lib_1" some identifier for the library
NAME = "short" verbose name for the library
MIN = "1200" minimum size of inserts in this library
MAX = "3500" maximum size of inserts in this library
>

<INSERT the insert concept links together the two reads obtained from opposite ends of a clone insert
ID="ins_1" generic identifier for the insert
NAME="GALBZ92" verbose name for the insert
>

<SEQUENCE each insert contains two sequence reads obtained from the opposite ends
ID="seq_1" generic identifier for the sequence
NAME="GALBZ92TF" verbose name for the sequence
>

<CONTIG Each contig in the assembly needs to be represented in the file
ID="contig_1" contig identifier
NAME= "1" contig name (usually the same as the identifier)
LEN="12352" contig length in basepairs
>

<SEQUENCE each sequence in the contig is listed together with information about the position within the contig.
ID="seq_1" sequence identifier (must match one described in the library section)
ORI="BE" orientation of sequence: BE - forward, EB - reverse
ASM_LEND="0" coordinate of sequence's left end within the contig
ASM_REND="525" coordinate of sequence's right end within the contig
>

<LINK this is a generic link between two contigs (i.e. not inferred from mate-pair data)
ID = "link_1" link identifier
SIZE = "-800" size of gap between the two contigs
TYPE = "MUMmer" link type: links with the same type get grouped into a virtual "library" used in specifying priorities
>

<CONTIG each link contains two contigs
ID = "contig_1" contig identifier: must match one described in the contig section above
ORI = "EB" contig orientation: BE - forward, EB - reverse
> each contig record may contain free-form data specifying the evidence for linking: e.g. alignment data

=== .out.xml ===

Represents, together with the evidence file, a description of the layout of the contigs. For each scaffold, the order (given as a coordinate along a chromosome) and the orientation of each contig is presented, together with a list of all the links used to generate this layout. Some links are deemed invalid, being given a code of "LEN" in case the length was deemed incorrect, or "ORI" in case the link orientation was considered incorrect. This file is also specified by the [[bambus.dtd|DTD]] and an example ([[bambus.out.xml|data/sample_files/bambus.out.xml]]).

=== .stats file ===

Finally, BAMBUS outputs a summary of the scaffolds generated. This file is pretty much self-explanatory, as evidenced by this example ([[bambus.stats|data/sample_files/bambus.stats]]). Note that in the .stats file, the N50 sizes are computed with respect to the total span of the scaffolds unless a different genome size is specified in a file called genome.size.

=== .details file ===

The .details file contains detailed information about the linking information between adjacent contigs. The contigs' orientation, size, and coordinates are listed and then all the linking data grouped by validity and library. An example is provided in [[bambus.details|data/sample_files/bambus.details]].

=== .dot file ===

Represents a graphical representation of the scaffolds in GraphViz format. Please see the [http://www.research.att.com/sw/tools/graphviz/ AT&T GraphViz] website for more information on this file format.

=== .lib file ===
Lists the libraries and their internal ID's

=== .ps file ===

A postscript image generated from the GraphViz-formatted file. It can be obtained from the .dot file with the command:
dot -Tps -o prefix.ps prefix.dot

An example is given in the picture below.
[[Image:bambus-display.jpg]]

Each scaffold is placed in a box, labeled with some statistics on the scaffold size (number of contigs, number of bases and span). All the "gaps" - linking relationships between the contigs are represented as edges. Each edge is decorated with the number of links contributing to it, the number of links from each library type, and the number of links invalidated due to incorrect length (L) or orientation (O). Each contig's ID, size, and coordinates within the scaffold are also listed.

=== .oo file ===

This file contains a summary of the order and orientation of all contigs present in the data. Each scaffold starts with a FASTA-like header containing the identifier for the scaffold followed by the number of contigs, size and span. Within each scaffold, all contigs are listed in the scaffold order, followed by the string BE for those in the forward orientation and EB for those in the reverse orientation. An example file is shown in: [[bambus.oo|data/sample_files/bambus.oo]].

=== .sum file ===

This file contains one line for each scaffold. Each line contains the scaffold ID, number of contigs, scaffold size, and span. The four values are separated by TAB characters. See an example in [[bambus.sum|data/sample_files/bambus.sum]].

== Known problems ==
There is a small "off-by-one" error in computing contig coordinates. It should not really affect the usefulness of the output.

== Contact information ==

BAMBUS is currently provided AS-IS, in other words we do not provide any support for the software. We would, however, like to hear your comments and suggestions. For Bambus bug reports, support requests, or any other inquiries please browse our SourceForge project page or Email us at:

amos-help (at) lists (dot) sourceforge (dot) net

BAMBUS was written by Mihai Pop and Dan Kosack.

Bambus Manual

2011-01-07T14:48:11Z

Dmb000006: /* .dot file */

{| align="right"
| __TOC__
|}

The output of most shotgun sequence assembly programs (such as TIGR Assembler, phrap, or CAP3) consists of a set of unrelated contigs, whose order and orientation along the chromosome are unknown. Scaffolding represents the task of ordering and orienting these contigs by using additional information about their relative placement. Traditionally such information was identified from the pairing of reads from the opposite ends of an insert in double-barreled shotgun experiments (see figure).

[[Image:scaffold image.gif]]

Besides clone mate information, homology data, physical maps, or gene synteny information can be used to derive relationships between contigs. BAMBUS is a program written to handle all such types of data in a generic fashion in order to build contig scaffolds. In order to account for the varied quality of linking information BAMBUS allows the user to specify a hierarchy that will consider the most reliable linking data first, then expand the scaffolds using less reliable data. This approach minimizes the effect of errors inherent to experimentally-derived linking information.

=== Algorithm overview ===

BAMBUS has two main modes of operation: hierarchical - when the links are considered in the order of their priorities, and standard - when all the links are considered at the same priority level. In the latter case, all links connecting two contigs are bundled together to form a "gap", or "edge" if you look at the problem from a graph theoretical point of view. Each link is checked for validity before being added to a bundle. Thus, a link is considered invalid due to a length constraint if it forces the two contigs to overlap. In other words, if we assume the largest possible size for the insert corresponding to the link, if the coordinates of the mate-pair within the contigs force the contigs to overlap, we discard the link assuming it is due to a misassembly (Note: this behaviour can be specified on a link-by-link basis). Links that pass this test are further checked for consistency. If they do not all agree in the relative orientations of the contigs they imply, a majority rule is used to retain only those links that agree both in orientation and length. Furthermore, the bundle, or edge, is retained if it contains at least two links (this parameter, redundancy, can actually be tuned by the user). After this step is complete, the resulting graph gets traversed twice, once to assign consistent orientations to all the contigs, and the second time to determine the order of the contigs along the chromosome. Note that inconsistencies in order are currently allowed since they can provide finishing teams with useful information. In a future version of BAMBUS we will add an option to allow the generation of unambiguous scaffolds as well. Currently you can generate such unambiguous scaffolds using the [[#Untangling scaffolds|untangle]] program.

== Installation ==

=== Prerequisites ===

* Install AMOS
* Perl 5.6 or later
* [http://www.cpan.org/ XML::Parser] perl module
* [http://www.cpan.org/ Config::IniFiles] perl module
* [http://www.research.att.com/sw/tools/graphviz/ GraphViz] package

=== Obtaining BAMBUS ===

BAMBUS 2.3 is available free of charge under the open-source Artistic License.

The Bambus source is freely available for download from the File Release Section of our SourceForge project page.

To receive information regarding new releases and developments, please subscribe to our moderated, low-traffic users' mailing list:

amos-users(at)lists(dot)sourceforge(dot)net

=== Actual installation ===

1. Find a directory where you want the software installed. For example /users/home/CoolUser

2. Place the tar file bambus-2.33.tar.gz in this directory

3. Unpack the distribution
% tar xvzf bambus-2.33.tar.gz

4. Go into the newly created directory
% cd bambus-2.33

5. Carefully read the documentation

6. Edit the file Makefile and change the value of BASEDIR to the correct installation path, for example:
BASEDIR = /users/home/CoolUser

7. Also check if the path for perl (in variable PERL) matches the one on your system. Several common options are:
PERL = /usr/local/bin/perl
PERL = /usr/bin/perl

8. Type 'gmake install' to have all the software installed. The executables will be in /users/home/CoolUser/bin/ and the documentation in /users/home/CoolUser/doc.

9. To get started you can try out the small test dataset provided in data/test_run by typing:

goBambus -c test.contig -m test.mates -o test-bambus

== Running BAMBUS ==

The input to Bambus consists of a set of links between contigs. These links can be inferred from the pairing of reads belonging to the same insert, from physical map data, or from alignment to another genome. To accommodate these sources of linking information, and to allow for further extensions, the input to Bambus is presented in a general-purpose XML format.

The following sections describe how the XML file can be generated for each type of linking data. For more information see [[#.evidence.xml|.evidence.xml]].

Linking information is grouped into "libraries". All links within a library have similar parameters. In the case of mate-pair linking information the libraries match the standard definition. For other linking data, for example MUMmer links, the library grouping reflects specific characteristics of the linking data. For example all MUMmer links are grouped within a library called "MUMmer".

=== Command line options ===

Bambus consists in a collection of programs controlled by a script called goBambus. The execution is controlled by the following sets of parameters. To obtain a list of all parameters and basic help information you need to type:

goBambus -h
or
goBambus -help

==== Configuration options ====

* -C <conf_file> . This parameter specifies the configuration file used by Bambus. A configuration file is not required, however it is useful if you want to change the default parameters. See [[#The configuration file|the configuration file]] for a detailed description of the configuration information.

Whenever you run Bambus it will generate a default configuration file called default.conf. If you want to modify the parameters, you will need to rename this file and then edit it.

==== Input Options ====

* -a <asm_file> . Use a TIGR Assembler .asm file as source of linking information.
* -c <contig_file> . Use a GDE formatted .contig assembly file as source of linking information. The .contig files can be obtained either from TIGR Assembler output (by concatenating the contents of the .align directory) or by converting .ace files (created by phrap or Consed) using the [[ace2contig]] package.
* -x <xml_file>. Use additional XML formatted linking information. You can use the -x option multiple times. See [[#.evidence.xml|.evidence.xml]] for the XML file format.
* -mx <make_file>. Use a gmake compatible make file to specify how the additional XML information is generated. Each of the files created must end in ".xml". Moreover, only the final XML targets are allowed to end in ".xml" to prevent goBambus from getting confused. See [[#Using Makefiles|using Makefiles]] for a more detailed description on using makefiles.

==== Mate-pair Information ====

Assembly output (such as .asm and .contig files) does not generally contain any information about the pairing of reads coming from opposite ends of the same insert, nor about the membership of inserts to libraries. The following parameters specify how to obtain this mate-pair information.

* -m <mates_file> . The mates file allows you to specify both library information and mate pairing information, using a system based on Perl regular expressions. See [[#The .mates file|the .mates file]].
* -D <database> . At TIGR, instead of a mates file you can opt to use a database to obtain library and mate information. Note that the -m and -D options are complementary. The following three parameters specify database options:
* -S <server> . Select a specific database server
* -U <user> . Log in with the specified user name
* -P <passwd> . Log in with the specified password

==== Output Options ====

* -o <output_prefix> . All the output file names will be generated from the <output_prefix> prefix.

==== Repeat Screening ====

Misassembled repeats can confuse Bambus, therefore it makes sense to screen the known repeats that may be misassembled. Bambus can use two types of repeat screening information:

* -r . Screen against the repeats listed in the database (option -D must be provided)
* -r <repfile> . Screen against repeats described in [[#Repeat files|repeat files]].

==== Flow-control ====

* -start <num>
* -end <num>. The execution of Bambus consists in a set of steps. The user can choose to start or end at a specific step for example in the case when she modifies the configuration file and wants to re-run just the final steps of the scaffolders. This feature should be used with caution.

=== Typical use cases ===
==== Using assembler output ====

For convenience, Bambus can use the output of an assembly program and automatically generate the required linking information. Two types of information are required:

* the tiling of reads in the assembly
* information about the pairing of reads

The first type of information can be obtained from the .asm or .contig output formats of TIGR Assembler. For more information about these file formats please refer to the [http://www.jcvi.org/cms/publications/listing/abstract/article/tigr-assembler-a-new-tool-for-assembling-large-shotgun-sequencing-projects/ TIGR Assembler documentation]. Note that the .contig file may be stripped of all sequence data as the only information used is that contained in the lines starting with #. In case you are using an assembler other than TIGR Assembler you may need to use the ta2ace package to convert the output into the .contig format.

The second type of information can be obtained from a [[#The .mates file|.mates file]], or from the database in case you are at TIGR.

A typical command line for someone outside of TIGR is:

goBambus -c test.contig -m test.mates -o test

and the execution will generate the following files:

* [[#.stats file|test.stats]] - statistics on scaffolds and libraries
* [[#.details file|test.details]] - detailed information about each contig pair (also called "gap").
* [[#.dot file|test.dot]] - GraphViz formatted description of the contig linkage information.
* [[#.evidence.xml|test.evidence.xml]] - XML representation of all the linking evidence provided.
* [[#.out.xml|test.out.xml]] - XML representation of the scaffolds.
* [[#.lib file|test.lib]] - list of the codes associated with each input library. These codes are reported on the links in the output file.
* [[#.sum file|test.sum]] - one-line summaries of all scaffolds.
* [[#.oo file|test.oo]] - order and orientation information for all the contigs.

Additionally, the program generates some "working" files that are currently not removed as they provide useful debugging information.

* test.detective.xml - linking information derived from assembler input alone. It's the same as test.evidence.xml unless additional XML files are provided in the input.
* test.inp - input to the core scaffolding engine.
* test.grommit.conf - configuration information for the scaffolding engine

To view the graphical output you need to convert it to Postscript with the command:

dot -Tps -o test.ps test.dot

Then you can view the postscript file with the command:

gv test.ps

At TIGR, instead of specifying a .mates file you can use the database:

goBambus -c test.contig -D gbx -U access -P access -o test

==== Getting more (or less) information from the output ====

You can modify the information you get in the output by directly calling the [[Bambus/printScaff|printScaff]] command.

==== Adding additional XML linking information ====

You can easily add additional linking information as long as it's in a format similar to the [[#.evidence.xml|.evidence.xml file]]. Assuming we have two such files, "link1.xml" and "link2.xml", you can call Bambus as follows:

goBambus -c test.contig -m test.mates -x link1.xml -x link2.xml -o test

The result is the same as if you concatenated the XML file inferred from the .contig file together with the two additional XML files.

==== Using MUMmer links ====

In case you are sequencing a genome for which a close relative has been completed, you can use [[MUMmer]] to infer links between contigs. Assume you have the reference genome in a file "ref.fasta" and all the contigs from the genome you are assembling in a file called "test.fasta", you can run MUMmer to align the test genome to the reference:

nucmer -maxmatch ref.fasta test.fasta

The output will be placed in a file called out.delta. You can then convert the alignment information into an XML file suitable for Bambus as follows:

show-tiling -x out.delta > test.mum.xml

And finally you can provide this information to Bambus:

goBambus -c test.contig -m test.mates -x test.mum.xml -o test

==== Using Makefiles ====

If you have multiple XML files, it becomes tedious to generate each file separately and then provide them to Bambus with multiple -x options. You can create a Makefile compatible with GNU make that generates each of the XML files. You must be careful that the targets for all the XML files end in ".xml", and no other targets (such as intermediate targets) have the same suffix. Bambus will assume that all .xml files produced by the Makefile must be added to the input.

An example of a simple Makefile that can generate the mummer information described above is:

test.mum.xml: ref.fasta test.fasta
nucmer -maxmatch ref.fasta test.fasta
show-tiling -x out.delta > test.mum.xml

==== Untangling scaffolds ====

By default, the scaffolds resulting from Bambus are potentially ambiguous as two or more contigs may occupy the same place in the genome. Such situations occur either due to misassembled repeats, or when assembling different haplotypes. Bambus contains a utility that does a best effort attempt to disambiguate the scaffolds, by breaking them into a set of non-ambiguous scaffolds. The algorithm is greedy and does not guarantee that an optimal solution (e.g. a minimum number of longest scaffolds) is produced.

To run the untangler type:

untangle -e test.evidence.xml -s test.out.xml -o test.untangle.xml

The test.untangle.xml file has the same format as the output file ([[#.out.xml|.out.xml]]) and for each scaffold (e.g. scaff_1) contains one or more unambiguous scaffolds called "scaff_1_1, scaff_1_2, etc". You can use the [[#Getting more (or less) information from the output|printScaff]] command to process the untangled file the same way you would process a normal Bambus output.

== File formats ==

=== The configuration file ===

Scaffolding parameters can be specified in a configuration file. Whenever you run Bambus a default configuration file is generated as default.conf. To create your own file you should rename this file then edit it. The following types of information can be specified:

* Comments. Lines starting with # are considered comments and are ignored by the software.

* Priority information. Specifies an order in which the libraries are considered. For example, the following line specifies priority 2 for all MUMmer links. The libraries are considered in the increasing order of priorities.

priority MUMmer 2

If no configuration file is provided, all libraries are assumed to have the same priority. If a configuration file is provided, only those libraries that have a priority record are used in scaffolding. In other words, if a library has no priority record associated with it, it will not be used in scaffolding.

A priority can also be specified for a specific redundancy parameter. For example, the following line specifies that those links between contigs supported by 4 or more sources of linking data should be processed at priority level 1.

priority redundancy 4 1

Note, however, that you need to also specify a set of libraries to be used at this priority level. A simple shortcut is specifying that all link classes should be processed at this redundancy.

priority ALL 1

If you are uncertain of which libraries your project uses, run Bambus without a configuration file. The libraries will be listed in a file ending in ".lib".

* Redundancy information. Specifies the minimum number of links required for a valid inter-contig link. You can specify this parameter as a global value:

redundancy 2

that is, all contigs must be linked by at least two links. You can also specify a per-library value like below which says a single MUMmer link is required to link two contigs.

redundancy MUMmer 1

* Minimum scaffold size. Specifies a scaffold size cutoff. For example, the following line implies that only scaffolds that contain more than 10 kbp are present in the output:

mingroupsize 10000

* Link size error. For some link types, determining the exact link length can only be done within a certain error. This parameter specifies, as a percentage, the estimated error in size determination. For example, since MUMmer links make an assumption of evolutionary closeness between two genomes, the error associated with them should be proportional to the estimated evolutionary distance (as nucleotide % similarity) between the two genomes. For an error of 5% use this parameter:

error MUMmer 0.05

* Overlapping contigs allowed. For each library you can control the link validation process by specifying whether adjacent contigs are allowed to overlap or not. By default mate-pair links assume adjacent contigs do not overlap, while MUMmer links allow contigs to overlap. This is equivalent to the configuration parameters:

overlaps lib_1 N
overlaps MUMmer Y

An example is provided in [[bambus.conf|data/sample_files/bambus.conf]].

=== The .mates file ===

The .mates file provides two types of information: library data, and mate-pair relationships between reads.

Library data can be described in two formats:
library <name> <min_size> <max_size>

and
library <name> <min_size> <max_size> <regexp>

Both formats require you to name each library and to provide a size range for the inserts belonging to it. The second format allows you to also provide a Perl regular expression that describes the naming convention for reads belonging to the library. The part of the sequence name that represents the library name must be placed within parentheses. As an example, the regular expression for TIGR sequences (where the library is specified by the first 4 characters) is:
(....).*

Mate-pair relationships can also be described in two ways:
pair <regexp_forw> <regexp_rev>

or
<seq_forw_1> <seq_rev_1> <library_name>
<seq_forw_2> <seq_rev_2> <library_name>
<seq_forw_3> <seq_rev_3> <library_name>
...

The first format requires two regular expressions corresponding to the forward and reverse mates of an insert. Just like the library record, the portion of the name corresponding to the insert name must be placed within parentheses. Two reads that match the two regular expressions, and have the exact same section matched within the parentheses will be considered mates. The library will be determined from the regular expression associated with a "library" record. As an example, at TIGR the first 7 characters represent the insert, then are followed by an optional T or P and the primer name (F or R for Forward or Reverse). The corresponding regular expression is:
pair (.......)[TP]?F (.......)[TP]?R

The second format is simply a list of sequence name pairs each followed by the name of the library they belong to. The corresponding library records must be listed in the file before the pairing data.

'''Note that fields in the .mates file must be separated by TAB characters otherwise the program will report an error.'''

An example of a .mates file using the phred/phrap naming convention is provided in [[bambus.mates|data/sample_files/bambus.mates]].

=== Repeat files ===

The repeat files used in screening linking data conform to the output standards of the repeatFinder program. The file contains 5 TAB-delimited values: contig ID, repeat name, left and right coordinates within the contig, and contig class. Only the first 4 values are essential as far as Bambus is concerned. Here is an example of repeat records:

AC009139.7.3 RPT1A 14554 96820 1
AC026498.3.2 RPT1B 124208 206475 1

=== .evidence.xml ===

This provides an XML representation of the input file. Below is a description of the data presented in this file (see also an [[bambus.evidence.xml|example]]):

The overall flow of the XML file is:

<EVIDENCE>
<LIBRARY>
<INSERT>
<SEQUENCE/>
<SEQUENCE/>
</INSERT>
...
</LIBRARY>
...
<CONTIG>
<SEQUENCE/>
...
</CONTIG>
...
<LINK>
<CONTIG/>
<CONTIG/>
</LINK>
...
</EVIDENCE>

<EVIDENCE this is the parent tag - the whole document occurs within <EVIDENCE> and </EVIDENCE>
ID = "1" some identifier for the file
DATE = "12/15/03" date when file was created
PROJECT = "MyProject" verbose description of the project
PARAMETERS = "" parameters used in creating this file
>

<LIBRARY for each shotgun library you need a separate library tag.
ID = "lib_1" some identifier for the library
NAME = "short" verbose name for the library
MIN = "1200" minimum size of inserts in this library
MAX = "3500" maximum size of inserts in this library
>

<INSERT the insert concept links together the two reads obtained from opposite ends of a clone insert
ID="ins_1" generic identifier for the insert
NAME="GALBZ92" verbose name for the insert
>

<SEQUENCE each insert contains two sequence reads obtained from the opposite ends
ID="seq_1" generic identifier for the sequence
NAME="GALBZ92TF" verbose name for the sequence
>

<CONTIG Each contig in the assembly needs to be represented in the file
ID="contig_1" contig identifier
NAME= "1" contig name (usually the same as the identifier)
LEN="12352" contig length in basepairs
>

<SEQUENCE each sequence in the contig is listed together with information about the position within the contig.
ID="seq_1" sequence identifier (must match one described in the library section)
ORI="BE" orientation of sequence: BE - forward, EB - reverse
ASM_LEND="0" coordinate of sequence's left end within the contig
ASM_REND="525" coordinate of sequence's right end within the contig
>

<LINK this is a generic link between two contigs (i.e. not inferred from mate-pair data)
ID = "link_1" link identifier
SIZE = "-800" size of gap between the two contigs
TYPE = "MUMmer" link type: links with the same type get grouped into a virtual "library" used in specifying priorities
>

<CONTIG each link contains two contigs
ID = "contig_1" contig identifier: must match one described in the contig section above
ORI = "EB" contig orientation: BE - forward, EB - reverse
> each contig record may contain free-form data specifying the evidence for linking: e.g. alignment data

=== .out.xml ===

Represents, together with the evidence file, a description of the layout of the contigs. For each scaffold, the order (given as a coordinate along a chromosome) and the orientation of each contig is presented, together with a list of all the links used to generate this layout. Some links are deemed invalid, being given a code of "LEN" in case the length was deemed incorrect, or "ORI" in case the link orientation was considered incorrect. This file is also specified by the [[bambus.dtd|DTD]] and an example ([[bambus.out.xml|data/sample_files/bambus.out.xml]]).

=== .stats file ===

Finally, BAMBUS outputs a summary of the scaffolds generated. This file is pretty much self-explanatory, as evidenced by this example ([[bambus.stats|data/sample_files/bambus.stats]]). Note that in the .stats file, the N50 sizes are computed with respect to the total span of the scaffolds unless a different genome size is specified in a file called genome.size.

=== .details file ===

The .details file contains detailed information about the linking information between adjacent contigs. The contigs' orientation, size, and coordinates are listed and then all the linking data grouped by validity and library. An example is provided in [[bambus.details|data/sample_files/bambus.details]].

=== .dot file ===

Represents a graphical representation of the scaffolds in GraphViz format. Please see the [http://www.research.att.com/sw/tools/graphviz/ AT&T GraphViz] website for more information on this file format.

=== .lib file ===
Lists the libraries and their internal ID's

=== .ps file ===

A postscript image generated from the GraphViz-formatted file. It can be obtained from the .dot file with the command:
dot -Tps -o prefix.ps prefix.dot

An example is given in the picture below.
[[Image:bambus-display.jpg]]

Each scaffold is placed in a box, labeled with some statistics on the scaffold size (number of contigs, number of bases and span). All the "gaps" - linking relationships between the contigs are represented as edges. Each edge is decorated with the number of links contributing to it, the number of links from each library type, and the number of links invalidated due to incorrect length (L) or orientation (O). Each contig's ID, size, and coordinates within the scaffold are also listed.

=== .oo file ===

This file contains a summary of the order and orientation of all contigs present in the data. Each scaffold starts with a FASTA-like header containing the identifier for the scaffold followed by the number of contigs, size and span. Within each scaffold, all contigs are listed in the scaffold order, followed by the string BE for those in the forward orientation and EB for those in the reverse orientation. An example file is shown in: [[bambus.oo|data/sample_files/bambus.oo]].

=== .sum file ===

This file contains one line for each scaffold. Each line contains the scaffold ID, number of contigs, scaffold size, and span. The four values are separated by TAB characters. See an example in [[bambus.sum|data/sample_files/bambus.sum]].

== Known problems ==
There is a small "off-by-one" error in computing contig coordinates. It should not really affect the usefulness of the output.

== Contact information ==

BAMBUS is currently provided AS-IS, in other words we do not provide any support for the software. We would, however, like to hear your comments and suggestions. For Bambus bug reports, support requests, or any other inquiries please browse our SourceForge project page or Email us at:

amos-help (at) lists (dot) sourceforge (dot) net

BAMBUS was written by Mihai Pop and Dan Kosack.

Bambus Manual

2011-01-07T13:30:33Z

Dmb000006: /* Getting more (or less) information from the output */ Moving printScaff description onto a dedicated page. Wikis are good like this.

{| align="right"
| __TOC__
|}

The output of most shotgun sequence assembly programs (such as TIGR Assembler, phrap, or CAP3) consists of a set of unrelated contigs, whose order and orientation along the chromosome are unknown. Scaffolding represents the task of ordering and orienting these contigs by using additional information about their relative placement. Traditionally such information was identified from the pairing of reads from the opposite ends of an insert in double-barrelled shotgun experiments (see figure).

[[Image:scaffold image.gif]]

Besides clone mate information, homology data, physical maps, or gene synteny information can be used to derive relationships between contigs. BAMBUS is a program written to handle all such types of data in a generic fashion in order to build contig scaffolds. In order to account for the varied quality of linking information BAMBUS allows the user to specify a hierarchy that will consider the most reliable linking data first, then expand the scaffolds using less reliable data. This approach minimizes the effect of errors inherent to experimentally-derived linking information.

=== Algorithm overview ===

BAMBUS has two main modes of operation: hierarchical - when the links are considered in the order of their priorities, and standard - when all the links are considered at the same priority level. In the latter case, all links connecting two contigs are bundled together to form a "gap", or "edge" if you look at the problem from a graph theoretical point of view. Each link is checked for validity before being added to a bundle. Thus, a link is considered invalid due to a length constraint if it forces the two contigs to overlap. In other words, if we assume the largest possible size for the insert corresponding to the link, if the coordinates of the mate-pair within the contigs force the contigs to overlap, we discard the link assuming it is due to a misassembly (Note: this behaviour can be specified on a link-by-link basis). Links that pass this test are further checked for consistency. If they do not all agree in the relative orientations of the contigs they imply, a majority rule is used to retain only those links that agree both in orientation and length. Furthermore, the bundle, or edge, is retained if it contains at least two links (this parameter, redundancy, can actually be tuned by the user). After this step is complete, the resulting graph gets traversed twice, once to assign consistent orientations to all the contigs, and the second time to determine the order of the contigs along the chromosome. Note that inconsistencies in order are currently allowed since they can provide finishing teams with useful information. In a future version of BAMBUS we will add an option to allow the generation of unambiguous scaffolds as well. Currently you can generate such unambiguous scaffolds using the [[#Untangling scaffolds|untangle]] program.

== Installation ==

=== Prerequisites ===

* Install AMOS
* Perl 5.6 or later
* [http://www.cpan.org/ XML::Parser] perl module
* [http://www.cpan.org/ Config::IniFiles] perl module
* [http://www.research.att.com/sw/tools/graphviz/ GraphViz] package

=== Obtaining BAMBUS ===

BAMBUS 2.3 is available free of charge under the open-source Artistic License.

The Bambus source is freely available for download from the File Release Section of our SourceForge project page.

To receive information regarding new releases and developments, please subscribe to our moderated, low-traffic users' mailing list:

amos-users(at)lists(dot)sourceforge(dot)net

=== Actual installation ===

1. Find a directory where you want the software installed. For example /users/home/CoolUser

2. Place the tar file bambus-2.33.tar.gz in this directory

3. Unpack the distribution
% tar xvzf bambus-2.33.tar.gz

4. Go into the newly created directory
% cd bambus-2.33

5. Carefully read the documentation

6. Edit the file Makefile and change the value of BASEDIR to the correct installation path, for example:
BASEDIR = /users/home/CoolUser

7. Also check if the path for perl (in variable PERL) matches the one on your system. Several common options are:
PERL = /usr/local/bin/perl
PERL = /usr/bin/perl

8. Type 'gmake install' to have all the software installed. The executables will be in /users/home/CoolUser/bin/ and the documentation in /users/home/CoolUser/doc.

9. To get started you can try out the small test dataset provided in data/test_run by typing:

goBambus -c test.contig -m test.mates -o test-bambus

== Running BAMBUS ==

The input to Bambus consists of a set of links between contigs. These links can be inferred from the pairing of reads belonging to the same insert, from physical map data, or from alignment to another genome. To accommodate these sources of linking information, and to allow for further extensions, the input to Bambus is presented in a general-purpose XML format.

The following sections describe how the XML file can be generated for each type of linking data. For more information see [[#.evidence.xml|.evidence.xml]].

Linking information is grouped into "libraries". All links within a library have similar parameters. In the case of mate-pair linking information the libraries match the standard definition. For other linking data, for example MUMmer links, the library grouping reflects specific characteristics of the linking data. For example all MUMmer links are grouped within a library called "MUMmer".

=== Command line options ===

Bambus consists of a collection of programs controlled by a script called goBambus. The execution is controlled by the following sets of parameters. To obtain a list of all parameters and basic help information you need to type:

goBambus -h
or
goBambus -help

==== Configuration options ====

* -C <conf_file> . This parameter specifies the configuration file used by Bambus. A configuration file is not required, however it is useful if you want to change the default parameters. See [[#The configuration file|the configuration file]] for a detailed description of the configuration information.

Whenever you run Bambus it will generate a default configuration file called default.conf. If you want to modify the parameters, you will need to rename this file and then edit it.

==== Input Options ====

* -a <asm_file> . Use a TIGR Assembler .asm file as source of linking information.
* -c <contig_file> . Use a GDE formatted .contig assembly file as source of linking information. The .contig files can be obtained either from TIGR Assembler output (by concatenating the contents of the .align directory) or by converting .ace files (created by phrap or Consed) using the [[ace2contig]] package.
* -x <xml_file>. Use additional XML formatted linking information. You can use the -x option multiple times. See [[#.evidence.xml|.evidence.xml]] for the XML file format.
* -mx <make_file>. Use a gmake compatible make file to specify how the additional XML information is generated. Each of the files created must end in ".xml". Moreover, only the final XML targets are allowed to end in ".xml" to prevent goBambus from getting confused. See [[#Using Makefiles|using Makefiles]] for a more detailed description on using makefiles.

==== Mate-pair Information ====

Assembly output (such as .asm and .contig files) does not generally contain any information about the pairing of reads coming from opposite ends of the same insert, nor about the membership of inserts to libraries. The following parameters specify how to obtain this mate-pair information.

* -m <mates_file> . The mates file allows you to specify both library information and mate pairing information, using a system based on Perl regular expressions. See [[#The .mates file|the .mates file]].
* -D <database> . At TIGR, instead of a mates file you can opt to use a database to obtain library and mate information. Note that the -m and -D options are complementary. The following three parameters specify database options:
* -S <server> . Select a specific database server
* -U <user> . Log in with the specified user name
* -P <passwd> . Log in with the specified password

==== Output Options ====

* -o <output_prefix> . All the output file names will be generated from the <output_prefix> prefix.

==== Repeat Screening ====

Misassembled repeats can confuse Bambus, therefore it makes sense to screen the known repeats that may be misassembled. Bambus can use two types of repeat screening information:

* -r . Screen against the repeats listed in the database (option -D must be provided)
* -r <repfile> . Screen against repeats described in [[#Repeat files|repeat files]].

==== Flow-control ====

* -start <num>
* -end <num>. The execution of Bambus consists in a set of steps. The user can choose to start or end at a specific step for example in the case when she modifies the configuration file and wants to re-run just the final steps of the scaffolders. This feature should be used with caution.

=== Typical use cases ===
==== Using assembler output ====

For convenience, Bambus can use the output of an assembly program and automatically generate the required linking information. Two types of information are required:

* the tiling of reads in the assembly
* information about the pairing of reads

The first type of information can be obtained from the .asm or .contig output formats of TIGR Assembler. For more information about these file formats please refer to the [http://www.jcvi.org/cms/publications/listing/abstract/article/tigr-assembler-a-new-tool-for-assembling-large-shotgun-sequencing-projects/ TIGR Assembler documentation]. Note that the .contig file may be stripped of all sequence data as the only information used is that contained in the lines starting with #. In case you are using an assembler other than TIGR Assembler you may need to use the ta2ace package to convert the output into the .contig format.

The second type of information can be obtained from a [[#The .mates file|.mates file]], or from the database in case you are at TIGR.

A typical command line for someone outside of TIGR is:

goBambus -c test.contig -m test.mates -o test

and the execution will generate the following files:

* [[#.stats file|test.stats]] - statistics on scaffolds and libraries
* [[#.details file|test.details]] - detailed information about each contig pair (also called "gap").
* [[#.dot file|test.dot]] - GraphViz formatted description of the contig linkage information.
* [[#.evidence.xml|test.evidence.xml]] - XML representation of all the linking evidence provided.
* [[#.out.xml|test.out.xml]] - XML representation of the scaffolds.
* [[#.ps file|test.lib]] - list of the codes associated with each input library. These codes are reported on the links in the output file. 
* [[#.sum file|test.sum]] - one-line summaries of all scaffolds.
* [[#.oo file|test.oo]] - order and orientation information for all the contigs.

Additionally, the program generates some "working" files that are currently not removed as they provide useful debugging information.

* test.detective.xml - linking information derived from assembler input alone. It's the same as test.evidence.xml unless additional XML files are provided in the input.
* test.inp - input to the core scaffolding engine.
* test.grommit.conf - configuration information for the scaffolding engine

To view the graphical output you need to convert it to Postscript with the command:

dot -Tps -o test.ps test.dot

Then you can view the postscript file with the command:

gv test.ps

At TIGR, instead of specifying a .mates file you can use the database:

goBambus -c test.contig -D gbx -U access -P access -o test

==== Getting more (or less) information from the output ====

You can modify the information you get in the output by directly calling the [[Bambus/printScaff|printScaff]] command.

==== Adding additional XML linking information ====

You can easily add additional linking information as long as it's in a format similar to the [[#.evidence.xml file|.evidence.xml file]]. Assuming we have two such files "link1.xml" and "link2.xml" you can call Bambus as follows:

goBambus -c test.contig -m test.mates -x link1.xml -x link2.xml -o test

The result being the same as if you concatenated the XML file inferred from the .contig file together with the two additional XML files.

==== Using MUMmer links ====

In case you are sequencing a genome for which a close relative has been completed, you can use [[MUMmer]] to infer links between contigs. Assume you have the reference genome in a file "ref.fasta" and all the contigs from the genome you are assembling in a file called "test.fasta", you can run MUMmer to align the test genome to the reference:

nucmer -maxmatch ref.fasta test.fasta

The output will be placed in a file called out.delta. You can then convert the alignment information into an XML file suitable for Bambus as follows:

show-tiling -x out.delta > test.mum.xml

And finally you can provide this information to Bambus:

goBambus -c test.contig -m test.mates -x test.mum.xml -o test

==== Using Makefiles ====

If you have multiple XML files, it becomes tedious to generate each file separately and then provide them to Bambus with multiple -x options. You can create a Makefile compatible with GNU make that generates each of the XML files. You must be careful that the targets for all the XML files end in ".xml", and no other targets (such as intermediate targets) have the same suffix. Bambus will assume that all .xml files produced by the Makefile must be added to the input.

An example of a simple Makefile that can generate the mummer information described above is:

test.mum.xml: ref.fasta test.fasta
nucmer -a max-match ref.fasta test.fasta
show-tiling -x out.delta > test.mum.xml

==== Untangling scaffolds ====

By default, the scaffolds resulting from Bambus are potentially ambiguous as two or more contigs may occupy the same place in the genome. Such situations occur either due to misassembled repeats, or when assembling different haplotypes. Bambus contains a utility that does a best effort attempt to disambiguate the scaffolds, by breaking them into a set of non-ambiguous scaffolds. The algorithm is greedy and does not guarantee that an optimal solution (e.g. a minimum number of longest scaffolds) is produced.

To run the untangler type:

untangle -e test.evidence.xml -s test.out.xml -o test.untangle.xml

The test.untangle.xml file has the same format as the output file ([[#.out.xml|.out.xml]]) and for each scaffold (e.g. scaff_1) contains one or more unambiguous scaffolds called "scaff_1_1, scaff_1_2, etc". You can use the [[#Getting more (or less) information from the output|printScaff]] command to process the untangled file the same way you would process a normal Bambus output.

== File formats ==

=== The configuration file ===

Scaffolding parameters can be specified in a configuration file. Whenever you run Bambus a default configuration file is generated as default.conf. To create your own file you should rename this file then edit it. The following types of information can be specified:

* Comments. Lines starting with # are considered comments and are ignored by the software.

* Priority information. Specifies an order in which the libraries are considered. For example, the following line specifies priority 2 for all MUMmer links. The libraries are considered in the increasing order of priorities.

priority MUMmer 2

If no configuration file is provided, all libraries are assumed to have the same priority. If a configuration file is provided, only those libraries that have a priority record are used in scaffolding. In other words, if a library has no priority record associated with it, it will not be used in scaffolding.

A priority can also be specified for a specific redundancy parameter. For example, the following line specifies that those links between contigs supported by 4 or more sources of linking data should be processed at priority level 1.

priority redundancy 4 1

Note, however, that you need to also specify a set of libraries to be used at this priority level. A simple shortcut is specifying that all link classes should be processed at this redundancy.

priority ALL 1

If you are uncertain of which libraries your project uses, run Bambus without a configuration file. The libraries will be listed in a file ending in ".libs".

* Redundancy information. Specifies the minimum number of links required for a valid inter-contig link. You can specify this parameter as a global value:

redundancy 2

that is, all contigs must be linked by at least two links. You can also specify a per-library value, as in the line below, which says that a single MUMmer link is sufficient to link two contigs.

redundancy MUMmer 1

* Minimum scaffold size. Specifies a scaffold size cutoff. For example, the following line implies that only scaffolds that contain more than 10 kbp are present in the output:

mingroupsize 10000

* Link size error. For some link types, determining the exact link length can only be done within a certain error. This parameter specifies, as a percentage, the estimated error in size determination. For example, since MUMmer links make an assumption of evolutionary closeness between two genomes, the error associated with them should be proportional to the estimated evolutionary distance (as nucleotide % similarity) between the two genomes. For an error of 5% use this parameter:

error MUMmer 0.05

* Overlapping contigs allowed. For each library you can control the link validation process by specifying whether adjacent contigs are allowed to overlap or not. By default mate-pair links assume adjacent contigs do not overlap, while MUMmer links allow contigs to overlap. This is equivalent to the configuration parameters:

overlaps lib_1 N
overlaps MUMmer Y

An example is provided in [[bambus.conf|data/sample_files/bambus.conf]].

=== The .mates file ===

The .mates file provides two types of information: library data, and mate-pair relationships between reads.

Library data can be described in two formats:
library <name> <min_size> <max_size>

and
library <name> <min_size> <max_size> <regexp>

Both formats require you to name each library and to provide a size range for the inserts belonging to it. The second format allows you to also provide a Perl regular expression that describes the naming convention for reads belonging to the library. The part of the sequence name that represents the library name must be placed within parentheses. As an example, the regular expression for TIGR sequences (where the library is specified by the first 4 characters) is:
(....).*

Mate-pair relationships can also be described in two ways:
pair <regexp_forw> <regexp_rev>

or
<seq_forw_1> <seq_rev_1> <library_name>
<seq_forw_2> <seq_rev_2> <library_name>
<seq_forw_3> <seq_rev_3> <library_name>
...

The first format requires two regular expressions corresponding to the forward and reverse mates of an insert. Just like the library record, the portion of the name corresponding to the insert name must be placed within parentheses. Two reads that match the two regular expressions, and have the exact same section matched within the parentheses will be considered mates. The library will be determined from the regular expression associated with a "library" record. As an example, at TIGR the first 7 characters represent the insert, then are followed by an optional T or P and the primer name (F or R for Forward or Reverse). The corresponding regular expression is:
pair (.......)[TP]?F (.......)[TP]?R

The second format is simply a list of sequence name pairs each followed by the name of the library they belong to. The corresponding library records must be listed in the file before the pairing data.

'''Note that fields in the .mates file must be separated by TAB characters otherwise the program will report an error.'''

An example of a .mates file using the phred/phrap naming convention is provided in [[bambus.mates|data/sample_files/bambus.mates]].

=== Repeat files ===

The repeat files used in screening linking data conform to the output standards of the repeatFinder program. The file contains 5 TAB-delimited values: contig ID, repeat name, left and right coordinates within the contig, and contig class. Only the first 4 values are essential as far as Bambus is concerned. Here is an example of repeat records:

AC009139.7.3 RPT1A 14554 96820 1
AC026498.3.2 RPT1B 124208 206475 1

=== .evidence.xml ===

This provides an XML representation of the input file. Below is a description of the data presented in this file (see also an [[bambus.evidence.xml|example]]):

The overall flow of the XML file is:

<EVIDENCE>
<LIBRARY>
<INSERT>
<SEQUENCE/>
<SEQUENCE/>
</INSERT>
...
</LIBRARY>
...
<CONTIG>
<SEQUENCE/>
...
</CONTIG>
...
<LINK>
<CONTIG/>
<CONTIG/>
</LINK>
...
</EVIDENCE>

<EVIDENCE this is the parent tag - the whole document occurs within <EVIDENCE> and </EVIDENCE>
ID = "1" some identifier for the file
DATE = "12/15/03" date when file was created
PROJECT = "MyProject" verbose description of the project
PARAMETERS = "" parameters used in creating this file
>

<LIBRARY for each shotgun library you need a separate library tag.
ID = "lib_1" some identifier for the library
NAME = "short" verbose name for the library
MIN = "1200" minimum size of inserts in this library
MAX = "3500" maximum size of inserts in this library
>

<INSERT the insert concept links together the two reads obtained from opposite ends of a clone insert
ID="ins_1" generic identifier for the insert
NAME="GALBZ92" verbose name for the insert
>

<SEQUENCE each insert contains two sequence reads obtained from the opposite ends
ID="seq_1" generic identifier for the sequence
NAME="GALBZ92TF" verbose name for the sequence
>

<CONTIG Each contig in the assembly needs to be represented in the file
ID="contig_1" contig identifier
NAME= "1" contig name (usually the same as the identifier)
LEN="12352" contig length in basepairs
>

<SEQUENCE each sequence in the contig is listed together with information about the position within the contig.
ID="seq_1" sequence identifier (must match one described in the library section)
ORI="BE" orientation of sequence: BE - forward, EB - reverse
ASM_LEND="0" coordinate of sequence's left end within the contig
ASM_REND="525" coordinate of sequence's right end within the contig
>

<LINK this is a generic link between two contigs (i.e. not inferred from mate-pair data)
ID = "link_1" link identifier
SIZE = "-800" size of gap between the two contigs
TYPE = "MUMmer" link type: links with the same type get grouped into a virtual "library" used in specifying priorities
>

<CONTIG each link contains two contigs
ID = "contig_1" contig identifier: must match one described in the contig section above
ORI = "EB" contig orientation: BE - forward, EB - reverse
> each contig record may contain free-form data specifying the evidence for linking: e.g. alignment data

=== .out.xml ===

Represents, together with the evidence file, a description of the layout of the contigs. For each scaffold, the order (given as a coordinate along a chromosome) and the orientation of each contig is presented, together with a list of all the links used to generate this layout. Some links are deemed invalid, being given a code of "LEN" in case the length was deemed incorrect, or "ORI" in case the link orientation was considered incorrect. This file is also specified by the [[bambus.dtd|DTD]] and an example ([[bambus.out.xml|data/sample_files/bambus.out.xml]]).

=== .stats file ===

Finally, BAMBUS outputs a summary of the scaffolds generated. This file is pretty much self-explanatory, as evidenced by this example ([[bambus.stats|data/sample_files/bambus.stats]]). Note that in the .stats file, the N50 sizes are computed with respect to the total span of the scaffolds unless a different genome size is specified in a file called genome.size.

=== .details file ===

The .details file contains detailed information about the linking information between adjacent contigs. The contigs' orientation, size, and coordinates are listed and then all the linking data grouped by validity and library. An example is provided in [[bambus.details|data/sample_files/bambus.details]].

=== .dot file ===

Contains a graphical representation of the scaffolds in GraphViz format. Please see the [http://www.research.att.com/sw/tools/graphviz/ AT&T GraphViz] website for more information on this file format.

=== .ps file ===

A postscript image generated from the GraphViz-formatted file. It can be obtained from the .dot file with the command:
dot -Tps -o prefix.ps prefix.dot

An example is given in the picture below.
[[Image:bambus-display.jpg]]

Each scaffold is placed in a box, labeled with some statistics on the scaffold size (number of contigs, number of bases and span). All the "gaps" - linking relationships between the contigs are represented as edges. Each edge is decorated with the number of links contributing to it, the number of links from each library type, and the number of links invalidated due to incorrect length (L) or orientation (O). Each contig's ID, size, and coordinates within the scaffold are also listed.

=== .oo file ===

This file contains a summary of the order and orientation of all contigs present in the data. Each scaffold starts with a FASTA-like header containing the identifier for the scaffold followed by the number of contigs, size and span. Within each scaffold, all contigs are listed in the scaffold order, followed by the string BE for those in the forward orientation and EB for those in the reverse orientation. An example file is shown in: [[bambus.oo|data/sample_files/bambus.oo]].

=== .sum file ===

This file contains one line for each scaffold. Each line contains the scaffold ID, number of contigs, scaffold size, and span. The four values are separated by TAB characters. See an example in [[bambus.sum|data/sample_files/bambus.sum]].

== Known problems ==
There is a small "off-by-one" error in computing contig coordinates. It should not really affect the usefulness of the output.

== Contact information ==

BAMBUS is currently provided AS-IS, in other words we do not provide any support for the software. We would, however, like to hear your comments and suggestions. For Bambus bug reports, support requests, or any other inquiries please browse our SourceForge project page or Email us at:

amos-help (at) lists (dot) sourceforge (dot) net

BAMBUS was written by Mihai Pop and Dan Kosack.

Bambus 2.0/quick start guide

2010-12-16T16:17:44Z

Dmb000006: /* Running Bambus 2.0 */ +formattting

This is a copy of the Bambus 2 user guide taken (and improved) from here: http://www.cbcb.umd.edu/software/bambus/doc/HowToBambus2.pdf

See also: http://www.cbcb.umd.edu/software/bambus

==How to run Bambus 2.0==
'''Caveat:''' Bambus is still being actively developed and the code is currently in the "user beware" and "for experts only" stage.

=== Step 1. Install the AMOS package - Bambus 2.0 is part of it. ===
See [[AMOS Getting Started]].

'''Note:''' since Bambus is still under active development you should pull the latest unofficial release of AMOS directly from the CVS repository - see instructions at: [[Programmer's guide]].

=== Step 2. What information you need ===
Bambus needs to know about the contigs produced by the assembler and information about how these contigs are linked to each other. In AMOS terms, the basic information necessary are a list of contigs (http://amos.sourceforge.net/docs/api/classAMOS_1_1Contig__t.html) and a list of contig links (http://amos.sourceforge.net/docs/api/classAMOS_1_1ContigLink__t.html) or contig edges (http://amos.sourceforge.net/docs/api/classAMOS_1_1ContigEdge__t.html - bundles of consistent contig links) indicating the relative placement of pairs of contigs.

These data can either be provided to Bambus directly in the form of an AMOS message file (see [[Message Types]]) or inferred from mate-pair information as described below.

== Running Bambus 2.0 ==
* First, add the .afg file built as described above (for other conversion utilities see: http://sourceforge.net/apps/mediawiki/amos/index.php?title=File_conversion_utilities) to an AMOS bank (flat-file database):
bank-transact -cf myproj.bnk -m myfile.afg

* Use the mate-pair information to construct a collection of contig links.
clk -b myproj.bnk

'''Note:''' you can also construct these links with your own custom software and upload them into the bank, in which case you would skip the "clk" command.

* Bundle the contig links into a collection of contig edges.
Bundler -b myproj.bnk

'''Note:''' as with the clk command you might want to build the contig edges separately and upload them into the bank using your own software.

'''Note:''' the Bundler command also accepts the command line parameter "-t" followed by a list of edge types as defined in src/AMOS/Link_AMOS.hh. Currently the following types are defined: '''M''' - mate-pair, '''O''' - overlap, '''P''' - physical, '''A''' - alignment, '''S''' - synteny, and '''X''' - other.

* Identify genomic repeats and output them to std out
MarkRepeats -b myproj.bnk [-redundancy X -aggressive] > myRepeats

Optional parameters:
:"-redundancy X" only uses contig edges comprising X or more contig links
:"-aggressive" - aggressive repeat identification based on global depth of coverage statistics (default procedure relies on graph analysis rather than coverage statistics)

'''Note:''' this program requires the boost library

* Order and orient contigs according to repeat and link information

'''IMPORTANT:''' several of the operations performed by this program destructively modify the bank (changes cannot be undone). You should make a copy of the bank prior to running OrientContigs.

OrientContigs -b myproj.bnk -prefix myscaff

:"-prefix" specifies the prefix for all output files

Optional parameters:
:"-all" - output unlinked contigs as scaffolds
:"-noreduce" - turns off graph simplification routines (see below)
:"-redundancy X" - same as above - ignore edges with less than X links
:"-repeats filename" - ignores repeats listed in "filename" (one contig ID per line) as generated, e.g. by the MarkRepeats program described above.
:"-aggressive" - aggressive scaffolding - by default links that are stretched by more than 3 standard deviations are ignored. Aggressive option turns this feature off and tries to reconcile the scaffold as best possible.

* Linearize the scaffolds (if desired). By default Bambus 2 produces non-linear graph-based scaffolds. If fasta output is desired, it is necessary to linearize the scaffolds.
untangle -e myscaff.evidence.xml -s myscaff.out.xml -o myscaff.untangle.xml

* Output fasta result (if desired). This involves two steps: the first generates the fasta file representing the contigs, and the second combines them, separated by Ns, into a scaffold fasta file.
bank2fasta -d -b myproj.bnk > contigs.fasta
printScaff -e myscaff.evidence.xml -s myscaff.untangle.xml -l myscaff.library -f contigs.fasta -merge -o myscaff

== Outputs ==
The output of the OrientContigs program is a collection of scaffolds stored in the bank. The program also generates several files starting with the specified prefix
*myScaff.agp
**The scaffolds generated by the OrientContigs program in NCBI AGP format
*myScaff.dot
**The scaffolds generated by the OrientContigs program in Graphviz dot format. It can be converted to a PostScript or PDF file using the dot program in the Graphviz package.
*myScaff.evidence.xml
*myScaff.library
*myScaff.out.xml
**The scaffolds generated by the OrientContigs program compatible with the Bambus 1 format.
*myScaff.fasta
**The fasta file of the scaffolds, joined by Ns
*myScaff.stats
**Statistics on the scaffolds generated, including N50 and total span.

=== Scaffold simplifications ===
By default (unless option "-noreduce" is provided) the OrientContigs program simplifies certain
graph patterns:
* simple paths
* bubbles
** These patterns are iteratively merged into single contigs until no additional simplifications can be made.

Bambus 2.0/quick start guide

2010-12-16T16:16:12Z

Dmb000006: /* How to run Bambus 2.0 */ May as well convert links to the wiki into wiki links

This is a copy of the Bambus 2 user guide taken (and improved) from here: http://www.cbcb.umd.edu/software/bambus/doc/HowToBambus2.pdf

See also: http://www.cbcb.umd.edu/software/bambus

==How to run Bambus 2.0==
'''Caveat:''' Bambus is still being actively developed and the code is currently in the "user beware" and "for experts only" stage.

=== Step 1. Install the AMOS package - Bambus 2.0 is part of it. ===
See [[AMOS Getting Started]].

'''Note:''' since Bambus is still under active development you should pull the latest unofficial release of AMOS directly from the CVS repository - see instructions at: [[Programmer's guide]].

=== Step 2. What information you need ===
Bambus needs to know about the contigs produced by the assembler and information about how these contigs are linked to each other. In AMOS terms, the basic information necessary are a list of contigs (http://amos.sourceforge.net/docs/api/classAMOS_1_1Contig__t.html) and a list of contig links (http://amos.sourceforge.net/docs/api/classAMOS_1_1ContigLink__t.html) or contig edges (http://amos.sourceforge.net/docs/api/classAMOS_1_1ContigEdge__t.html - bundles of consistent contig links) indicating the relative placement of pairs of contigs.

These data can either be provided to Bambus directly in the form of an AMOS message file (see [[Message Types]]) or inferred from mate-pair information as described below.

== Running Bambus 2.0 ==
* First, add the .afg file built as described above (for other conversion utilities see: http://sourceforge.net/apps/mediawiki/amos/index.php?title=File_conversion_utilities) to an AMOS bank (flat-file database):
bank-transact -cf myproj.bnk -m myfile.afg

* Use the mate-pair information to construct a collection of contig links.
clk -b myproj.bnk

'''Note:''' that you can also construct these links with your own custom software and upload them into the bank in which case you would skip the "clk" command.

* Bundle the contig links into a collection of contig edges.
Bundler -b myproj.bnk

'''Note:''' as with the clk command you might want to build the contig edges separately and upload them into the bank using your own software.

'''Note:''' the Bundler command also accepts the command line parameter "-t" followed by a list of edge types as defined in src/AMOS/Link_AMOS.hh. Currently the following types are defined: M - mate-pair, O - overlap, P - physical, A - alignment, S - synteny, X - other.

* Identify genomic repeats and output them to std out
MarkRepeats -b myproj.bnk [-redundancy X -aggressive] > myRepeats

Optional parameters:
:"-redundancy X" only uses contig edges comprising X or more contig links
:"-aggressive" - aggressive repeat identification based on global depth of coverage statistics (default procedure relies on graph analysis rather than coverage statistics)

'''Note:''' this program requires the boost library

* Order and orient contigs according to repeat and link information

'''IMPORTANT:''' several of the operations performed by this program destructively modify the bank (changes cannot be undone). You should make a copy of the bank prior to running OrientContigs.

OrientContigs -b myproj.bnk -prefix myscaff

:"-prefix" specifies the prefix for all output files

Optional parameters:
:"-all" - output unlinked contigs as scaffolds
:"-noreduce" - turns off graph simplification routines (see below)
:"-redundancy X" - same as above - ignore edges with less than X links
:"-repeats filename" - ignores repeats listed in "filename" (one contig ID per line) as generated, e.g. by the MarkRepeats program described above.
:"-aggressive" - aggressive scaffolding - by default links that are stretched by more than 3 standard deviations are ignored. Aggressive option turns this feature off and tries to reconcile the scaffold as best possible.

* Linearize the scaffolds (if desired). By default Bambus 2 produces non-linear graph-based scaffolds. If fasta output is desired, it is necessary to linearize the scaffolds.
untangle -e myscaff.evidence.xml -s myscaff.out.xml -o myscaff.untangle.xml

* Output fasta result (if desired). This involves two steps: the first generates the fasta file representing the contigs, and the second combines them, separated by Ns, into a scaffold fasta file.
bank2fasta -d -b myproj.bnk > contigs.fasta
printScaff -e myscaff.evidence.xml -s myscaff.untangle.xml -l myscaff.library -f contigs.fasta -merge -o myscaff

== Outputs ==
The output of the OrientContigs program is a collection of scaffolds stored in the bank. The program also generates several files starting with the specified prefix
*myScaff.agp
**The scaffolds generated by the OrientContigs program in NCBI AGP format
*myScaff.dot
**The scaffolds generated by the OrientContigs program in Graphviz dot format. It can be converted to a PostScript or PDF file using the dot program in the Graphviz package.
*myScaff.evidence.xml
*myScaff.library
*myScaff.out.xml
**The scaffolds generated by the OrientContigs program compatible with the Bambus 1 format.
*myScaff.fasta
**The fasta file of the scaffolds, joined by Ns
*myScaff.stats
**Statistics on the scaffolds generated, including N50 and total span.

=== Scaffold simplifications ===
By default (unless option "-noreduce" is provided) the OrientContigs program simplifies certain
graph patterns:
* simple paths
* bubbles
** These patterns are iteratively merged into single contigs until no additional simplifications can be made.

Bambus Manual

2010-12-16T15:26:00Z

Dmb000006: /* Getting more (or less) information from the output */

{| align="right"
| __TOC__
|}

The output of most shotgun sequence assembly programs (such as TIGR Assembler, phrap, or CAP3) consists of a set of unrelated contigs, whose order and orientation along the chromosome is unknown. Scaffolding represents the task of ordering and orienting these contigs by using additional information about their relative placement. Traditionally such information was identified from the pairing of reads from the opposite ends of an insert in double-barrelled shotgun experiments (see figure).

[[Image:scaffold image.gif]]

Besides clone mate information, homology data, physical maps, or gene synteny information can be used to derive relationships between contigs. BAMBUS is a program written to handle all such types of data in a generic fashion in order to build contig scaffolds. In order to account for the varied quality of linking information BAMBUS allows the user to specify a hierarchy that will consider the most reliable linking data first, then expand the scaffolds using less reliable data. This approach minimizes the effect of errors inherent to experimentally-derived linking information.

=== Algorithm overview ===

BAMBUS has two main modes of operation: hierarchical - when the links are considered in the order of their priorities, and standard - when all the links are considered at the same priority level. In the latter case, all links connecting two contigs are bundled together to form a "gap", or "edge" if you look at the problem from a graph theoretical point of view. Each link is checked for validity before being added to a bundle. Thus, a link is considered invalid due to a length constraint if it forces the two contigs to overlap. In other words, if we assume the largest possible size for the insert corresponding to the link, if the coordinates of the mate-pair within the contigs force the contigs to overlap, we discard the link assuming it is due to a misassembly (Note: this behaviour can be specified on a link-by-link basis). Links that pass this test are further checked for consistency. If they do not all agree in the relative orientations of the contigs they imply, a majority rule is used to retain only those links that agree both in orientation and length. Furthermore, the bundle, or edge, is retained if it contains at least two links (this parameter, redundancy, can actually be tuned by the user). After this step is complete, the resulting graph gets traversed twice, once to assign consistent orientations to all the contigs, and the second time to determine the order of the contigs along the chromosome. Note that inconsistencies in order are currently allowed since they can provide finishing teams with useful information. In a future version of BAMBUS we will add an option to allow the generation of unambiguous scaffolds as well. Currently you can generate such unambiguous scaffolds using the [[#Untangling scaffolds|untangle]] program.

== Installation ==

=== Prerequisites ===

* Install AMOS
* Perl 5.6 or later
* [http://www.cpan.org/ XML::Parser] perl module
* [http://www.cpan.org/ Config::IniFiles] perl module
* [http://www.research.att.com/sw/tools/graphviz/ GraphViz] package

=== Obtaining BAMBUS ===

BAMBUS 2.3 is available free of charge under the open-source Artistic License.

The Bambus source is freely available for download from the File Release Section of our SourceForge project page.

To receive information regarding new releases and developments, please subscribe to our moderated, low-traffic users' mailing list:

amos-users(at)lists(dot)sourceforge(dot)net

=== Actual installation ===

1. Find a directory where you want the software installed. For example /users/home/CoolUser

2. Place the tar file bambus-2.33.tar.gz in this directory

3. Unpack the distribution
% tar xvzf bambus-2.33.tar.gz

4. Go into the newly created directory
% cd bambus-2.33

5. Carefully read the documentation

6. Edit the file Makefile and change the value of BASEDIR to the correct installation path, for example:
BASEDIR = /users/home/CoolUser

7. Also check if the path for perl (in variable PERL) matches the one on your system. Several common options are:
PERL = /usr/local/bin/perl
PERL = /usr/bin/perl

8. Type 'gmake install' to have all the software installed. The executables will be in /users/home/CoolUser/bin/ and the documentation in /users/home/CoolUser/doc.

9. To get started you can try out the small test dataset provided in data/test_run by typing:

goBambus -c test.contig -m test.mates -o test-bambus

== Running BAMBUS ==

The input to Bambus consists of a set of links between contigs. These links can be inferred from the pairing of reads belonging to the same insert, from physical map data, or from alignment to another genome. To accommodate these types of sources of linking information, and to allow for further extensions, the input to Bambus is presented in a general purpose XML format.

The following sections describe how the XML file can be generated for each type of linking data. For more information see [[#.evidence.xml|.evidence.xml]].

Linking information is grouped into "libraries". All links within a library have similar parameters. In the case of mate-pair linking information the libraries match the standard definition. For other linking data, for example MUMmer links, the library grouping reflects specific characteristics of the linking data. For example all MUMmer links are grouped within a library called "MUMmer".

=== Command line options ===

Bambus consists in a collection of programs controlled by a script called goBambus. The execution is controlled by the following sets of parameters. To obtain a list of all parameters and basic help information you need to type:

goBambus -h
or
goBambus -help

==== Configuration options ====

* -C <conf_file> . This parameter specifies the configuration file used by Bambus. A configuration file is not required, however it is useful if you want to change the default parameters. See [[#The configuration file|the configuration file]] for a detailed description of the configuration information.

Whenever you run Bambus it will generate a default configuration file called default.conf. If you want to modify the parameters, you will need to rename this file and then edit it.

==== Input Options ====

* -a <asm_file> . Use a TIGR Assembler .asm file as source of linking information.
* -c <contig_file> . Use a GDE formatted .contig assembly file as source of linking information. The .contig files can be obtained either from TIGR Assembler output (by concatenating the contents of the .align directory) or by converting .ace files (created by phrap or Consed) using the [[ace2contig]] package.
* -x <xml_file>. Use additional XML formatted linking information. You can use the -x option multiple times. See [[#.evidence.xml|.evidence.xml]] for the XML file format.
* -mx <make_file>. Use a gmake compatible make file to specify how the additional XML information is generated. Each of the files created must end in ".xml". Moreover, only the final XML targets are allowed to end in ".xml" to prevent goBambus from getting confused. See [[#Using Makefiles|using Makefiles]] for a more detailed description on using makefiles.

==== Mate-pair Information ====

Assembly output (such as .asm and .contig files) does not generally contain any information about the pairing of reads coming from opposite ends of the same insert, nor about the membership of inserts to libraries. The following parameters specify how to obtain this mate-pair information.

* -m <mates_file> . The mates file allows you to specify both library information and mate pairing information, using a system based on Perl regular expressions. See [[#The .mates file|the .mates file]].
* -D <database> . At TIGR, instead of a mates file you can opt to use a database to obtain library and mate information. Note that the -m and -D options are complementary. The following three parameters specify database options:
* -S <server> . Select a specific database server
* -U <user> . Log in with the specified user name
* -P <passwd> . Log in with the specified password

==== Output Options ====

* -o <output_prefix> . All the output file names will be generated from the <output_prefix> prefix.

==== Repeat Screening ====

Misassembled repeats can confuse Bambus, therefore it makes sense to screen the known repeats that may be misassembled. Bambus can use two types of repeat screening information:

* -r . Screen against the repeats listed in the database (option -D must be provided)
* -r <repfile> . Screen against repeats described in [[#Repeat files|repeat files]].

==== Flow-control ====

* -start <num>
* -end <num>. The execution of Bambus consists in a set of steps. The user can choose to start or end at a specific step for example in the case when she modifies the configuration file and wants to re-run just the final steps of the scaffolders. This feature should be used with caution.

=== Typical use cases ===
==== Using assembler output ====

For convenience, Bambus can use the output of an assembly program and automatically generate the required linking information. Two types of information are required:

* the tiling of reads in the assembly
* information about the pairing of reads

The first type of information can be obtained from the .asm or .contig output formats of TIGR Assembler. For more information about these file formats please refer to the [http://www.jcvi.org/cms/publications/listing/abstract/article/tigr-assembler-a-new-tool-for-assembling-large-shotgun-sequencing-projects/ TIGR Assembler documentation]. Note that the .contig file may be stripped of all sequence data as the only information used is that contained in the lines starting with #. In case you are using an assembler other than TIGR Assembler you may need to use the ta2ace package to convert the output into the .contig format.

The second type of information can be obtained from a [[#The .mates file|.mates file]], or from the database in case you are at TIGR.

A typical command line for someone outside of TIGR is:

goBambus -c test.contig -m test.mates -o test

and the execution will generate the following files:

* [[#.stats file|test.stats]] - statistics on scaffolds and libraries
* [[#.details file|test.details]] - detailed information about each contig pair (also called "gap").
* [[#.dot file|test.dot]] - GraphViz formatted description of the contig linkage information.
* [[#.evidence.xml|test.evidence.xml]] - XML representation of all the linking evidence provided.
* [[#.out.xml|test.out.xml]] - XML representation of the scaffolds.
* [[#.ps file|test.lib]] - list of the codes associated with each input library. These codes are reported on the links in the output file. 
* [[#.sum file|test.sum]] - one-line summaries of all scaffolds.
* [[#.oo file|test.oo]] - order and orientation information for all the contigs.

Additionally, the program generates some "working" files that are currently not removed as they provide useful debugging information.

* test.detective.xml - linking information derived from assembler input alone. It's the same as test.evidence.xml unless additional XML files are provided in the input.
* test.inp - input to the core scaffolding engine.
* test.grommit.conf - configuration information for the scaffolding engine

To view the graphical output you need to convert it to Postscript with the command:

dot -Tps -o test.ps test.dot

Then you can view the postscript file with the command:

gv test.ps

At TIGR, instead of specifying a .mates file you can use the database:

goBambus -c test.contig -D gbx -U access -P access -o test

==== Getting more (or less) information from the output ====

You can modify the information you get in the output by directly calling the printScaff command. PrintScaff requires as inputs the [[#.evidence.xml|.evidence.xml]] file, the [[#.out.xml|.out.xml]] file and the [[#.lib|.lib]] files produced by Bambus, the minimal invocation being:

printScaff -e test.evidence.xml -s test.out.xml -l test.lib

Optionally you may specify the output prefix with option -o (just like in the Bambus invocation). PrintScaff has parameters that control what gets reported, and in what format.

In the first category you have:
* -dot - produce a .dot file
* -detail - produce a .details file
* -oo - produce a [[#.oo file|.oo]] file listing all the contigs in each scaffold
* -sum - produce a [[#.sum file|.sum]] tab delimited list of scaffold stats (#contigs, size, and span)
* -f <fasta_file> - generate a pseudo-molecule for each scaffold using the contig sequences listed in <fasta_file>. Note that the contigs in the scaffold files are named "contig_<id>" while the contigs in the <fasta_file> must simply be called "<id>". Note that this option together with the default -merge option (see below) is meaningless unless you've [[#Untangling scaffolds|untangled]] the scaffold.
* -phys - create a .phys file listing all "gaps" spanned only by the specified libraries:

printScaff -e test.evidence.xml -s test.out.xml -l test.lib -phys MUMmer

will generate a file called test.phys that contains all contig pairs linked by nothing but MUMmer links.

In the second category you have:
* -page - produce .dot file formatted for printing on 8.5x11" paper
* -plot - produce .dot file formatted for printing on a plotter (36x48" paper)
* -unused - draw edges corresponding to unused links
* -merge - (default) when the -f option is given, produce a pseudo-molecule for each scaffold by adding 60 N characters between the contigs in the scaffold
* -nomerge - when the -f option is given, create a fasta file containing all contigs in each scaffold in the correct orientation

==== Adding additional XML linking information ====

You can easily add additional linking information as long as it's in a format similar to the [[#.evidence.xml file|.evidence.xml file]]. Assuming we have two such files "link1.xml" and "link2.xml" you can call Bambus as follows:

goBambus -c test.contig -m test.mates -x link1.xml -x link2.xml -o test

The result being the same as if you concatenated the XML file inferred from the .contig file together with the two additional XML files.

==== Using MUMmer links ====

In case you are sequencing a genome for which a close relative has been completed, you can use [[MUMmer]] to infer links between contigs. Assume you have the reference genome in a file "ref.fasta" and all the contigs from the genome you are assembling in a file called "test.fasta", you can run MUMmer to align the test genome to the reference:

nucmer -maxmatch ref.fasta test.fasta

The output will be placed in a file called out.delta. You can then convert the alignment information into an XML file suitable for Bambus as follows:

show-tiling -x out.delta > test.mum.xml

And finally you can provide this information to Bambus:

goBambus -c test.contig -m test.mates -x test.mum.xml -o test

==== Using Makefiles ====

If you have multiple XML files, it becomes tedious to generate each file separately and then provide them to Bambus with multiple -x options. You can create a Makefile compatible with GNU make that generates each of the XML files. You must be careful that the targets for all the XML files end in ".xml", and no other targets (such as intermediate targets) have the same suffix. Bambus will assume that all .xml files produced by the Makefile must be added to the input.

An example of a simple Makefile that can generate the mummer information described above is:

test.mum.xml: ref.fasta test.fasta
nucmer -a max-match ref.fasta test.fasta
show-tiling -x out.delta > test.mum.xml

==== Untangling scaffolds ====

By default, the scaffolds resulting from Bambus are potentially ambiguous as two or more contigs may occupy the same place in the genome. Such situations occur either due to misassembled repeats, or when assembling different haplotypes. Bambus contains a utility that does a best effort attempt to disambiguate the scaffolds, by breaking them into a set of non-ambiguous scaffolds. The algorithm is greedy and does not guarantee that an optimal solution (e.g. a minimum number of longest scaffolds) is produced.

To run the untangler type:

untangle -e test.evidence.xml -s test.out.xml -o test.untangle.xml

The test.untangle.xml file has the same format as the output file ([[#.out.xml|.out.xml]]) and for each scaffold (e.g. scaff_1) contains one or more unambiguous scaffolds called "scaff_1_1, scaff_1_2, etc". You can use the [[#Getting more (or less) information from the output|printScaff]] command to process the untangled file the same way you would process a normal Bambus output.

== File formats ==

=== The configuration file ===

Scaffolding parameters can be specified in a configuration file. Whenever you run Bambus a default configuration file is generated as default.conf. To create your own file you should rename this file then edit it. The following types of information can be specified:

* Comments. Lines starting with # are considered comments and are ignored by the software.

* Priority information. Specifies an order in which the libraries are considered. For example, the following line specifies priority 2 for all MUMmer links. The libraries are considered in the increasing order of priorities.

priority MUMmer 2

If no configuration file is provided, all libraries are assumed to have the same priority. If a configuration file is provided, only those libraries that have a priority record are used in scaffolding. In other words, if a library has no priority record associated with it, it will not be used in scaffolding.

A priority can also be specified for a specific redundancy parameter. For example, the following line specifies that those links between contigs supported by 4 or more sources of linking data should be processed at priority level 1.

priority redundancy 4 1

Note, however, that you need to also specify a set of libraries to be used at this priority level. A simple shortcut is specifying that all link classes should be processed at this redundancy.

priority ALL 1

If you are uncertain of which libraries your project uses, run Bambus without a configuration file. The libraries will be listed in a file ending in ".libs".

* Redundancy information. Specifies the minimum number of links required for a valid inter-contig link. You can specify this parameter as a global value:

redundancy 2

that is, all contigs must be linked by at least two links. You can also specify a per-library value like below which says a single MUMmer link is required to link two contigs.

redundancy MUMmer 1

* Minimum scaffold size. Specifies a scaffold size cutoff. For example, the following line implies that only scaffolds that contain more than 10kbp are present in the output:

mingroupsize 10000

* Link size error. For some link types, determining the exact link length can only be done within a certain error. This parameter specifies, as a percentage, the estimated error in size determination. For example, since MUMmer links make an assumption of evolutionary closeness between two genomes, the error associated with them should be proportional to the estimated evolutionary distance (as nucleotide % similarity) between the two genomes. For an error of 5% use this parameter:

error MUMmer 0.05

* Overlapping contigs allowed. For each library you can control the link validation process by specifying whether adjacent contigs are allowed to overlap or not. By default mate-pair links assume adjacent contigs do not overlap, while MUMmer links allow contigs to overlap. This is equivalent to the configuration parameters:

overlaps lib_1 N
overlaps MUMmer Y

An example is provided in [[bambus.conf|data/sample_files/bambus.conf]].

=== The .mates file ===

The .mates file provides two types of information: library data, and mate-pair relationships between reads.

Library data can be described in two formats:
library <name> <min_size> <max_size>

and
library <name> <min_size> <max_size> <regexp>

Both formats require you to name each library and to provide a size range for the inserts belonging to it. The second format allows you to also provide a Perl regular expression that describes the naming convention for reads belonging to the library. The part of the sequence name that represents the library name must be placed within parentheses. As an example, the regular expression for TIGR sequences (where the library is specified by the first 4 characters) is:
(....).*

Mate-pair relationships can also be described in two ways:
pair <regexp_forw> <regexp_rev>

or
<seq_forw_1> <seq_rev_1> <library_name>
<seq_forw_2> <seq_rev_2> <library_name>
<seq_forw_3> <seq_rev_3> <library_name>
...

The first format requires two regular expressions corresponding to the forward and reverse mates of an insert. Just like the library record, the portion of the name corresponding to the insert name must be placed within parentheses. Two reads that match the two regular expressions, and have the exact same section matched within the parentheses will be considered mates. The library will be determined from the regular expression associated with a "library" record. As an example, at TIGR the first 7 characters represent the insert, then are followed by an optional T or P and the primer name (F or R for Forward or Reverse). The corresponding regular expression is:
pair (.......)[TP]?F (.......)[TP]?R

The second format is simply a list of sequence name pairs each followed by the name of the library they belong to. The corresponding library records must be listed in the file before the pairing data.

'''Note that fields in the .mates file must be separated by TAB characters otherwise the program will report an error.'''

An example of a .mates file using the phred/phrap naming convention is provided in [[bambus.mates|data/sample_files/bambus.mates]].

=== Repeat files ===

The repeat files used in screening linking data conform to the output standards of the repeatFinder program. The file contains 5 TAB-delimited values: contig ID, repeat name, left and right coordinates within the contig, and contig class. Only the first 4 values are essential as far as Bambus is concerned. Here is an example of repeat records:

AC009139.7.3 RPT1A 14554 96820 1
AC026498.3.2 RPT1B 124208 206475 1

=== .evidence.xml ===

This provides an XML representation of the input file. Below is a description of the data presented in this file (see also an [[bambus.evidence.xml|example]]):

The overall flow of the XML file is:

<EVIDENCE>
<LIBRARY>
<INSERT>
<SEQUENCE/>
<SEQUENCE/>
</INSERT>
...
</LIBRARY>
...
<CONTIG>
<SEQUENCE/>
...
</CONTIG>
...
<LINK>
<CONTIG/>
<CONTIG/>
</LINK>
...
</EVIDENCE>

<EVIDENCE this is the parent tag - the whole document occurs within <EVIDENCE> and </EVIDENCE>
ID = "1" some identifier for the file
DATE = "12/15/03" date when file was created
PROJECT = "MyProject" verbose description of the project
PARAMETERS = "" parameters used in creating this file
>

<LIBRARY for each shotgun library you need a separate library tag.
ID = "lib_1" some identifier for the library
NAME = "short" verbose name for the library
MIN = "1200" minimum size of inserts in this library
MAX = "3500" maximum size of inserts in this library
>

<INSERT the insert concept links together the two reads obtained from opposite ends of a clone insert
ID="ins_1" generic identifier for the insert
NAME="GALBZ92" verbose name for the insert
>

<SEQUENCE each insert contains two sequence reads obtained from the opposite ends
ID="seq_1" generic identifier for the sequence
NAME="GALBZ92TF" verbose name for the sequence
>

<CONTIG Each contig in the assembly needs to be represented in the file
ID="contig_1" contig identifier
NAME= "1" contig name (usually the same as the identifier)
LEN="12352" contig length in basepairs
>

<SEQUENCE each sequence in the contig is listed together with information about the position within the contig.
ID="seq_1" sequence identifier (must match one described in the library section)
ORI="BE" orientation of sequence: BE - forward, EB - reverse
ASM_LEND="0" coordinate of sequence's left end within the contig
ASM_REND="525" coordinate of sequence's right end within the contig
>

<LINK this is a generic link between two contigs (i.e. not inferred from mate-pair data)
ID = "link_1" link identifier
SIZE = "-800" size of gap between the two contigs
TYPE = "MUMmer" link type: links with the same type get grouped into a virtual "library" used in specifying priorities
>

<CONTIG each link contains two contigs
ID = "contig_1" contig identifier: must match one described in the contig section above
ORI = "EB" contig orientation: BE - forward, EB - reverse
> each contig record may contain free-form data specifying the evidence for linking: e.g. alignment data

=== .out.xml ===

Represents, together with the evidence file, a description of the layout of the contigs. For each scaffold, the order (given as a coordinate along a chromosome) and the orientation of each contig is presented, together with a list of all the links used to generate this layout. Some links are deemed invalid, being given a code of "LEN" in case the length was deemed incorrect, or "ORI" in case the link orientation was considered incorrect. This file is also specified by the [[bambus.dtd|DTD]] and an example ([[bambus.out.xml|data/sample_files/bambus.out.xml]]).

=== .stats file ===

Finally, BAMBUS outputs a summary of the scaffolds generated. This file is pretty much self-explanatory, as evidenced by this example ([[bambus.stats|data/sample_files/bambus.stats]]). Note that in the .stats file, the N50 sizes are computed with respect to the total span of the scaffolds unless a different genome size is specified in a file called genome.size.

=== .details file ===

The .details file contains detailed information about the linking information between adjacent contigs. The contigs' orientation, size, and coordinates are listed and then all the linking data grouped by validity and library. An example is provided in [[bambus.details|data/sample_files/bambus.details]].

=== .dot file ===

Contains a graphical representation of the scaffolds in GraphViz format. Please see the [http://www.research.att.com/sw/tools/graphviz/ AT&T GraphViz] website for more information on this file format.

=== .ps file ===

A postscript image generated from the GraphViz-formatted file. It can be obtained from the .dot file with the command:
dot -Tps -o prefix.ps prefix.dot

An example is given in the picture below.
[[Image:bambus-display.jpg]]

Each scaffold is placed in a box, labeled with some statistics on the scaffold size (number of contigs, number of bases and span). All the "gaps" - linking relationships between the contigs are represented as edges. Each edge is decorated with the number of links contributing to it, the number of links from each library type, and the number of links invalidated due to incorrect length (L) or orientation (O). Each contig's ID, size, and coordinates within the scaffold are also listed.

=== .oo file ===

This file contains a summary of the order and orientation of all contigs present in the data. Each scaffold starts with a FASTA-like header containing the identifier for the scaffold followed by the number of contigs, size and span. Within each scaffold, all contigs are listed in the scaffold order, followed by the string BE for those in the forward orientation and EB for those in the reverse orientation. An example file is shown in: [[bambus.oo|data/sample_files/bambus.oo]].

=== .sum file ===

This file contains one line for each scaffold. Each line contains the scaffold ID, number of contigs, scaffold size, and span. The four values are separated by TAB characters. See an example in [[bambus.sum|data/sample_files/bambus.sum]].

== Known problems ==
There is a small "off-by-one" error in computing contig coordinates. It should not really affect the usefulness of the output.

== Contact information ==

BAMBUS is currently provided AS-IS, in other words we do not provide any support for the software. We would, however, like to hear your comments and suggestions. For Bambus bug reports, support requests, or any other inquiries please browse our SourceForge project page or Email us at:

amos-help (at) lists (dot) sourceforge (dot) net

BAMBUS was written by Mihai Pop and Dan Kosack.

Bambus Manual

2010-12-16T15:15:44Z

Dmb000006: /* Using assembler output */ Found an error I'm cautious to fix

{| align="right"
| __TOC__
|}

The output of most shotgun sequence assembly programs (such as TIGR Assembler, phrap, or CAP3) consists of a set of unrelated contigs, whose order and orientation along the chromosome are unknown. Scaffolding represents the task of ordering and orienting these contigs by using additional information about their relative placement. Traditionally, such information was identified from the pairing of reads from the opposite ends of an insert in double-barreled shotgun experiments (see figure).

[[Image:scaffold image.gif]]

Besides clone mate information, homology data, physical maps, or gene synteny information can be used to derive relationships between contigs. BAMBUS is a program written to handle all such types of data in a generic fashion in order to build contig scaffolds. In order to account for the varied quality of linking information BAMBUS allows the user to specify a hierarchy that will consider the most reliable linking data first, then expand the scaffolds using less reliable data. This approach minimizes the effect of errors inherent to experimentally-derived linking information.

=== Algorithm overview ===

BAMBUS has two main modes of operation: hierarchical - when the links are considered in the order of their priorities, and standard - when all the links are considered at the same priority level. In the latter case, all links connecting two contigs are bundled together to form a "gap", or "edge" if you look at the problem from a graph theoretical point of view. Each link is checked for validity before being added to a bundle. Thus, a link is considered invalid due to a length constraint if it forces the two contigs to overlap. In other words, if we assume the largest possible size for the insert corresponding to the link, if the coordinates of the mate-pair within the contigs force the contigs to overlap, we discard the link assuming it is due to a misassembly (Note: this behaviour can be specified on a link-by-link basis). Links that pass this test are further checked for consistency. If they do not all agree in the relative orientations of the contigs they imply, a majority rule is used to retain only those links that agree both in orientation and length. Furthermore, the bundle, or edge, is retained if it contains at least two links (this parameter, redundancy, can actually be tuned by the user). After this step is complete, the resulting graph gets traversed twice, once to assign consistent orientations to all the contigs, and the second time to determine the order of the contigs along the chromosome. Note that inconsistencies in order are currently allowed since they can provide finishing teams with useful information. In a future version of BAMBUS we will add an option to allow the generation of unambiguous scaffolds as well. Currently you can generate such unambiguous scaffolds using the [[#Untangling scaffolds|untangle]] program.

== Installation ==

=== Prerequisites ===

* Install AMOS
* Perl 5.6 or later
* [http://www.cpan.org/ XML::Parser] perl module
* [http://www.cpan.org/ Config::IniFiles] perl module
* [http://www.research.att.com/sw/tools/graphviz/ GraphViz] package

=== Obtaining BAMBUS ===

BAMBUS 2.3 is available free of charge under the open-source Artistic License.

The Bambus source is freely available for download from the File Release Section of our SourceForge project page.

To receive information regarding new releases and developments, please subscribe to our moderated, low-traffic users' mailing list:

amos-users(at)lists(dot)sourceforge(dot)net

=== Actual installation ===

1. Find a directory where you want the software installed. For example /users/home/CoolUser

2. Place the tar file bambus-2.33.tar.gz in this directory

3. Unpack the distribution
% tar xvzf bambus-2.33.tar.gz

4. Go into the newly created directory
% cd bambus-2.33

5. Carefully read the documentation

6. Edit the file Makefile and change the value of BASEDIR to the correct installation path, for example:
BASEDIR = /users/home/CoolUser

7. Also check if the path for perl (in variable PERL) matches the one on your system. Several common options are:
PERL = /usr/local/bin/perl
PERL = /usr/bin/perl

8. Type 'gmake install' to have all the software installed. The executables will be in /users/home/CoolUser/bin/ and the documentation in /users/home/CoolUser/doc.

9. To get started you can try out the small test dataset provided in data/test_run by typing:

goBambus -c test.contig -m test.mates -o test-bambus

== Running BAMBUS ==

The input to Bambus consists of a set of links between contigs. These links can be inferred from the pairing of reads belonging to the same insert, from physical map data, or from alignment to another genome. To accommodate these sources of linking information, and to allow for further extensions, the input to Bambus is presented in a general-purpose XML format.

The following sections describe how the XML file can be generated for each type of linking data. For more information see [[#.evidence.xml|.evidence.xml]].

Linking information is grouped into "libraries". All links within a library have similar parameters. In the case of mate-pair linking information the libraries match the standard definition. For other linking data, for example MUMmer links, the library grouping reflects specific characteristics of the linking data. For example all MUMmer links are grouped within a library called "MUMmer".

=== Command line options ===

Bambus consists in a collection of programs controlled by a script called goBambus. The execution is controlled by the following sets of parameters. To obtain a list of all parameters and basic help information you need to type:

goBambus -h
or
goBambus -help

==== Configuration options ====

* -C <conf_file> . This parameter specifies the configuration file used by Bambus. A configuration file is not required, however it is useful if you want to change the default parameters. See [[#The configuration file|the configuration file]] for a detailed description of the configuration information.

Whenever you run Bambus it will generate a default configuration file called default.conf. If you want to modify the parameters, you will need to rename this file and then edit it.

==== Input Options ====

* -a <asm_file> . Use a TIGR Assembler .asm file as source of linking information.
* -c <contig_file> . Use a GDE formatted .contig assembly file as source of linking information. The .contig files can be obtained either from TIGR Assembler output (by concatenating the contents of the .align directory) or by converting .ace files (created by phrap or Consed) using the [[ace2contig]] package.
* -x <xml_file>. Use additional XML formatted linking information. You can use the -x option multiple times. See [[#.evidence.xml|.evidence.xml]] for the XML file format.
* -mx <make_file>. Use a gmake compatible make file to specify how the additional XML information is generated. Each of the files created must end in ".xml". Moreover, only the final XML targets are allowed to end in ".xml" to prevent goBambus from getting confused. See [[#Using Makefiles|using Makefiles]] for a more detailed description on using makefiles.

==== Mate-pair Information ====

Assembly output (such as .asm and .contig files) does not generally contain any information about the pairing of reads coming from opposite ends of the same insert, nor about the membership of inserts to libraries. The following parameters specify how to obtain this mate-pair information.

* -m <mates_file> . The mates file allows you to specify both library information and mate pairing information, using a system based on Perl regular expressions. See [[#The .mates file|the .mates file]].
* -D <database> . At TIGR, instead of a mates file you can opt to use a database to obtain library and mate information. Note that the -m and -D options are complementary. The following three parameters specify database options:
* -S <server> . Select a specific database server
* -U <user> . Log in with the specified user name
* -P <passwd> . Log in with the specified password

==== Output Options ====

* -o <output_prefix> . All the output file names will be generated from the <output_prefix> prefix.

==== Repeat Screening ====

Misassembled repeats can confuse Bambus, therefore it makes sense to screen the known repeats that may be misassembled. Bambus can use two types of repeat screening information:

* -r . Screen against the repeats listed in the database (option -D must be provided)
* -r <repfile> . Screen against repeats described in [[#Repeat files|repeat files]].

==== Flow-control ====

* -start <num>
* -end <num>. The execution of Bambus consists in a set of steps. The user can choose to start or end at a specific step for example in the case when she modifies the configuration file and wants to re-run just the final steps of the scaffolders. This feature should be used with caution.

=== Typical use cases ===
==== Using assembler output ====

For convenience, Bambus can use the output of an assembly program and automatically generate the required linking information. Two types of information are required:

* the tiling of reads in the assembly
* information about the pairing of reads

The first type of information can be obtained from the .asm or .contig output formats of TIGR Assembler. For more information about these file formats please refer to the [http://www.jcvi.org/cms/publications/listing/abstract/article/tigr-assembler-a-new-tool-for-assembling-large-shotgun-sequencing-projects/ TIGR Assembler documentation]. Note that the .contig file may be stripped of all sequence data as the only information used is that contained in the lines starting with #. In case you are using an assembler other than TIGR Assembler you may need to use the ta2ace package to convert the output into the .contig format.

The second type of information can be obtained from a [[#The .mates file|.mates file]], or from the database in case you are at TIGR.

A typical command line for someone outside of TIGR is:

goBambus -c test.contig -m test.mates -o test

and the execution will generate the following files:

* [[#.stats file|test.stats]] - statistics on scaffolds and libraries
* [[#.details file|test.details]] - detailed information about each contig pair (also called "gap").
* [[#.dot file|test.dot]] - GraphViz formatted description of the contig linkage information.
* [[#.evidence.xml|test.evidence.xml]] - XML representation of all the linking evidence provided.
* [[#.out.xml|test.out.xml]] - XML representation of the scaffolds.
* [[#.ps file|test.lib]] - list of the codes associated with each input library. These codes are reported on the links in the output file. 
* [[#.sum file|test.sum]] - one-line summaries of all scaffolds.
* [[#.oo file|test.oo]] - order and orientation information for all the contigs.

Additionally, the program generates some "working" files that are currently not removed as they provide useful debugging information.

* test.detective.xml - linking information derived from assembler input alone. It's the same as test.evidence.xml unless additional XML files are provided in the input.
* test.inp - input to the core scaffolding engine.
* test.grommit.conf - configuration information for the scaffolding engine

To view the graphical output you need to convert it to Postscript with the command:

dot -Tps -o test.ps test.dot

Then you can view the postscript file with the command:

gv test.ps

At TIGR, instead of specifying a .mates file you can use the database:

goBambus -c test.contig -D gbx -U access -P access -o test

==== Getting more (or less) information from the output ====

You can modify the information you get in the output by directly calling the printScaff command. PrintScaff requires as inputs the [[#.evidence.xml|.evidence.xml]] file, the [[#.out.xml|.out.xml]] file and the [[#.lib|.lib]] file produced by Bambus, the minimal invocation being:

printScaff -e test.evidence.xml -s test.out.xml -l test.lib

Optionally you may specify the output prefix with option -o (just like in the Bambus invocation). PrintScaff has parameters that control what gets reported, and in what format. In the first category you have:

* -dot - produce a .dot file
* -detail - produce a .details file
* -oo - produce a [[#.oo file|.oo]] file listing all the contigs in each scaffold
* -sum - produce a [[#.sum file|.sum]] tab delimited list of scaffold stats (#contigs, size, and span)
* -f <fasta_file> - generate a pseudo-molecule for each scaffold using the contig sequences listed in <fasta_file>. Note that the contigs in the scaffold files are named "contig_<id>" while the contigs in the <fasta_file> must simply be called "<id>". Note also that this option together with the default -merge option (see below) is meaningless unless you've [[#Untangling scaffolds|untangled]] the scaffold.
* -phys - create a .phys file listing all "gaps" spanned only by the specified libraries:

printScaff -e test.evidence.xml -s test.out.xml -l test.lib -phys MUMmer

will generate a file called test.phys that contains all contig pairs linked by nothing but MUMmer links.

In the second category you have:

* -page - produce .dot file formatted for printing on 8.5x11" paper
* -plot - produce .dot file formatted for printing on a plotter (36x48" paper)
* -unused - draw edges corresponding to unused links
* -merge - (default) when the -f option is given, produce a pseudo-molecule for each scaffold by adding 60 N characters between the contigs in the scaffold
* -nomerge - when the -f option is given, create a fasta file containing all contigs in each scaffold in the correct orientation

==== Adding additional XML linking information ====

You can easily add additional linking information as long as it's in a format similar to the [[#.evidence.xml|.evidence.xml file]]. Assuming we have two such files "link1.xml" and "link2.xml" you can call Bambus as follows:

goBambus -c test.contig -m test.mates -x link1.xml -x link2.xml -o test

The result is the same as if you had concatenated the XML file inferred from the .contig file together with the two additional XML files.

==== Using MUMmer links ====

In case you are sequencing a genome for which a close relative has been completed, you can use [[MUMmer]] to infer links between contigs. Assume you have the reference genome in a file "ref.fasta" and all the contigs from the genome you are assembling in a file called "test.fasta", you can run MUMmer to align the test genome to the reference:

nucmer -maxmatch ref.fasta test.fasta

The output will be placed in a file called out.delta. You can then convert the alignment information into an XML file suitable for Bambus as follows:

show-tiling -x out.delta > test.mum.xml

And finally you can provide this information to Bambus:

goBambus -c test.contig -m test.mates -x test.mum.xml -o test

==== Using Makefiles ====

If you have multiple XML files, it becomes tedious to generate each file separately and then provide them to Bambus with multiple -x options. You can create a Makefile compatible with GNU make that generates each of the XML files. You must be careful that the targets for all the XML files end in ".xml", and no other targets (such as intermediate targets) have the same suffix. Bambus will assume that all .xml files produced by the Makefile must be added to the input.

An example of a simple Makefile that can generate the mummer information described above is:

test.mum.xml: ref.fasta test.fasta
nucmer -maxmatch ref.fasta test.fasta
show-tiling -x out.delta > test.mum.xml

==== Untangling scaffolds ====

By default, the scaffolds resulting from Bambus are potentially ambiguous as two or more contigs may occupy the same place in the genome. Such situations occur either due to misassembled repeats, or when assembling different haplotypes. Bambus contains a utility that does a best effort attempt to disambiguate the scaffolds, by breaking them into a set of non-ambiguous scaffolds. The algorithm is greedy and does not guarantee that an optimal solution (e.g. a minimum number of longest scaffolds) is produced.

To run the untangler type:

untangle -e test.evidence.xml -s test.out.xml -o test.untangle.xml

The test.untangle.xml file has the same format as the output file ([[#.out.xml|.out.xml]]) and for each scaffold (e.g. scaff_1) contains one or more unambiguous scaffolds called "scaff_1_1, scaff_1_2, etc". You can use the [[#Getting more (or less) information from the output|printScaff]] command to process the untangled file the same way you would process a normal Bambus output.

== File formats ==

=== The configuration file ===

Scaffolding parameters can be specified in a configuration file. Whenever you run Bambus a default configuration file is generated as default.conf. To create your own file you should rename this file then edit it. The following types of information can be specified:

* Comments. Lines starting with # are considered comments and are ignored by the software.

* Priority information. Specifies an order in which the libraries are considered. For example, the following line specifies priority 2 for all MUMmer links. The libraries are considered in the increasing order of priorities.

priority MUMmer 2

If no configuration file is provided, all libraries are assumed to have the same priority. If a configuration file is provided, only those libraries that have a priority record are used in scaffolding. In other words, if a library has no priority record associated with it, it will not be used in scaffolding.

A priority can also be specified for a specific redundancy parameter. For example, the following line specifies that those links between contigs supported by 4 or more sources of linking data should be processed at priority level 1.

priority redundancy 4 1

Note, however, that you need to also specify a set of libraries to be used at this priority level. A simple shortcut is specifying that all link classes should be processed at this redundancy.

priority ALL 1

If you are uncertain of which libraries your project uses, run Bambus without a configuration file. The libraries will be listed in a file ending in ".lib".

* Redundancy information. Specifies the minimum number of links required for a valid inter-contig link. You can specify this parameter as a global value:

redundancy 2

that is, all contigs must be linked by at least two links. You can also specify a per-library value like below which says a single MUMmer link is required to link two contigs.

redundancy MUMmer 1

* Minimum scaffold size. Specifies a scaffold size cutoff. For example, the following line implies that only scaffolds that contain more than 10 kbp are present in the output:

mingroupsize 10000

* Link size error. For some link types, determining the exact link length can only be done within a certain error. This parameter specifies, as a percentage, the estimated error in size determination. For example, since MUMmer links make an assumption of evolutionary closeness between two genomes, the error associated with them should be proportional to the estimated evolutionary distance (as nucleotide % similarity) between the two genomes. For an error of 5% use this parameter:

error MUMmer 0.05

* Overlapping contigs allowed. For each library you can control the link validation process by specifying whether adjacent contigs are allowed to overlap or not. By default mate-pair links assume adjacent contigs do not overlap, while MUMmer links allow contigs to overlap. This is equivalent to the configuration parameters:

overlaps lib_1 N
overlaps MUMmer Y

An example is provided in [[bambus.conf|data/sample_files/bambus.conf]].

=== The .mates file ===

The .mates file provides two types of information: library data, and mate-pair relationships between reads.

Library data can be described in two formats:
library <name> <min_size> <max_size>

and
library <name> <min_size> <max_size> <regexp>

Both formats require you to name each library and to provide a size range for the inserts belonging to it. The second format allows you to also provide a Perl regular expression that describes the naming convention for reads belonging to the library. The part of the sequence name that represents the library name must be placed within parentheses. As an example, the regular expression for TIGR sequences (where the library is specified by the first 4 characters) is:
(....).*

Mate-pair relationships can also be described in two ways:
pair <regexp_forw> <regexp_rev>

or
<seq_forw_1> <seq_rev_1> <library_name>
<seq_forw_2> <seq_rev_2> <library_name>
<seq_forw_3> <seq_rev_3> <library_name>
...

The first format requires two regular expressions corresponding to the forward and reverse mates of an insert. Just like the library record, the portion of the name corresponding to the insert name must be placed within parentheses. Two reads that match the two regular expressions, and have the exact same section matched within the parentheses will be considered mates. The library will be determined from the regular expression associated with a "library" record. As an example, at TIGR the first 7 characters represent the insert, then are followed by an optional T or P and the primer name (F or R for Forward or Reverse). The corresponding regular expression is:
pair (.......)[TP]?F (.......)[TP]?R

The second format is simply a list of sequence name pairs each followed by the name of the library they belong to. The corresponding library records must be listed in the file before the pairing data.

'''Note that fields in the .mates file must be separated by TAB characters otherwise the program will report an error.'''

An example of a .mates file using the phred/phrap naming convention is provided in [[bambus.mates|data/sample_files/bambus.mates]].

=== Repeat files ===

The repeat files used in screening linking data conform to the output standards of the repeatFinder program. The file contains 5 TAB-delimited values: contig ID, repeat name, left and right coordinates within the contig, and contig class. Only the first 4 values are essential as far as Bambus is concerned. Here is an example of repeat records:

AC009139.7.3 RPT1A 14554 96820 1
AC026498.3.2 RPT1B 124208 206475 1

=== .evidence.xml ===

This provides an XML representation of the input file. Below is a description of the data presented in this file (see also an [[bambus.evidence.xml|example]]):

The overall flow of the XML file is:

<EVIDENCE>
<LIBRARY>
<INSERT>
<SEQUENCE/>
<SEQUENCE/>
</INSERT>
...
</LIBRARY>
...
<CONTIG>
<SEQUENCE/>
...
</CONTIG>
...
<LINK>
<CONTIG/>
<CONTIG/>
</LINK>
...
</EVIDENCE>

<EVIDENCE this is the parent tag - the whole document occurs within <EVIDENCE> and </EVIDENCE>
ID = "1" some identifier for the file
DATE = "12/15/03" date when file was created
PROJECT = "MyProject" verbose description of the project
PARAMETERS = "" parameters used in creating this file
>

<LIBRARY for each shotgun library you need a separate library tag.
ID = "lib_1" some identifier for the library
NAME = "short" verbose name for the library
MIN = "1200" minimum size of inserts in this library
MAX = "3500" maximum size of inserts in this library
>

<INSERT the insert concept links together the two reads obtained from opposite ends of a clone insert
ID="ins_1" generic identifier for the insert
NAME="GALBZ92" verbose name for the insert
>

<SEQUENCE each insert contains two sequence reads obtained from the opposite ends
ID="seq_1" generic identifier for the sequence
NAME="GALBZ92TF" verbose name for the sequence
>

<CONTIG Each contig in the assembly needs to be represented in the file
ID="contig_1" contig identifier
NAME= "1" contig name (usually the same as the identifier)
LEN="12352" contig length in basepairs
>

<SEQUENCE each sequence in the contig is listed together with information about the position within the contig.
ID="seq_1" sequence identifier (must match one described in the library section)
ORI="BE" orientation of sequence: BE - forward, EB - reverse
ASM_LEND="0" coordinate of sequence's left end within the contig
ASM_REND="525" coordinate of sequence's right end within the contig
>

<LINK this is a generic link between two contigs (i.e. not inferred from mate-pair data)
ID = "link_1" link identifier
SIZE = "-800" size of gap between the two contigs
TYPE = "MUMmer" link type: links with the same type get grouped into a virtual "library" used in specifying priorities
>

<CONTIG each link contains two contigs
ID = "contig_1" contig identifier: must match one described in the contig section above
ORI = "EB" contig orientation: BE - forward, EB - reverse
> each contig record may contain free-form data specifying the evidence for linking: e.g. alignment data

=== .out.xml ===

Represents, together with the evidence file, a description of the layout of the contigs. For each scaffold, the order (given as a coordinate along a chromosome) and the orientation of each contig is presented, together with a list of all the links used to generate this layout. Some links are deemed invalid, being given a code of "LEN" in case the length was deemed incorrect, or "ORI" in case the link orientation was considered incorrect. This file is also specified by the [[bambus.dtd|DTD]] and an example ([[bambus.out.xml|data/sample_files/bambus.out.xml]]).

=== .stats file ===

Finally, BAMBUS outputs a summary of the scaffolds generated. This file is pretty much self-explanatory, as evidenced by this example ([[bambus.stats|data/sample_files/bambus.stats]]). Note that in the .stats file, the N50 sizes are computed with respect to the total span of the scaffolds unless a different genome size is specified in a file called genome.size.

=== .details file ===

The .details file contains detailed information about the linking information between adjacent contigs. The contigs' orientation, size, and coordinates are listed and then all the linking data grouped by validity and library. An example is provided in [[bambus.details|data/sample_files/bambus.details]].

=== .dot file ===

Contains a graphical representation of the scaffolds in GraphViz format. Please see the [http://www.research.att.com/sw/tools/graphviz/ AT&T GraphViz] website for more information on this file format.

=== .ps file ===

A postscript image generated from the GraphViz-formatted file. It can be obtained from the .dot file with the command:
dot -Tps -o prefix.ps prefix.dot

An example is given in the picture below.
[[Image:bambus-display.jpg]]

Each scaffold is placed in a box, labeled with some statistics on the scaffold size (number of contigs, number of bases and span). All the "gaps" - linking relationships between the contigs are represented as edges. Each edge is decorated with the number of links contributing to it, the number of links from each library type, and the number of links invalidated due to incorrect length (L) or orientation (O). Each contig's ID, size, and coordinates within the scaffold are also listed.

=== .oo file ===

This file contains a summary of the order and orientation of all contigs present in the data. Each scaffold starts with a FASTA-like header containing the identifier for the scaffold followed by the number of contigs, size and span. Within each scaffold, all contigs are listed in the scaffold order, followed by the string BE for those in the forward orientation and EB for those in the reverse orientation. An example file is shown in: [[bambus.oo|data/sample_files/bambus.oo]].

=== .sum file ===

This file contains one line for each scaffold. Each line contains the scaffold ID, number of contigs, scaffold size, and span. The four values are separated by TAB characters. See an example in [[bambus.sum|data/sample_files/bambus.sum]].

== Known problems ==
There is a small "off-by-one" error in computing contig coordinates. It should not really affect the usefulness of the output.

== Contact information ==

BAMBUS is currently provided AS-IS, in other words we do not provide any support for the software. We would, however, like to hear your comments and suggestions. For Bambus bug reports, support requests, or any other inquiries please browse our SourceForge project page or Email us at:

amos-help (at) lists (dot) sourceforge (dot) net

BAMBUS was written by Mihai Pop and Dan Kosack.

Bambus Manual

2010-12-16T15:14:24Z

Dmb000006: /* Getting more (or less) information from the output */

{| align="right"
| __TOC__
|}

The output of most shotgun sequence assembly programs (such as TIGR Assembler, phrap, or CAP3) consists of a set of unrelated contigs, whose order and orientation along the chromosome are unknown. Scaffolding represents the task of ordering and orienting these contigs by using additional information about their relative placement. Traditionally, such information was identified from the pairing of reads from the opposite ends of an insert in double-barreled shotgun experiments (see figure).

[[Image:scaffold image.gif]]

Besides clone mate information, homology data, physical maps, or gene synteny information can be used to derive relationships between contigs. BAMBUS is a program written to handle all such types of data in a generic fashion in order to build contig scaffolds. In order to account for the varied quality of linking information BAMBUS allows the user to specify a hierarchy that will consider the most reliable linking data first, then expand the scaffolds using less reliable data. This approach minimizes the effect of errors inherent to experimentally-derived linking information.

=== Algorithm overview ===

BAMBUS has two main modes of operation: hierarchical - when the links are considered in the order of their priorities, and standard - when all the links are considered at the same priority level. In the latter case, all links connecting two contigs are bundled together to form a "gap", or "edge" if you look at the problem from a graph theoretical point of view. Each link is checked for validity before being added to a bundle. Thus, a link is considered invalid due to a length constraint if it forces the two contigs to overlap. In other words, if we assume the largest possible size for the insert corresponding to the link, if the coordinates of the mate-pair within the contigs force the contigs to overlap, we discard the link assuming it is due to a misassembly (Note: this behaviour can be specified on a link-by-link basis). Links that pass this test are further checked for consistency. If they do not all agree in the relative orientations of the contigs they imply, a majority rule is used to retain only those links that agree both in orientation and length. Furthermore, the bundle, or edge, is retained if it contains at least two links (this parameter, redundancy, can actually be tuned by the user). After this step is complete, the resulting graph gets traversed twice, once to assign consistent orientations to all the contigs, and the second time to determine the order of the contigs along the chromosome. Note that inconsistencies in order are currently allowed since they can provide finishing teams with useful information. In a future version of BAMBUS we will add an option to allow the generation of unambiguous scaffolds as well. Currently you can generate such unambiguous scaffolds using the [[#Untangling scaffolds|untangle]] program.

== Installation ==

=== Prerequisites ===

* Install AMOS
* Perl 5.6 or later
* [http://www.cpan.org/ XML::Parser] perl module
* [http://www.cpan.org/ Config::IniFiles] perl module
* [http://www.research.att.com/sw/tools/graphviz/ GraphViz] package

=== Obtaining BAMBUS ===

BAMBUS 2.3 is available free of charge under the open-source Artistic License.

The Bambus source is freely available for download from the File Release Section of our SourceForge project page.

To receive information regarding new releases and developments, please subscribe to our moderated, low-traffic users' mailing list:

amos-users(at)lists(dot)sourceforge(dot)net

=== Actual installation ===

1. Find a directory where you want the software installed. For example /users/home/CoolUser

2. Place the tar file bambus-2.33.tar.gz in this directory

3. Unpack the distribution
% tar xvzf bambus-2.33.tar.gz

4. Go into the newly created directory
% cd bambus-2.33

5. Carefully read the documentation

6. Edit the file Makefile and change the value of BASEDIR to the correct installation path, for example:
BASEDIR = /users/home/CoolUser

7. Also check if the path for perl (in variable PERL) matches the one on your system. Several common options are:
PERL = /usr/local/bin/perl
PERL = /usr/bin/perl

8. Type 'gmake install' to have all the software installed. The executables will be in /users/home/CoolUser/bin/ and the documentation in /users/home/CoolUser/doc.

9. To get started you can try out the small test dataset provided in data/test_run by typing:

goBambus -c test.contig -m test.mates -o test-bambus

== Running BAMBUS ==

The input to Bambus consists of a set of links between contigs. These links can be inferred from the pairing of reads belonging to the same insert, from physical map data, or from alignment to another genome. To accommodate these types of sources of linking information, and to allow for further extensions, the input to Bambus is presented in a general purpose XML format.

The following sections describe how the XML file can be generated for each type of linking data. For more information see [[#.evidence.xml|.evidence.xml]].

Linking information is grouped into "libraries". All links within a library have similar parameters. In the case of mate-pair linking information the libraries match the standard definition. For other linking data, for example MUMmer links, the library grouping reflects specific characteristics of the linking data. For example all MUMmer links are grouped within a library called "MUMmer".

=== Command line options ===

Bambus consists of a collection of programs controlled by a script called goBambus. The execution is controlled by the following sets of parameters. To obtain a list of all parameters and basic help information you need to type:

goBambus -h
or
goBambus -help

==== Configuration options ====

* -C <conf_file> . This parameter specifies the configuration file used by Bambus. A configuration file is not required, however it is useful if you want to change the default parameters. See [[#The configuration file|the configuration file]] for a detailed description of the configuration information.

Whenever you run Bambus it will generate a default configuration file called default.conf. If you want to modify the parameters, you will need to rename this file and then edit it.

==== Input Options ====

* -a <asm_file> . Use a TIGR Assembler .asm file as source of linking information.
* -c <contig_file> . Use a GDE formatted .contig assembly file as source of linking information. The .contig files can be obtained either from TIGR Assembler output (by concatenating the contents of the .align directory) or by converting .ace files (created by phrap or Consed) using the [[ace2contig]] package.
* -x <xml_file>. Use additional XML formatted linking information. You can use the -x option multiple times. See [[#.evidence.xml|.evidence.xml]] for the XML file format.
* -mx <make_file>. Use a gmake compatible make file to specify how the additional XML information is generated. Each of the files created must end in ".xml". Moreover, only the final XML targets are allowed to end in ".xml" to prevent goBambus from getting confused. See [[#Using Makefiles|using Makefiles]] for a more detailed description on using makefiles.

==== Mate-pair Information ====

Assembly output (such as .asm and .contig files) does not generally contain any information about the pairing of reads coming from opposite ends of the same insert, nor about the membership of inserts to libraries. The following parameters specify how to obtain this mate-pair information.

* -m <mates_file> . The mates file allows you to specify both library information and mate pairing information, using a system based on Perl regular expressions. See [[#The .mates file|the .mates file]].
* -D <database> . At TIGR, instead of a mates file you can opt to use a database to obtain library and mate information. Note that the -m and -D options are complementary. The following three parameters specify database options:
* -S <server> . Select a specific database server
* -U <user> . Log in with the specified user name
* -P <passwd> . Log in with the specified password

==== Output Options ====

* -o <output_prefix> . All the output file names will be generated from the <output_prefix> prefix.

==== Repeat Screening ====

Misassembled repeats can confuse Bambus, therefore it makes sense to screen the known repeats that may be misassembled. Bambus can use two types of repeat screening information:

* -r . Screen against the repeats listed in the database (option -D must be provided)
* -r <repfile> . Screen against repeats described in [[#Repeat files|repeat files]].

==== Flow-control ====

* -start <num>
* -end <num>. The execution of Bambus consists in a set of steps. The user can choose to start or end at a specific step for example in the case when she modifies the configuration file and wants to re-run just the final steps of the scaffolders. This feature should be used with caution.

=== Typical use cases ===
==== Using assembler output ====

For convenience, Bambus can use the output of an assembly program and automatically generate the required linking information. Two types of information are required:

* the tiling of reads in the assembly
* information about the pairing of reads

The first type of information can be obtained from the .asm or .contig output formats of TIGR Assembler. For more information about these file formats please refer to the [http://www.jcvi.org/cms/publications/listing/abstract/article/tigr-assembler-a-new-tool-for-assembling-large-shotgun-sequencing-projects/ TIGR Assembler documentation]. Note that the .contig file may be stripped of all sequence data as the only information used is that contained in the lines starting with #. In case you are using an assembler other than TIGR Assembler you may need to use the ta2ace package to convert the output into the .contig format.

The second type of information can be obtained from a [[#The .mates file|.mates file]], or from the database in case you are at TIGR.

A typical command line for someone outside of TIGR is:

goBambus -c test.contig -m test.mates -o test

and the execution will generate the following files:

* [[#.stats file|test.stats]] - statistics on scaffolds and libraries
* [[#.details file|test.details]] - detailed information about each contig pair (also called "gap").
* [[#.dot file|test.dot]] - GraphViz formatted description of the contig linkage information.
* [[#.evidence.xml|test.evidence.xml]] - XML representation of all the linking evidence provided.
* [[#.out.xml|test.out.xml]] - XML representation of the scaffolds.
* [[#.ps file|test.lib]] - list of the codes associated with each input library. These codes are reported on the links in the output file.
* [[#.sum file|test.sum]] - one-line summaries of all scaffolds.
* [[#.oo file|test.oo]] - order and orientation information for all the contigs.

Additionally, the program generates some "working" files that are currently not removed as they provide useful debugging information.

* test.detective.xml - linking information derived from assembler input alone. It's the same as test.evidence.xml unless additional XML files are provided in the input.
* test.inp - input to the core scaffolding engine.
* test.grommit.conf - configuration information for the scaffolding engine

To view the graphical output you need to convert it to Postscript with the command:

dot -Tps -o test.ps test.dot

Then you can view the postscript file with the command:

gv test.ps

At TIGR, instead of specifying a .mates file you can use the database:

goBambus -c test.contig -D gbx -U access -P access -o test

==== Getting more (or less) information from the output ====

You can modify the information you get in the output by directly calling the printScaff command. PrintScaff requires as inputs the [[#.evidence.xml|.evidence.xml]] file, the [[#.out.xml|.out.xml]] file and the [[#.lib|.lib]] files produced by Bambus, the minimal invocation being:

printScaff -e test.evidence.xml -s test.out.xml -l test.lib

Optionally you may specify the output prefix with option -o (just like in the Bambus invocation). PrintScaff has parameters that control what gets reported, and in what format. In the first category you have:

* -dot - produce a .dot file
* -detail - produce a .details file
* -oo - produce a [[#.oo file|.oo]] file listing all the contigs in each scaffold
* -sum - produce a [[#.sum file|.sum]] tab delimited list of scaffold stats (#contigs, size, and span)
* -f <fasta_file> - generate a pseudo-molecule for each scaffold using the contig sequences listed in <fasta_file>. Note that the contigs in the scaffold files are named "contig_<id>" while the contigs in the <fasta_file> must simply be called "<id>". Note that this option together with the default -merge option (see below) is meaningless unless you've [[#Untangling scaffolds|untangled]] the scaffold.
* -phys - create a .phys file listing all "gaps" spanned only by the specified libraries:

printScaff -e test.evidence.xml -s test.out.xml -l test.lib -phys MUMmer

will generate a file called test.phys that contains all contig pairs linked by nothing but MUMmer links.

In the second category you have:

* -page - produce .dot file formatted for printing on 8.5x11" paper
* -plot - produce .dot file formatted for printing on a plotter (36x48" paper)
* -unused - draw edges corresponding to unused links
* -merge - (default) when the -f option is given, produce a pseudo-molecule for each scaffold by adding 60 N characters between the contigs in the scaffold
* -nomerge - when the -f option is given, create a fasta file containing all contigs in each scaffold in the correct orientation

==== Adding additional XML linking information ====

You can easily add additional linking information as long as it's in a format similar to the [[#.evidence.xml file|.evidence.xml file]]. Assuming we have two such files "link1.xml" and "link2.xml" you can call Bambus as follows:

goBambus -c test.contig -m test.mates -x link1.xml -x link2.xml -o test

The result is the same as if you had concatenated the XML file inferred from the .contig file with the two additional XML files.

==== Using MUMmer links ====

In case you are sequencing a genome for which a close relative has been completed, you can use [[MUMmer]] to infer links between contigs. Assume you have the reference genome in a file "ref.fasta" and all the contigs from the genome you are assembling in a file called "test.fasta", you can run MUMmer to align the test genome to the reference:

nucmer -maxmatch ref.fasta test.fasta

The output will be placed in a file called out.delta. You can then convert the alignment information into an XML file suitable for Bambus as follows:

show-tiling -x out.delta > test.mum.xml

And finally you can provide this information to Bambus:

goBambus -c test.contig -m test.mates -x test.mum.xml -o test

==== Using Makefiles ====

If you have multiple XML files, it becomes tedious to generate each file separately and then provide them to Bambus with multiple -x options. You can create a Makefile compatible with GNU make that generates each of the XML files. You must be careful that the targets for all the XML files end in ".xml", and no other targets (such as intermediate targets) have the same suffix. Bambus will assume that all .xml files produced by the Makefile must be added to the input.

An example of a simple Makefile that can generate the mummer information described above is:

test.mum.xml: ref.fasta test.fasta
nucmer -maxmatch ref.fasta test.fasta
show-tiling -x out.delta > test.mum.xml

==== Untangling scaffolds ====

By default, the scaffolds resulting from Bambus are potentially ambiguous as two or more contigs may occupy the same place in the genome. Such situations occur either due to misassembled repeats, or when assembling different haplotypes. Bambus contains a utility that does a best effort attempt to disambiguate the scaffolds, by breaking them into a set of non-ambiguous scaffolds. The algorithm is greedy and does not guarantee that an optimal solution (e.g. a minimum number of longest scaffolds) is produced.

To run the untangler type:

untangle -e test.evidence.xml -s test.out.xml -o test.untangle.xml

The test.untangle.xml file has the same format as the output file ([[#.out.xml|.out.xml]]) and for each scaffold (e.g. scaff_1) contains one or more unambiguous scaffolds called "scaff_1_1, scaff_1_2, etc". You can use the [[#Getting more (or less) information from the output|printScaff]] command to process the untangled file the same way you would process a normal Bambus output.

== File formats ==

=== The configuration file ===

Scaffolding parameters can be specified in a configuration file. Whenever you run Bambus a default configuration file is generated as default.conf. To create your own file you should rename this file then edit it. The following types of information can be specified:

* Comments. Lines starting with # are considered comments and are ignored by the software.

* Priority information. Specifies an order in which the libraries are considered. For example, the following line specifies priority 2 for all MUMmer links. The libraries are considered in the increasing order of priorities.

priority MUMmer 2

If no configuration file is provided, all libraries are assumed to have the same priority. If a configuration file is provided, only those libraries that have a priority record are used in scaffolding. In other words, if a library has no priority record associated with it, it will not be used in scaffolding.

A priority can also be specified for a specific redundancy parameter. For example, the following line specifies that those links between contigs supported by 4 or more sources of linking data should be processed at priority level 1.

priority redundancy 4 1

Note, however, that you need to also specify a set of libraries to be used at this priority level. A simple shortcut is specifying that all link classes should be processed at this redundancy.

priority ALL 1

If you are uncertain of which libraries your project uses, run Bambus without a configuration file. The libraries will be listed in a file ending in ".lib".

* Redundancy information. Specifies the minimum number of links required for a valid inter-contig link. You can specify this parameter as a global value:

redundancy 2

that is, all contigs must be linked by at least two links. You can also specify a per-library value like below which says a single MUMmer link is required to link two contigs.

redundancy MUMmer 1

* Minimum scaffold size. Specifies a scaffold size cutoff. For example, the following line implies that only scaffolds that contain more than 10 kbp are present in the output:

mingroupsize 10000

* Link size error. For some link types, determining the exact link length can only be done within a certain error. This parameter specifies, as a percentage, the estimated error in size determination. For example, since MUMmer links make an assumption of evolutionary closeness between two genomes, the error associated with them should be proportional to the estimated evolutionary distance (as nucleotide % similarity) between the two genomes. For an error of 5% use this parameter:

error MUMmer 0.05

* Overlapping contigs allowed. For each library you can control the link validation process by specifying whether adjacent contigs are allowed to overlap or not. By default mate-pair links assume adjacent contigs do not overlap, while MUMmer links allow contigs to overlap. This is equivalent to the configuration parameters:

overlaps lib_1 N
overlaps MUMmer Y

An example is provided in [[bambus.conf|data/sample_files/bambus.conf]].

=== The .mates file ===

The .mates file provides two types of information: library data, and mate-pair relationships between reads.

Library data can be described in two formats:
library <name> <min_size> <max_size>

and
library <name> <min_size> <max_size> <regexp>

Both formats require you to name each library and to provide a size range for the inserts belonging to it. The second format allows you to also provide a Perl regular expression that describes the naming convention for reads belonging to the library. The part of the sequence name that represents the library name must be placed within parentheses. As an example, the regular expression for TIGR sequences (where the library is specified by the first 4 characters) is:
(....).*

Mate-pair relationships can also be described in two ways:
pair <regexp_forw> <regexp_rev>

or
<seq_forw_1> <seq_rev_1> <library_name>
<seq_forw_2> <seq_rev_2> <library_name>
<seq_forw_3> <seq_rev_3> <library_name>
...

The first format requires two regular expressions corresponding to the forward and reverse mates of an insert. Just like the library record, the portion of the name corresponding to the insert name must be placed within parentheses. Two reads that match the two regular expressions, and have the exact same section matched within the parentheses will be considered mates. The library will be determined from the regular expression associated with a "library" record. As an example, at TIGR the first 7 characters represent the insert, then are followed by an optional T or P and the primer name (F or R for Forward or Reverse). The corresponding regular expression is:
pair (.......)[TP]?F (.......)[TP]?R

The second format is simply a list of sequence name pairs each followed by the name of the library they belong to. The corresponding library records must be listed in the file before the pairing data.

'''Note that fields in the .mates file must be separated by TAB characters otherwise the program will report an error.'''

An example of a .mates file using the phred/phrap naming convention is provided in [[bambus.mates|data/sample_files/bambus.mates]].

=== Repeat files ===

The repeat files used in screening linking data conform to the output standards of the repeatFinder program. The file contains 5 TAB-delimited values: contig ID, repeat name, left and right coordinates within the contig, and contig class. Only the first 4 values are essential as far as Bambus is concerned. Here is an example of repeat records:

AC009139.7.3 RPT1A 14554 96820 1
AC026498.3.2 RPT1B 124208 206475 1

=== .evidence.xml ===

This provides an XML representation of the input file. Below is a description of the data presented in this file (see also an [[bambus.evidence.xml|example]]):

The overall flow of the XML file is:

<EVIDENCE>
<LIBRARY>
<INSERT>
<SEQUENCE/>
<SEQUENCE/>
</INSERT>
...
</LIBRARY>
...
<CONTIG>
<SEQUENCE/>
...
</CONTIG>
...
<LINK>
<CONTIG/>
<CONTIG/>
</LINK>
...
</EVIDENCE>

<EVIDENCE this is the parent tag - the whole document occurs within <EVIDENCE> and </EVIDENCE>
ID = "1" some identifier for the file
DATE = "12/15/03" date when file was created
PROJECT = "MyProject" verbose description of the project
PARAMETERS = "" parameters used in creating this file
>

<LIBRARY for each shotgun library you need a separate library tag.
ID = "lib_1" some identifier for the library
NAME = "short" verbose name for the library
MIN = "1200" minimum size of inserts in this library
MAX = "3500" maximum size of inserts in this library
>

<INSERT the insert concept links together the two reads obtained from opposite ends of a clone insert
ID="ins_1" generic identifier for the insert
NAME="GALBZ92" verbose name for the insert
>

<SEQUENCE each insert contains two sequence reads obtained from the opposite ends
ID="seq_1" generic identifier for the sequence
NAME="GALBZ92TF" verbose name for the sequence
>

<CONTIG Each contig in the assembly needs to be represented in the file
ID="contig_1" contig identifier
NAME= "1" contig name (usually the same as the identifier)
LEN="12352" contig length in basepairs
>

<SEQUENCE each sequence in the contig is listed together with information about the position within the contig.
ID="seq_1" sequence identifier (must match one described in the library section)
ORI="BE" orientation of sequence: BE - forward, EB - reverse
ASM_LEND="0" coordinate of sequence's left end within the contig
ASM_REND="525" coordinate of sequence's right end within the contig
>

<LINK this is a generic link between two contigs (i.e. not inferred from mate-pair data)
ID = "link_1" link identifier
SIZE = "-800" size of gap between the two contigs
TYPE = "MUMmer" link type: links with the same type get grouped into a virtual "library" used in specifying priorities
>

<CONTIG each link contains two contigs
ID = "contig_1" contig identifier: must match one described in the contig section above
ORI = "EB" contig orientation: BE - forward, EB - reverse
> each contig record may contain free-form data specifying the evidence for linking: e.g. alignment data

=== .out.xml ===

Represents, together with the evidence file, a description of the layout of the contigs. For each scaffold, the order (given as a coordinate along a chromosome) and the orientation of each contig is presented, together with a list of all the links used to generate this layout. Some links are deemed invalid, being given a code of "LEN" in case the length was deemed incorrect, or "ORI" in case the link orientation was considered incorrect. This file is also specified by the [[bambus.dtd|DTD]] and an example ([[bambus.out.xml|data/sample_files/bambus.out.xml]]).

=== .stats file ===

Finally, BAMBUS outputs a summary of the scaffolds generated. This file is pretty much self-explanatory, as evidenced by this example ([[bambus.stats|data/sample_files/bambus.stats]]). Note that in the .stats file, the N50 sizes are computed with respect to the total span of the scaffolds unless a different genome size is specified in a file called genome.size.

=== .details file ===

The .details file contains detailed information about the linking information between adjacent contigs. The contigs' orientation, size, and coordinates are listed and then all the linking data grouped by validity and library. An example is provided in [[bambus.details|data/sample_files/bambus.details]].

=== .dot file ===

Contains a graphical representation of the scaffolds in GraphViz format. Please see the [http://www.research.att.com/sw/tools/graphviz/ AT&T GraphViz] website for more information on this file format.

=== .ps file ===

A postscript image generated from the GraphViz-formatted file. It can be obtained from the .dot file with the command:
dot -Tps -o prefix.ps prefix.dot

An example is given in the picture below.
[[Image:bambus-display.jpg]]

Each scaffold is placed in a box, labeled with some statistics on the scaffold size (number of contigs, number of bases and span). All the "gaps" - linking relationships between the contigs are represented as edges. Each edge is decorated with the number of links contributing to it, the number of links from each library type, and the number of links invalidated due to incorrect length (L) or orientation (O). Each contig's ID, size, and coordinates within the scaffold are also listed.

=== .oo file ===

This file contains a summary of the order and orientation of all contigs present in the data. Each scaffold starts with a FASTA-like header containing the identifier for the scaffold followed by the number of contigs, size and span. Within each scaffold, all contigs are listed in the scaffold order, followed by the string BE for those in the forward orientation and EB for those in the reverse orientation. An example file is shown in: [[bambus.oo|data/sample_files/bambus.oo]].

=== .sum file ===

This file contains one line for each scaffold. Each line contains the scaffold ID, number of contigs, scaffold size, and span. The four values are separated by TAB characters. See an example in [[bambus.sum|data/sample_files/bambus.sum]].

== Known problems ==
There is a small "off-by-one" error in computing contig coordinates. It should not really affect the usefulness of the output.

== Contact information ==

BAMBUS is currently provided AS-IS, in other words we do not provide any support for the software. We would, however, like to hear your comments and suggestions. For Bambus bug reports, support requests, or any other inquiries please browse our SourceForge project page or Email us at:

amos-help (at) lists (dot) sourceforge (dot) net

BAMBUS was written by Mihai Pop and Dan Kosack.

Bambus Manual

2010-12-16T15:08:50Z

Dmb000006: /* Getting more (or less) information from the output */ I'm fairly sure that that was a formatting error

{| align="right"
| __TOC__
|}

The output of most shotgun sequence assembly programs (such as TIGR Assembler, phrap, or CAP3) consists of a set of unrelated contigs whose order and orientation along the chromosome are unknown. Scaffolding is the task of ordering and orienting these contigs by using additional information about their relative placement. Traditionally, such information was identified from the pairing of reads from the opposite ends of an insert in double-barrelled shotgun experiments (see figure).

[[Image:scaffold image.gif]]

Besides clone mate information, homology data, physical maps, or gene synteny information can be used to derive relationships between contigs. BAMBUS is a program written to handle all such types of data in a generic fashion in order to build contig scaffolds. In order to account for the varied quality of linking information BAMBUS allows the user to specify a hierarchy that will consider the most reliable linking data first, then expand the scaffolds using less reliable data. This approach minimizes the effect of errors inherent to experimentally-derived linking information.

=== Algorithm overview ===

BAMBUS has two main modes of operation: hierarchical - when the links are considered in the order of their priorities, and standard - when all the links are considered at the same priority level. In the latter case, all links connecting two contigs are bundled together to form a "gap", or "edge" if you look at the problem from a graph theoretical point of view. Each link is checked for validity before being added to a bundle. Thus, a link is considered invalid due to a length constraint if it forces the two contigs to overlap. In other words, if we assume the largest possible size for the insert corresponding to the link, if the coordinates of the mate-pair within the contigs force the contigs to overlap, we discard the link assuming it is due to a misassembly (Note: this behaviour can be specified on a link-by-link basis). Links that pass this test are further checked for consistency. If they do not all agree in the relative orientations of the contigs they imply, a majority rule is used to retain only those links that agree both in orientation and length. Furthermore, the bundle, or edge, is retained if it contains at least two links (this parameter, redundancy, can actually be tuned by the user). After this step is complete, the resulting graph gets traversed twice, once to assign consistent orientations to all the contigs, and the second time to determine the order of the contigs along the chromosome. Note that inconsistencies in order are currently allowed since they can provide finishing teams with useful information. In a future version of BAMBUS we will add an option to allow the generation of unambiguous scaffolds as well. Currently you can generate such unambiguous scaffolds using the [[#Untangling scaffolds|untangle]] program.

== Installation ==

=== Prerequisites ===

* Install AMOS
* Perl 5.6 or later
* [http://www.cpan.org/ XML::Parser] perl module
* [http://www.cpan.org/ Config::IniFiles] perl module
* [http://www.research.att.com/sw/tools/graphviz/ GraphViz] package

=== Obtaining BAMBUS ===

BAMBUS 2.3 is available free of charge under the open-source Artistic License.

The Bambus source is freely available for download from the File Release Section of our SourceForge project page.

To receive information regarding new releases and developments, please subscribe to our moderated, low-traffic users' mailing list:

amos-users(at)lists(dot)sourceforge(dot)net

=== Actual installation ===

1. Find a directory where you want the software installed. For example /users/home/CoolUser

2. Place the tar file bambus-2.33.tar.gz in this directory

3. Unpack the distribution
% tar xvzf bambus-2.33.tar.gz

4. Go into the newly created directory
% cd bambus-2.33

5. Carefully read the documentation

6. Edit the file Makefile and change the value of BASEDIR to the correct installation path, for example:
BASEDIR = /users/home/CoolUser

7. Also check if the path for perl (in variable PERL) matches the one on your system. Several common options are:
PERL = /usr/local/bin/perl
PERL = /usr/bin/perl

8. Type 'gmake install' to have all the software installed. The executables will be in /users/home/CoolUser/bin/ and the documentation in /users/home/CoolUser/doc.

9. To get started you can try out the small test dataset provided in data/test_run by typing:

goBambus -c test.contig -m test.mates -o test-bambus

== Running BAMBUS ==

The input to Bambus consists of a set of links between contigs. These links can be inferred from the pairing of reads belonging to the same insert, from physical map data, or from alignment to another genome. To accommodate these types of sources of linking information, and to allow for further extensions, the input to Bambus is presented in a general purpose XML format.

The following sections describe how the XML file can be generated for each type of linking data. For more information see [[#.evidence.xml|.evidence.xml]].

Linking information is grouped into "libraries". All links within a library have similar parameters. In the case of mate-pair linking information the libraries match the standard definition. For other linking data, for example MUMmer links, the library grouping reflects specific characteristics of the linking data. For example all MUMmer links are grouped within a library called "MUMmer".

=== Command line options ===

Bambus consists in a collection of programs controlled by a script called goBambus. The execution is controlled by the following sets of parameters. To obtain a list of all parameters and basic help information you need to type:

goBambus -h
or
goBambus -help

==== Configuration options ====

* -C <conf_file> . This parameter specifies the configuration file used by Bambus. A configuration file is not required, however it is useful if you want to change the default parameters. See [[#The configuration file|the configuration file]] for a detailed description of the configuration information.

Whenever you run Bambus it will generate a default configuration file called default.conf. If you want to modify the parameters, you will need to rename this file and then edit it.

==== Input Options ====

* -a <asm_file> . Use a TIGR Assembler .asm file as source of linking information.
* -c <contig_file> . Use a GDE formatted .contig assembly file as source of linking information. The .contig files can be obtained either from TIGR Assembler output (by concatenating the contents of the .align directory) or by converting .ace files (created by phrap or Consed) using the [[ace2contig]] package.
* -x <xml_file>. Use additional XML formatted linking information. You can use the -x option multiple times. See [[#.evidence.xml|.evidence.xml]] for the XML file format.
* -mx <make_file>. Use a gmake compatible make file to specify how the additional XML information is generated. Each of the files created must end in ".xml". Moreover, only the final XML targets are allowed to end in ".xml" to prevent goBambus from getting confused. See [[#Using Makefiles|using Makefiles]] for a more detailed description on using makefiles.

==== Mate-pair Information ====

Assembly output (such as .asm and .contig files) does not generally contain any information about the pairing of reads coming from opposite ends of the same insert, nor about the membership of inserts to libraries. The following parameters specify how to obtain this mate-pair information.

* -m <mates_file> . The mates file allows you to specify both library information and mate pairing information, using a system based on Perl regular expressions. See [[#The .mates file|the .mates file]].
* -D <database> . At TIGR, instead of a mates file you can opt to use a database to obtain library and mate information. Note that the -m and -D options are complementary. The following three parameters specify database options:
* -S <server> . Select a specific database server
* -U <user> . Log in with the specified user name
* -P <passwd> . Log in with the specified password

==== Output Options ====

* -o <output_prefix> . All the output file names will be generated from the <output_prefix> prefix.

==== Repeat Screening ====

Misassembled repeats can confuse Bambus, therefore it makes sense to screen the known repeats that may be misassembled. Bambus can use two types of repeat screening information:

* -r . Screen against the repeats listed in the database (option -D must be provided)
* -r <repfile> . Screen against repeats described in [[#Repeat files|repeat files]].

==== Flow-control ====

* -start <num>
* -end <num>. The execution of Bambus consists in a set of steps. The user can choose to start or end at a specific step for example in the case when she modifies the configuration file and wants to re-run just the final steps of the scaffolders. This feature should be used with caution.

=== Typical use cases ===
==== Using assembler output ====

For convenience, Bambus can use the output of an assembly program and automatically generate the required linking information. Two types of information are required:

* the tiling of reads in the assembly
* information about the pairing of reads

The first type of information can be obtained from the .asm or .contig output formats of TIGR Assembler. For more information about these file formats please refer to the [http://www.jcvi.org/cms/publications/listing/abstract/article/tigr-assembler-a-new-tool-for-assembling-large-shotgun-sequencing-projects/ TIGR Assembler documentation]. Note that the .contig file may be stripped of all sequence data as the only information used is that contained in the lines starting with #. In case you are using an assembler other than TIGR Assembler you may need to use the ta2ace package to convert the output into the .contig format.

The second type of information can be obtained from a [[#The .mates file|.mates file]], or from the database in case you are at TIGR.

A typical command line for someone outside of TIGR is:

goBambus -c test.contig -m test.mates -o test

and the execution will generate the following files:

* [[#.stats file|test.stats]] - statistics on scaffolds and libraries
* [[#.details file|test.details]] - detailed information about each contig pair (also called "gap").
* [[#.dot file|test.dot]] - GraphViz formatted description of the contig linkage information.
* [[#.evidence.xml|test.evidence.xml]] - XML representation of all the linking evidence provided.
* [[#.out.xml|test.out.xml]] - XML representation of the scaffolds.
* [[#.ps file|test.lib]] - list of the codes associated with each input library. These codes are reported on the links in the output file.
* [[#.sum file|test.sum]] - one-line summaries of all scaffolds.
* [[#.oo file|test.oo]] - order and orientation information for all the contigs.

Additionally, the program generates some "working" files that are currently not removed as they provide useful debugging information.

* test.detective.xml - linking information derived from assembler input alone. It's the same as test.evidence.xml unless additional XML files are provided in the input.
* test.inp - input to the core scaffolding engine.
* test.grommit.conf - configuration information for the scaffolding engine

To view the graphical output you need to convert it to Postscript with the command:

dot -Tps -o test.ps test.dot

Then you can view the postscript file with the command:

gv test.ps

At TIGR, instead of specifying a .mates file you can use the database:

goBambus -c test.contig -D gbx -U access -P access -o test

==== Getting more (or less) information from the output ====

You can modify the information you get in the output by directly calling the printScaff command. PrintScaff requires as inputs the .evidence.xml file, the .out.xml file and the .lib file produced by Bambus, the minimal invocation being:

printScaff -e test.evidence.xml -s test.out.xml -l test.lib

Optionally you may specify the output prefix with option -o (just like in the Bambus invocation). PrintScaff has parameters that control what gets reported, and in what format. In the first category you have:

* -dot - produce a .dot file
* -detail - produce a .details file
* -oo - produce a [[#.oo file|.oo]] file listing all the contigs in each scaffold
* -sum - produce a [[#.sum file|.sum]] tab delimited list of scaffold stats (#contigs, size, and span)
* -f <fasta_file> - generate a pseudo-molecule for each scaffold using the contig sequences listed in <fasta_file>. Note that the contigs in the scaffold files are named "contig_<id>" while the contigs in the <fasta_file> must simply be called "<id>". Note that this option together with the default -merge option (see below) is meaningless unless you've [[#Untangling scaffolds|untangled]] the scaffold.
* -phys - create a .phys file listing all "gaps" spanned only by the specified libraries:

printScaff -e test.evidence.xml -s test.out.xml -l test.lib -phys MUMmer

will generate a file called test.phys that contains all contig pairs linked by nothing but MUMmer links.

In the second category you have:

* -page - produce .dot file formatted for printing on 8.5x11" paper
* -plot - produce .dot file formatted for printing on a plotter (36x48" paper)
* -unused - draw edges corresponding to unused links
* -merge - (default) when the -f option is given, produce a pseudo-molecule for each scaffold by adding 60 N characters between the contigs in the scaffold
* -nomerge - when the -f option is given, create a fasta file containing all contigs in each scaffold in the correct orientation

==== Adding additional XML linking information ====

You can easily add additional linking information as long as it's in a format similar to the [[#.evidence.xml file|.evidence.xml file]]. Assuming we have two such files "link1.xml" and "link2.xml" you can call Bambus as follows:

goBambus -c test.contig -m test.mates -x link1.xml -x link2.xml -o test

The result being the same as if you concatenated the XML file inferred from the .contig file together with the two additional XML files.

==== Using MUMmer links ====

In case you are sequencing a genome for which a close relative has been completed, you can use [[MUMmer]] to infer links between contigs. Assume you have the reference genome in a file "ref.fasta" and all the contigs from the genome you are assembling in a file called "test.fasta", you can run MUMmer to align the test genome to the reference:

nucmer -maxmatch ref.fasta test.fasta

The output will be placed in a file called out.delta. You can then convert the alignment information into an XML file suitable for Bambus as follows:

show-tiling -x out.delta > test.mum.xml

And finally you can provide this information to Bambus:

goBambus -c test.contig -m test.mates -x test.mum.xml -o test

==== Using Makefiles ====

If you have multiple XML files, it becomes tedious to generate each file separately and then provide them to Bambus with multiple -x options. You can create a Makefile compatible with GNU make that generates each of the XML files. You must be careful that the targets for all the XML files end in ".xml", and no other targets (such as intermediate targets) have the same suffix. Bambus will assume that all .xml files produced by the Makefile must be added to the input.

An example of a simple Makefile that can generate the mummer information described above is:

test.mum.xml: ref.fasta test.fasta
nucmer -maxmatch ref.fasta test.fasta
show-tiling -x out.delta > test.mum.xml

==== Untangling scaffolds ====

By default, the scaffolds resulting from Bambus are potentially ambiguous as two or more contigs may occupy the same place in the genome. Such situations occur either due to misassembled repeats, or when assembling different haplotypes. Bambus contains a utility that does a best effort attempt to disambiguate the scaffolds, by breaking them into a set of non-ambiguous scaffolds. The algorithm is greedy and does not guarantee that an optimal solution (e.g. a minimum number of longest scaffolds) is produced.

To run the untangler type:

untangle -e test.evidence.xml -s test.out.xml -o test.untangle.xml

The test.untangle.xml file has the same format as the output file ([[#.out.xml|.out.xml]]) and for each scaffold (e.g. scaff_1) contains one or more unambiguous scaffolds called "scaff_1_1, scaff_1_2, etc". You can use the [[#Getting more (or less) information from the output|printScaff]] command to process the untangled file the same way you would process a normal Bambus output.

== File formats ==

=== The configuration file ===

Scaffolding parameters can be specified in a configuration file. Whenever you run Bambus a default configuration file is generated as default.conf. To create your own file you should rename this file then edit it. The following types of information can be specified:

* Comments. Lines starting with # are considered comments and are ignored by the software.

* Priority information. Specifies an order in which the libraries are considered. For example, the following line specifies priority 2 for all MUMmer links. The libraries are considered in the increasing order of priorities.

priority MUMmer 2

If no configuration file is provided, all libraries are assumed to have the same priority. If a configuration file is provided, only those libraries that have a priority record are used in scaffolding. In other words, if a library has no priority record associated with it, it will not be used in scaffolding.

A priority can also be specified for a specific redundancy parameter. For example, the following line specifies that those links between contigs supported by 4 or more sources of linking data should be processed at priority level 1.

priority redundancy 4 1

Note, however, that you need to also specify a set of libraries to be used at this priority level. A simple shortcut is specifying that all link classes should be processed at this redundancy.

priority ALL 1

If you are uncertain of which libraries your project uses, run Bambus without a configuration file. The libraries will be listed in a file ending in ".lib".

* Redundancy information. Specifies the minimum number of links required for a valid inter-contig link. You can specify this parameter as a global value:

redundancy 2

that is, all contigs must be linked by at least two links. You can also specify a per-library value like below which says a single MUMmer link is required to link two contigs.

redundancy MUMmer 1

* Minimum scaffold size. Specifies a scaffold size cutoff. For example, this implies that only scaffolds that contain more than 10kbp are present in the output:

mingroupsize 10000

* Link size error. For some link types, determining the exact link length can only be done within a certain error. This parameter specifies, as a percentage, the estimated error in size determination. For example, since MUMmer links make an assumption of evolutionary closeness between two genomes, the error associated with them should be proportional to the estimated evolutionary distance (as nucleotide % similarity) between the two genomes. For an error of 5% use this parameter:

error MUMmer 0.05

* Overlapping contigs allowed. For each library you can control the link validation process by specifying whether adjacent contigs are allowed to overlap or not. By default mate-pair links assume adjacent contigs do not overlap, while MUMmer links allow contigs to overlap. This is equivalent to the configuration parameters:

overlaps lib_1 N
overlaps MUMmer Y

An example is provided in [[bambus.conf|data/sample_files/bambus.conf]].

=== The .mates file ===

The .mates file provides two types of information: library data, and mate-pair relationships between reads.

Library data can be described in two formats:
library <name> <min_size> <max_size>

and
library <name> <min_size> <max_size> <regexp>

Both formats require you to name each library and to provide a size range for the inserts belonging to it. The second format allows you to also provide a Perl regular expression that describes the naming convention for reads belonging to the library. The part of the sequence name that represents the library name must be placed within parentheses. As an example, the regular expression for TIGR sequences (where the library is specified by the first 4 characters) is:
(....).*

Mate-pair relationships can also be described in two ways:
pair <regexp_forw> <regexp_rev>

or
<seq_forw_1> <seq_rev_1> <library_name>
<seq_forw_2> <seq_rev_2> <library_name>
<seq_forw_3> <seq_rev_3> <library_name>
...

The first format requires two regular expressions corresponding to the forward and reverse mates of an insert. Just like the library record, the portion of the name corresponding to the insert name must be placed within parentheses. Two reads that match the two regular expressions, and have the exact same section matched within the parentheses will be considered mates. The library will be determined from the regular expression associated with a "library" record. As an example, at TIGR the first 7 characters represent the insert, then are followed by an optional T or P and the primer name (F or R for Forward or Reverse). The corresponding regular expression is:
pair (.......)[TP]?F (.......)[TP]?R

The second format is simply a list of sequence name pairs each followed by the name of the library they belong to. The corresponding library records must be listed in the file before the pairing data.

'''Note that fields in the .mates file must be separated by TAB characters otherwise the program will report an error.'''

An example of a .mates file using the phred/phrap naming convention is provided in [[bambus.mates|data/sample_files/bambus.mates]].

=== Repeat files ===

The repeat files used in screening linking data conform to the output standards of the repeatFinder program. The file contains 5 TAB-delimited values: contig ID, repeat name, left and right coordinates within the contig, and contig class. Only the first 4 values are essential as far as Bambus is concerned. Here is an example of repeat records:

AC009139.7.3 RPT1A 14554 96820 1
AC026498.3.2 RPT1B 124208 206475 1

=== .evidence.xml ===

This provides an XML representation of the input file. Below is a description of the data presented in this file (see also an [[bambus.evidence.xml|example]]):

The overall flow of the XML file is:

<EVIDENCE>
<LIBRARY>
<INSERT>
<SEQUENCE/>
<SEQUENCE/>
</INSERT>
...
</LIBRARY>
...
<CONTIG>
<SEQUENCE/>
...
</CONTIG>
...
<LINK>
<CONTIG/>
<CONTIG/>
</LINK>
...
</EVIDENCE>

<EVIDENCE this is the parent tag - the whole document occurs within <EVIDENCE> and </EVIDENCE>
ID = "1" some identifier for the file
DATE = "12/15/03" date when file was created
PROJECT = "MyProject" verbose description of the project
PARAMETERS = "" parameters used in creating this file
>

<LIBRARY for each shotgun library you need a separate library tag.
ID = "lib_1" some identifier for the library
NAME = "short" verbose name for the library
MIN = "1200" minimum size of inserts in this library
MAX = "3500" maximum size of inserts in this library
>

<INSERT the insert concept links together the two reads obtained from opposite ends of a clone insert
ID="ins_1" generic identifier for the insert
NAME="GALBZ92" verbose name for the insert
>

<SEQUENCE each insert contains two sequence reads obtained from the opposite ends
ID="seq_1" generic identifier for the sequence
NAME="GALBZ92TF" verbose name for the sequence
>

<CONTIG Each contig in the assembly needs to be represented in the file
ID="contig_1" contig identifier
NAME= "1" contig name (usually the same as the identifier)
LEN="12352" contig length in basepairs
>

<SEQUENCE each sequence in the contig is listed together with information about the position within the contig.
ID="seq_1" sequence identifier (must match one described in the library section)
ORI="BE" orientation of sequence: BE - forward, EB - reverse
ASM_LEND="0" coordinate of sequence's left end within the contig
ASM_REND="525" coordinate of sequence's right end within the contig
>

<LINK this is a generic link between two contigs (i.e. not inferred from mate-pair data)
ID = "link_1" link identifier
SIZE = "-800" size of gap between the two contigs
TYPE = "MUMmer" link type: links with the same type get grouped into a virtual "library" used in specifying priorities
>

<CONTIG each link contains two contigs
ID = "contig_1" contig identifier: must match one described in the contig section above
ORI = "EB" contig orientation: BE - forward, EB - reverse
> each contig record may contain free-form data specifying the evidence for linking: e.g. alignment data

=== .out.xml ===

Represents, together with the evidence file, a description of the layout of the contigs. For each scaffold, the order (given as a coordinate along a chromosome) and the orientation of each contig is presented, together with a list of all the links used to generate this layout. Some links are deemed invalid, being given a code of "LEN" in case the length was deemed incorrect, or "ORI" in case the link orientation was considered incorrect. The file format is specified by the [[bambus.dtd|DTD]]; see also an example ([[bambus.out.xml|data/sample_files/bambus.out.xml]]).

=== .stats file ===

Finally, BAMBUS outputs a summary of the scaffolds generated. This file is pretty much self-explanatory, as evidenced by this example ([[bambus.stats|data/sample_files/bambus.stats]]). Note that in the .stats file, the N50 sizes are computed with respect to the total span of the scaffolds unless a different genome size is specified in a file called genome.size.

=== .details file ===

The .details file contains detailed information about the linking information between adjacent contigs. The contigs' orientation, size, and coordinates are listed and then all the linking data grouped by validity and library. An example is provided in [[bambus.details|data/sample_files/bambus.details]].

=== .dot file ===

Provides a graphical representation of the scaffolds in GraphViz format. Please see the [http://www.research.att.com/sw/tools/graphviz/ AT&T GraphViz] website for more information on this file format.

=== .ps file ===

A postscript image generated from the GraphViz-formatted file. It can be obtained from the .dot file with the command:
dot -Tps -o prefix.ps prefix.dot

An example is given in the picture below.
[[Image:bambus-display.jpg]]

Each scaffold is placed in a box, labeled with some statistics on the scaffold size (number of contigs, number of bases and span). All the "gaps" - linking relationships between the contigs are represented as edges. Each edge is decorated with the number of links contributing to it, the number of links from each library type, and the number of links invalidated due to incorrect length (L) or orientation (O). Each contig's ID, size, and coordinates within the scaffold are also listed.

=== .oo file ===

This file contains a summary of the order and orientation of all contigs present in the data. Each scaffold starts with a FASTA-like header containing the identifier for the scaffold followed by the number of contigs, size and span. Within each scaffold, all contigs are listed in the scaffold order, followed by the string BE for those in the forward orientation and EB for those in the reverse orientation. An example file is shown in: [[bambus.oo|data/sample_files/bambus.oo]].

=== .sum file ===

This file contains one line for each scaffold. Each line contains the scaffold ID, number of contigs, scaffold size, and span. The four values are separated by TAB characters. See an example in [[bambus.sum|data/sample_files/bambus.sum]].

== Known problems ==
There is a small "off-by-one" error in computing contig coordinates. It should not really affect the usefulness of the output.

== Contact information ==

BAMBUS is currently provided AS-IS, in other words we do not provide any support for the software. We would, however, like to hear your comments and suggestions. For Bambus bug reports, support requests, or any other inquiries please browse our SourceForge project page or Email us at:

amos-help (at) lists (dot) sourceforge (dot) net

BAMBUS was written by Mihai Pop and Dan Kosack.

Bambus Manual

2010-12-16T15:06:59Z

Dmb000006: /* The .mates file */ More tabs. Minor formatting changes

{| align="right"
| __TOC__
|}

The output of most shotgun sequence assembly programs (such as TIGR Assembler, phrap, or CAP3) consists of a set of unrelated contigs, whose order and orientation along the chromosome is unknown. Scaffolding represents the task of ordering and orienting these contigs by using additional information about their relative placement. Traditionally such information was identified from the pairing of reads from the opposite ends of an insert in double-barrelled shotgun experiments (see figure).

[[Image:scaffold image.gif]]

Besides clone mate information, homology data, physical maps, or gene synteny information can be used to derive relationships between contigs. BAMBUS is a program written to handle all such types of data in a generic fashion in order to build contig scaffolds. In order to account for the varied quality of linking information BAMBUS allows the user to specify a hierarchy that will consider the most reliable linking data first, then expand the scaffolds using less reliable data. This approach minimizes the effect of errors inherent to experimentally-derived linking information.

=== Algorithm overview ===

BAMBUS has two main modes of operation: hierarchical - when the links are considered in the order of their priorities, and standard - when all the links are considered at the same priority level. In the latter case, all links connecting two contigs are bundled together to form a "gap", or "edge" if you look at the problem from a graph theoretical point of view. Each link is checked for validity before being added to a bundle. Thus, a link is considered invalid due to a length constraint if it forces the two contigs to overlap. In other words, if we assume the largest possible size for the insert corresponding to the link, if the coordinates of the mate-pair within the contigs force the contigs to overlap, we discard the link assuming it is due to a misassembly (Note: this behaviour can be specified on a link-by-link basis). Links that pass this test are further checked for consistency. If they do not all agree in the relative orientations of the contigs they imply, a majority rule is used to retain only those links that agree both in orientation and length. Furthermore, the bundle, or edge, is retained if it contains at least two links (this parameter, redundancy, can actually be tuned by the user). After this step is complete, the resulting graph gets traversed twice, once to assign consistent orientations to all the contigs, and the second time to determine the order of the contigs along the chromosome. Note that inconsistencies in order are currently allowed since they can provide finishing teams with useful information. In a future version of BAMBUS we will add an option to allow the generation of unambiguous scaffolds as well. Currently you can generate such unambiguous scaffolds using the [[#Untangling scaffolds|untangle]] program.

== Installation ==

=== Prerequisites ===

* Install AMOS
* Perl 5.6 or later
* [http://www.cpan.org/ XML::Parser] perl module
* [http://www.cpan.org/ Config::IniFiles] perl module
* [http://www.research.att.com/sw/tools/graphviz/ GraphViz] package

=== Obtaining BAMBUS ===

BAMBUS 2.3 is available free of charge under the open-source Artistic License.

The Bambus source is freely available for download from the File Release Section of our SourceForge project page.

To receive information regarding new releases and developments, please subscribe to our moderated, low-traffic users' mailing list:

amos-users(at)lists(dot)sourceforge(dot)net

=== Actual installation ===

1. Find a directory where you want the software installed. For example /users/home/CoolUser

2. Place the tar file bambus-2.33.tar.gz in this directory

3. Unpack the distribution
% tar xvzf bambus-2.33.tar.gz

4. Go into the newly created directory
% cd bambus-2.33

5. Carefully read the documentation

6. Edit the file Makefile and change the value of BASEDIR to the correct installation path, for example:
BASEDIR = /users/home/CoolUser

7. Also check if the path for perl (in variable PERL) matches the one on your system. Several common options are:
PERL = /usr/local/bin/perl
PERL = /usr/bin/perl

8. Type 'gmake install' to have all the software installed. The executables will be in /users/home/CoolUser/bin/ and the documentation in /users/home/CoolUser/doc.

9. To get started you can try out the small test dataset provided in data/test_run by typing:

goBambus -c test.contig -m test.mates -o test-bambus

== Running BAMBUS ==

The input to Bambus consists of a set of links between contigs. These links can be inferred from the pairing of reads belonging to the same insert, from physical map data, or from alignment to another genome. To accommodate these types of sources of linking information, and to allow for further extensions, the input to Bambus is presented in a general purpose XML format.

The following sections describe how the XML file can be generated for each type of linking data. For more information see [[#.evidence.xml|.evidence.xml]].

Linking information is grouped into "libraries". All links within a library have similar parameters. In the case of mate-pair linking information the libraries match the standard definition. For other linking data, for example MUMmer links, the library grouping reflects specific characteristics of the linking data. For example all MUMmer links are grouped within a library called "MUMmer".

=== Command line options ===

Bambus consists in a collection of programs controlled by a script called goBambus. The execution is controlled by the following sets of parameters. To obtain a list of all parameters and basic help information you need to type:

goBambus -h
or
goBambus -help

==== Configuration options ====

* -C <conf_file> . This parameter specifies the configuration file used by Bambus. A configuration file is not required, however it is useful if you want to change the default parameters. See [[#The configuration file|the configuration file]] for a detailed description of the configuration information.

Whenever you run Bambus it will generate a default configuration file called default.conf. If you want to modify the parameters, you will need to rename this file and then edit it.

==== Input Options ====

* -a <asm_file> . Use a TIGR Assembler .asm file as source of linking information.
* -c <contig_file> . Use a GDE formatted .contig assembly file as source of linking information. The .contig files can be obtained either from TIGR Assembler output (by concatenating the contents of the .align directory) or by converting .ace files (created by phrap or Consed) using the [[ace2contig]] package.
* -x <xml_file>. Use additional XML formatted linking information. You can use the -x option multiple times. See [[#.evidence.xml|.evidence.xml]] for the XML file format.
* -mx <make_file>. Use a gmake compatible make file to specify how the additional XML information is generated. Each of the files created must end in ".xml". Moreover, only the final XML targets are allowed to end in ".xml" to prevent goBambus from getting confused. See [[#Using Makefiles|using Makefiles]] for a more detailed description on using makefiles.

==== Mate-pair Information ====

Assembly output (such as .asm and .contig files) does not generally contain any information about the pairing of reads coming from opposite ends of the same insert, nor about the membership of inserts to libraries. The following parameters specify how to obtain this mate-pair information.

* -m <mates_file> . The mates file allows you to specify both library information and mate pairing information, using a system based on Perl regular expressions. See [[#The .mates file|the .mates file]].
* -D <database> . At TIGR, instead of a mates file you can opt to use a database to obtain library and mate information. Note that the -m and -D options are complementary. The following three parameters specify database options:
* -S <server> . Select a specific database server
* -U <user> . Log in with the specified user name
* -P <passwd> . Log in with the specified password

==== Output Options ====

* -o <output_prefix> . All the output file names will be generated from the <output_prefix> prefix.

==== Repeat Screening ====

Misassembled repeats can confuse Bambus, therefore it makes sense to screen the known repeats that may be misassembled. Bambus can use two types of repeat screening information:

* -r . Screen against the repeats listed in the database (option -D must be provided)
* -r <repfile> . Screen against repeats described in [[#Repeat files|repeat files]].

==== Flow-control ====

* -start <num>
* -end <num>. The execution of Bambus consists in a set of steps. The user can choose to start or end at a specific step for example in the case when she modifies the configuration file and wants to re-run just the final steps of the scaffolders. This feature should be used with caution.

=== Typical use cases ===
==== Using assembler output ====

For convenience, Bambus can use the output of an assembly program and automatically generate the required linking information. Two types of information are required:

* the tiling of reads in the assembly
* information about the pairing of reads

The first type of information can be obtained from the .asm or .contig output formats of TIGR Assembler. For more information about these file formats please refer to the [http://www.jcvi.org/cms/publications/listing/abstract/article/tigr-assembler-a-new-tool-for-assembling-large-shotgun-sequencing-projects/ TIGR Assembler documentation]. Note that the .contig file may be stripped of all sequence data as the only information used is that contained in the lines starting with #. In case you are using an assembler other than TIGR Assembler you may need to use the ta2ace package to convert the output into the .contig format.

The second type of information can be obtained from a [[#The .mates file|.mates file]], or from the database in case you are at TIGR.

A typical command line for someone outside of TIGR is:

goBambus -c test.contig -m test.mates -o test

and the execution will generate the following files:

* [[#.stats file|test.stats]] - statistics on scaffolds and libraries
* [[#.details file|test.details]] - detailed information about each contig pair (also called "gap").
* [[#.dot file|test.dot]] - GraphViz formatted description of the contig linkage information.
* [[#.evidence.xml|test.evidence.xml]] - XML representation of all the linking evidence provided.
* [[#.out.xml|test.out.xml]] - XML representation of the scaffolds.
* [[#.ps file|test.lib]] - list of the codes associated with each input library. These codes are reported on the links in the output file.
* [[#.sum file|test.sum]] - one-line summaries of all scaffolds.
* [[#.oo file|test.oo]] - order and orientation information for all the contigs.

Additionally, the program generates some "working" files that are currently not removed as they provide useful debugging information.

* test.detective.xml - linking information derived from assembler input alone. It's the same as test.evidence.xml unless additional XML files are provided in the input.
* test.inp - input to the core scaffolding engine.
* test.grommit.conf - configuration information for the scaffolding engine

To view the graphical output you need to convert it to Postscript with the command:

dot -Tps -o test.ps test.dot

Then you can view the postscript file with the command:

gv test.ps

At TIGR, instead of specifying a .mates file you can use the database:

goBambus -c test.contig -D gbx -U access -P access -o test

==== Getting more (or less) information from the output ====

You can modify the information you get in the output by directly calling the printScaff command. PrintScaff requires as inputs the .evidence.xml file, the .out.xml file and the .lib files produced by Bambus, the minimal invocation being:

printScaff -e test.evidence.xml -s test.out.xml -l test.lib

Optionally you may specify the output prefix with option -o (just like in the Bambus invocation). PrintScaff has parameters that control what gets reported, and in what format. In the first category you have:

* -dot - produce a .dot file
* -detail - produce a .details file
* -oo - produce a [[#.oo file|.oo]] file listing all the contigs in each scaffold
* -sum - produce a [[#.sum file|.sum]] tab delimited list of scaffold stats (#contigs, size, and span)
* -f <fasta_file> - generate a pseudo-molecule for each scaffold using the contig sequences listed in <fasta_file>.
note that the contigs in the scaffold files are named "contig_<id>" while the contigs in the <fasta_file> must simply be called "<id>". Note that this option together with the default -merge option (see below) is meaningless unless you've [[#Untangling scaffolds|untangled]] the scaffold.
* -phys - create a .phys file listing all "gaps" spanned only by the specified libraries:

printScaff -e test.evidence.xml -s test.out.xml -l test.lib -phys MUMmer

will generate a file called test.phys that contains all contig pairs linked by nothing but MUMmer links.

In the second category you have:

* -page - produce .dot file formatted for printing on 8.5x11" paper
* -plot - produce .dot file formatted for printing on a plotter (36x48" paper)
* -unused - draw edges corresponding to unused links
* -merge - (default) when the -f option is given, produce a pseudo-molecule for each scaffold by adding 60 N characters between the contigs in the scaffold
* -nomerge - when the -f option is given, create a fasta file containing all contigs in each scaffold in the correct orientation

==== Adding additional XML linking information ====

You can easily add additional linking information as long as it's in a format similar to the [[#.evidence.xml|.evidence.xml file]]. Assuming we have two such files "link1.xml" and "link2.xml" you can call Bambus as follows:

goBambus -c test.contig -m test.mates -x link1.xml -x link2.xml -o test

The result being the same as if you concatenated the XML file inferred from the .contig file together with the two additional XML files.

==== Using MUMmer links ====

In case you are sequencing a genome for which a close relative has been completed, you can use [[MUMmer]] to infer links between contigs. Assume you have the reference genome in a file "ref.fasta" and all the contigs from the genome you are assembling in a file called "test.fasta", you can run MUMmer to align the test genome to the reference:

nucmer -maxmatch ref.fasta test.fasta

The output will be placed in a file called out.delta. You can then convert the alignment information into an XML file suitable for Bambus as follows:

show-tiling -x out.delta > test.mum.xml

And finally you can provide this information to Bambus:

goBambus -c test.contig -m test.mates -x test.mum.xml -o test

==== Using Makefiles ====

If you have multiple XML files, it becomes tedious to generate each file separately and then provide them to Bambus with multiple -x options. You can create a Makefile compatible with GNU make that generates each of the XML files. You must be careful that the targets for all the XML files end in ".xml", and no other targets (such as intermediate targets) have the same suffix. Bambus will assume that all .xml files produced by the Makefile must be added to the input.

An example of a simple Makefile that can generate the mummer information described above is:

test.mum.xml: ref.fasta test.fasta
nucmer -maxmatch ref.fasta test.fasta
show-tiling -x out.delta > test.mum.xml

==== Untangling scaffolds ====

By default, the scaffolds resulting from Bambus are potentially ambiguous as two or more contigs may occupy the same place in the genome. Such situations occur either due to misassembled repeats, or when assembling different haplotypes. Bambus contains a utility that does a best effort attempt to disambiguate the scaffolds, by breaking them into a set of non-ambiguous scaffolds. The algorithm is greedy and does not guarantee that an optimal solution (e.g. a minimum number of longest scaffolds) is produced.

To run the untangler type:

untangle -e test.evidence.xml -s test.out.xml -o test.untangle.xml

The test.untangle.xml file has the same format as the output file ([[#.out.xml|.out.xml]]) and for each scaffold (e.g. scaff_1) contains one or more unambiguous scaffolds called "scaff_1_1, scaff_1_2, etc". You can use the [[#Getting more (or less) information from the output|printScaff]] command to process the untangled file the same way you would process a normal Bambus output.

== File formats ==

=== The configuration file ===

Scaffolding parameters can be specified in a configuration file. Whenever you run Bambus a default configuration file is generated as default.conf. To create your own file you should rename this file then edit it. The following types of information can be specified:

* Comments. Lines starting with # are considered comments and are ignored by the software.

* Priority information. Specifies an order in which the libraries are considered. For example, the following line specifies priority 2 for all MUMmer links. The libraries are considered in the increasing order of priorities.

priority MUMmer 2

If no configuration file is provided, all libraries are assumed to have the same priority. If a configuration file is provided, only those libraries that have a priority record are used in scaffolding. In other words, if a library has no priority record associated with it, it will not be used in scaffolding.

A priority can also be specified for a specific redundancy parameter. For example, the following line specifies that those links between contigs supported by 4 or more sources of linking data should be processed at priority level 1.

priority redundancy 4 1

Note, however, that you need to also specify a set of libraries to be used at this priority level. A simple shortcut is specifying that all link classes should be processed at this redundancy.

priority ALL 1

If you are uncertain of which libraries your project uses, run Bambus without a configuration file. The libraries will be listed in a file ending in ".lib".

* Redundancy information. Specifies the minimum number of links required for a valid inter-contig link. You can specify this parameter as a global value:

redundancy 2

that is, all contigs must be linked by at least two links. You can also specify a per-library value like below which says a single MUMmer link is required to link two contigs.

redundancy MUMmer 1

* Minimum scaffold size. Specifies a scaffold size cutoff. For example, the following line implies that only scaffolds that contain more than 10kbp are present in the output:

mingroupsize 10000

* Link size error. For some link types, determining the exact link length can only be done within a certain error. This parameter specifies, as a percentage, the estimated error in size determination. For example, since MUMmer links make an assumption of evolutionary closeness between two genomes, the error associated with them should be proportional to the estimated evolutionary distance (as nucleotide % similarity) between the two genomes. For an error of 5% use this parameter:

error MUMmer 0.05

* Overlapping contigs allowed. For each library you can control the link validation process by specifying whether adjacent contigs are allowed to overlap or not. By default mate-pair links assume adjacent contigs do not overlap, while MUMmer links allow contigs to overlap. This is equivalent to the configuration parameters:

overlaps lib_1 N
overlaps MUMmer Y

An example is provided in [[bambus.conf|data/sample_files/bambus.conf]].

=== The .mates file ===

The .mates file provides two types of information: library data, and mate-pair relationships between reads.

Library data can be described in two formats:
library <name> <min_size> <max_size>

and
library <name> <min_size> <max_size> <regexp>

Both formats require you to name each library and to provide a size range for the inserts belonging to it. The second format allows you to also provide a Perl regular expression that describes the naming convention for reads belonging to the library. The part of the sequence name that represents the library name must be placed within parentheses. As an example, the regular expression for TIGR sequences (where the library is specified by the first 4 characters) is:
(....).*

Mate-pair relationships can also be described in two ways:
pair <regexp_forw> <regexp_rev>

or
<seq_forw_1> <seq_rev_1> <library_name>
<seq_forw_2> <seq_rev_2> <library_name>
<seq_forw_3> <seq_rev_3> <library_name>
...

The first format requires two regular expressions corresponding to the forward and reverse mates of an insert. Just like the library record, the portion of the name corresponding to the insert name must be placed within parentheses. Two reads that match the two regular expressions, and have the exact same section matched within the parentheses will be considered mates. The library will be determined from the regular expression associated with a "library" record. As an example, at TIGR the first 7 characters represent the insert, then are followed by an optional T or P and the primer name (F or R for Forward or Reverse). The corresponding regular expression is:
pair (.......)[TP]?F (.......)[TP]?R

The second format is simply a list of sequence name pairs each followed by the name of the library they belong to. The corresponding library records must be listed in the file before the pairing data.

'''Note that fields in the .mates file must be separated by TAB characters otherwise the program will report an error.'''

An example of a .mates file using the phred/phrap naming convention is provided in [[bambus.mates|data/sample_files/bambus.mates]].

=== Repeat files ===

The repeat files used in screening linking data conform to the output standards of the repeatFinder program. The file contains 5 TAB-delimited values: contig ID, repeat name, left and right coordinates within the contig, and contig class. Only the first 4 values are essential as far as Bambus is concerned. Here is an example of repeat records:

AC009139.7.3 RPT1A 14554 96820 1
AC026498.3.2 RPT1B 124208 206475 1

=== .evidence.xml ===

This provides an XML representation of the input file. Below is a description of the data presented in this file (see also an [[bambus.evidence.xml|example]]):

The overall flow of the XML file is:

<EVIDENCE>
<LIBRARY>
<INSERT>
<SEQUENCE/>
<SEQUENCE/>
</INSERT>
...
</LIBRARY>
...
<CONTIG>
<SEQUENCE/>
...
</CONTIG>
...
<LINK>
<CONTIG/>
<CONTIG/>
</LINK>
...
</EVIDENCE>

<EVIDENCE this is the parent tag - the whole document occurs within <EVIDENCE> and </EVIDENCE>
ID = "1" some identifier for the file
DATE = "12/15/03" date when file was created
PROJECT = "MyProject" verbose description of the project
PARAMETERS = "" parameters used in creating this file
>

<LIBRARY for each shotgun library you need a separate library tag.
ID = "lib_1" some identifier for the library
NAME = "short" verbose name for the library
MIN = "1200" minimum size of inserts in this library
MAX = "3500" maximum size of inserts in this library
>

<INSERT the insert concept links together the two reads obtained from opposite ends of a clone insert
ID="ins_1" generic identifier for the insert
NAME="GALBZ92" verbose name for the insert
>

<SEQUENCE each insert contains two sequence reads obtained from the opposite ends
ID="seq_1" generic identifier for the sequence
NAME="GALBZ92TF" verbose name for the sequence
>

<CONTIG Each contig in the assembly needs to be represented in the file
ID="contig_1" contig identifier
NAME= "1" contig name (usually the same as the identifier)
LEN="12352" contig length in basepairs
>

<SEQUENCE each sequence in the contig is listed together with information about the position within the contig.
ID="seq_1" sequence identifier (must match one described in the library section)
ORI="BE" orientation of sequence: BE - forward, EB - reverse
ASM_LEND="0" coordinate of sequence's left end within the contig
ASM_REND="525" coordinate of sequence's right end within the contig
>

<LINK this is a generic link between two contigs (i.e. not inferred from mate-pair data)
ID = "link_1" link identifier
SIZE = "-800" size of gap between the two contigs
TYPE = "MUMmer" link type: links with the same type get grouped into a virtual "library" used in specifying priorities
>

<CONTIG each link contains two contigs
ID = "contig_1" contig identifier: must match one described in the contig section above
ORI = "EB" contig orientation: BE - forward, EB - reverse
> each contig record may contain free-form data specifying the evidence for linking: e.g. alignment data

=== .out.xml ===

Represents, together with the evidence file, a description of the layout of the contigs. For each scaffold, the order (given as a coordinate along a chromosome) and the orientation of each contig is presented, together with a list of all the links used to generate this layout. Some links are deemed invalid, being given a code of "LEN" in case the length was deemed incorrect, or "ORI" in case the link orientation was considered incorrect. This file is also specified by the [[bambus.dtd|DTD]] and an example ([[bambus.out.xml|data/sample_files/bambus.out.xml]]).

=== .stats file ===

Finally, BAMBUS outputs a summary of the scaffolds generated. This file is pretty much self-explanatory, as evidenced by this example ([[bambus.stats|data/sample_files/bambus.stats]]). Note that in the .stats file, the N50 sizes are computed with respect to the total span of the scaffolds unless a different genome size is specified in a file called genome.size.

=== .details file ===

The .details file contains detailed information about the linking information between adjacent contigs. The contigs' orientation, size, and coordinates are listed and then all the linking data grouped by validity and library. An example is provided in [[bambus.details|data/sample_files/bambus.details]].

=== .dot file ===

Represents a graphical representation of the scaffolds in GraphViz format. Please see the [http://www.research.att.com/sw/tools/graphviz/ AT&T GraphViz] website for more information on this file format.

=== .ps file ===

A postscript image generated from the GraphViz-formatted file. It can be obtained from the .dot file with the command:
dot -Tps -o prefix.ps prefix.dot

An example is given in the picture below.
[[Image:bambus-display.jpg]]

Each scaffold is placed in a box, labeled with some statistics on the scaffold size (number of contigs, number of bases and span). All the "gaps" - linking relationships between the contigs are represented as edges. Each edge is decorated with the number of links contributing to it, the number of links from each library type, and the number of links invalidated due to incorrect length (L) or orientation (O). Each contig's ID, size, and coordinates within the scaffold are also listed.

=== .oo file ===

This file contains a summary of the order and orientation of all contigs present in the data. Each scaffold starts with a FASTA-like header containing the identifier for the scaffold followed by the number of contigs, size and span. Within each scaffold, all contigs are listed in the scaffold order, followed by the string BE for those in the forward orientation and EB for those in the reverse orientation. An example file is shown in: [[bambus.oo|data/sample_files/bambus.oo]].

=== .sum file ===

This file contains one line for each scaffold. Each line contains the scaffold ID, number of contigs, scaffold size, and span. The four values are separated by TAB characters. See an example in [[bambus.sum|data/sample_files/bambus.sum]].

== Known problems ==
There is a small "off-by-one" error in computing contig coordinates. It should not really affect the usefulness of the output.

== Contact information ==

BAMBUS is currently provided AS-IS, in other words we do not provide any support for the software. We would, however, like to hear your comments and suggestions. For Bambus bug reports, support requests, or any other inquiries please browse our SourceForge project page or Email us at:

amos-help (at) lists (dot) sourceforge (dot) net

BAMBUS was written by Mihai Pop and Dan Kosack.

Bambus Manual

2010-12-16T15:03:40Z

Dmb000006: /* The .mates file */ Putting tabs in the examples instead of spaces. Changing formatting slightly and emphasising the phrase about tabs. fixed a few typos.

{| align="right"
| __TOC__
|}

The output of most shotgun sequence assembly programs (such as TIGR Assembler, phrap, or CAP3) consists in a set of unrelated contigs, whose order and orientation along the chromosome is unknown. Scaffolding represents the task of ordering and orienting these contigs by using additional information about their relative placement. Traditionally such information was identified from the pairing of reads from the opposite ends of an insert in double-barrelled shotgun experiments (see figure).

[[Image:scaffold image.gif]]

Besides clone mate information, homology data, physical maps, or gene synteny information can be used to derive relationships between contigs. BAMBUS is a program written to handle all such types of data in a generic fashion in order to build contig scaffolds. In order to account for the varied quality of linking information BAMBUS allows the user to specify a hierarchy that will consider the most reliable linking data first, then expand the scaffolds using less reliable data. This approach minimizes the effect of errors inherent to experimentally-derived linking information.

=== Algorithm overview ===

BAMBUS has two main modes of operation: hierarchical - when the links are considered in the order of their priorities, and standard - when all the links are considered at the same priority level. In the latter case, all links connecting two contigs are bundled together to form a "gap", or "edge" if you look at the problem from a graph theoretical point of view. Each link is checked for validity before being added to a bundle. Thus, a link is considered invalid due to a length constraint if it forces the two contigs to overlap. In other words, if we assume the largest possible size for the insert corresponding to the link, if the coordinates of the mate-pair within the contigs force the contigs to overlap, we discard the link assuming it is due to a misassembly (Note: this behaviour can be specified on a link-by-link basis). Links that pass this test are further checked for consistency. If they do not all agree in the relative orientations of the contigs they imply, a majority rule is used to retain only those links that agree both in orientation and length. Furthermore, the bundle, or edge, is retained if it contains at least two links (this parameter, redundancy, can actually be tuned by the user). After this step is complete, the resulting graph gets traversed twice, once to assign consistent orientations to all the contigs, and the second time to determine the order of the contigs along the chromosome. Note that inconsistencies in order are currently allowed since they can provide finishing teams with useful information. In a future version of BAMBUS we will add an option to allow the generation of unambiguous scaffolds as well. Currently you can generate such unambiguous scaffolds using the [[#Untangling scaffolds|untangle]] program.

== Installation ==

=== Prerequisites ===

* Install AMOS
* Perl 5.6 or later
* [http://www.cpan.org/ XML::Parser] perl module
* [http://www.cpan.org/ Config::IniFiles] perl module
* [http://www.research.att.com/sw/tools/graphviz/ GraphViz] package

=== Obtaining BAMBUS ===

BAMBUS 2.3 is available free of charge under the open-source Artistic License.

The Bambus source is freely available for download from the File Release Section of our SourceForge project page.

To receive information regarding new releases and developments, please subscribe to our moderated, low-traffic users' mailing list:

amos-users(at)lists(dot)sourceforge(dot)net

=== Actual installation ===

1. Find a directory where you want the software installed. For example /users/home/CoolUser

2. Place the tar file bambus-2.33.tar.gz in this directory

3. Unpack the distribution
% tar xvzf bambus-2.33.tar.gz

4. Go into the newly created directory
% cd bambus-2.33

5. Carefully read the documentation

6. Edit the file Makefile and change the value of BASEDIR to the correct installation path, for example:
BASEDIR = /users/home/CoolUser

7. Also check if the path for perl (in variable PERL) matches the one on your system. Several common options are:
PERL = /usr/local/bin/perl
PERL = /usr/bin/perl

8. Type 'gmake install' to have all the software installed. The executables will be in /users/home/CoolUser/bin/ and the documentation in /users/home/CoolUser/doc.

9. To get started you can try out the small test dataset provided in data/test_run by typing:

goBambus -c test.contig -m test.mates -o test-bambus

== Running BAMBUS ==

The input to Bambus consists in a set of links between contigs. These links can be inferred from the pairing of reads belonging to the same insert, from physical map data, or from alignment to another genome. To accommodate these types of sources of linking information, and to allow for further extensions, the input to Bambus is presented in a general purpose XML format.

The following sections describe how the XML file can be generated for each type of linking data. For more information see [[#.evidence.xml|.evidence.xml]].

Linking information is grouped into "libraries". All links within a library have similar parameters. In the case of mate-pair linking information the libraries match the standard definition. For other linking data, for example MUMmer links, the library grouping reflects specific characteristics of the linking data. For example all MUMmer links are grouped within a library called "MUMmer".

=== Command line options ===

Bambus consists in a collection of programs controlled by a script called goBambus. The execution is controlled by the following sets of parameters. To obtain a list of all parameters and basic help information you need to type:

goBambus -h
or
goBambus -help

==== Configuration options ====

* -C <conf_file> . This parameter specifies the configuration file used by Bambus. A configuration file is not required, however it is useful if you want to change the default parameters. See [[#The configuration file|the configuration file]] for a detailed description of the configuration information.

Whenever you run Bambus it will generate a default configuration file called default.conf. If you want to modify the parameters, you will need to rename this file and then edit it.

==== Input Options ====

* -a <asm_file> . Use a TIGR Assembler .asm file as source of linking information.
* -c <contig_file> . Use a GDE formatted .contig assembly file as source of linking information. The .contig files can be obtained either from TIGR Assembler output (by concatenating the contents of the .align directory) or by converting .ace files (created by phrap or Consed) using the [[ace2contig]] package.
* -x <xml_file>. Use additional XML formatted linking information. You can use the -x option multiple times. See [[#.evidence.xml|.evidence.xml]] for the XML file format.
* -mx <make_file>. Use a gmake compatible make file to specify how the additional XML information is generated. Each of the files created must end in ".xml". Moreover, only the final XML targets are allowed to end in ".xml" to prevent goBambus from getting confused. See [[#Using Makefiles|using Makefiles]] for a more detailed description on using makefiles.

==== Mate-pair Information ====

Assembly output (such as .asm and .contig files) does not generally contain any information about the pairing of reads coming from opposite ends of the same insert, nor about the membership of inserts to libraries. The following parameters specify how to obtain this mate-pair information.

* -m <mates_file> . The mates file allows you to specify both library information and mate pairing information, using a system based on Perl regular expressions. See [[#The .mates file|the .mates file]].
* -D <database> . At TIGR, instead of a mates file you can opt to use a database to obtain library and mate information. Note that the -m and -D options are complementary. The following three parameters specify database options:
* -S <server> . Select a specific database server
* -U <user> . Log in with the specified user name
* -P <passwd> . Log in with the specified password

==== Output Options ====

* -o <output_prefix> . All the output file names will be generated from the <output_prefix> prefix.

==== Repeat Screening ====

Misassembled repeats can confuse Bambus, therefore it makes sense to screen the known repeats that may be misassembled. Bambus can use two types of repeat screening information:

* -r . Screen against the repeats listed in the database (option -D must be provided)
* -r <repfile> . Screen against repeats described in [[#Repeat files|repeat files]].

==== Flow-control ====

* -start <num>
* -end <num>. The execution of Bambus consists in a set of steps. The user can choose to start or end at a specific step for example in the case when she modifies the configuration file and wants to re-run just the final steps of the scaffolders. This feature should be used with caution.

=== Typical use cases ===
==== Using assembler output ====

For convenience, Bambus can use the output of an assembly program and automatically generate the required linking information. Two types of information are required:

* the tiling of reads in the assembly
* information about the pairing of reads

The first type of information can be obtained from the .asm or .contig output formats of TIGR Assembler. For more information about these file formats please refer to the [http://www.jcvi.org/cms/publications/listing/abstract/article/tigr-assembler-a-new-tool-for-assembling-large-shotgun-sequencing-projects/ TIGR Assembler documentation]. Note that the .contig file may be stripped of all sequence data as the only information used is that contained in the lines starting with #. In case you are using an assembler other than TIGR Assembler you may need to use the ta2ace package to convert the output into the .contig format.

The second type of information can be obtained from a [[#The .mates file|.mates file]], or from the database in case you are at TIGR.

A typical command line for someone outside of TIGR is:

goBambus -c test.contig -m test.mates -o test

and the execution will generate the following files:

* [[#.stats file|test.stats]] - statistics on scaffolds and libraries
* [[#.details file|test.details]] - detailed information about each contig pair (also called "gap").
* [[#.dot file|test.dot]] - GraphViz formatted description of the contig linkage information.
* [[#.evidence.xml|test.evidence.xml]] - XML representation of all the linking evidence provided.
* [[#.out.xml|test.out.xml]] - XML representation of the scaffolds.
* [[#.ps file|test.lib]] - list of the codes associated with each input library. These codes are reported on the links in the output file.
* [[#.sum file|test.sum]] - one-line summaries of all scaffolds.
* [[#.oo file|test.oo]] - order and orientation information for all the contigs.

Additionally, the program generates some "working" files that are currently not removed as they provide useful debugging information.

* test.detective.xml - linking information derived from assembler input alone. It's the same as test.evidence.xml unless additional XML files are provided in the input.
* test.inp - input to the core scaffolding engine.
* test.grommit.conf - configuration information for the scaffolding engine

To view the graphical output you need to convert it to Postscript with the command:

dot -Tps -o test.ps test.dot

Then you can view the postscript file with the command:

gv test.ps

At TIGR, instead of specifying a .mates file you can use the database:

goBambus -c test.contig -D gbx -U access -P access -o test

==== Getting more (or less) information from the output ====

You can modify the information you get in the output by directly calling the printScaff command. PrintScaff requires as inputs the .evidence.xml file, the .out.xml file and the .lib file produced by Bambus, the minimal invocation being:

printScaff -e test.evidence.xml -s test.out.xml -l test.lib

Optionally you may specify the output prefix with option -o (just like in the Bambus invocation). PrintScaff has parameters that control what gets reported, and in what format. In the first category you have:

* -dot - produce a .dot file
* -detail - produce a .details file
* -oo - produce a [[#.oo file|.oo]] file listing all the contigs in each scaffold
* -sum - produce a [[#.sum file|.sum]] tab delimited list of scaffold stats (#contigs, size, and span)
* -f <fasta_file> - generate a pseudo-molecule for each scaffold using the contig sequences listed in <fasta_file>.
Note that the contigs in the scaffold files are named "contig_<id>" while the contigs in the <fasta_file> must simply be called "<id>". Note also that this option together with the default -merge option (see below) is meaningless unless you've [[#Untangling scaffolds|untangled]] the scaffold.
* -phys - create a .phys file listing all "gaps" spanned only by the specified libraries:

printScaff -e test.evidence.xml -s test.out.xml -l test.lib -phys MUMmer

will generate a file called test.phys that contains all contig pairs linked by nothing but MUMmer links.

In the second category you have:

* -page - produce .dot file formatted for printing on 8.5x11" paper
* -plot - produce .dot file formatted for printing on a plotter (36x48" paper)
* -unused - draw edges corresponding to unused links
* -merge - (default) when the -f option is given, produce a pseudo-molecule for each scaffold by adding 60 N characters between the contigs in the scaffold
* -nomerge - when the -f option is given, create a fasta file containing all contigs in each scaffold in the correct orientation

==== Adding additional XML linking information ====

You can easily add additional linking information as long as it's in a format similar to the [[#.evidence.xml|.evidence.xml file]]. Assuming we have two such files "link1.xml" and "link2.xml" you can call Bambus as follows:

goBambus -c test.contig -m test.mates -x link1.xml -x link2.xml -o test

The result is the same as if you had concatenated the XML file inferred from the .contig file together with the two additional XML files.

==== Using MUMmer links ====

In case you are sequencing a genome for which a close relative has been completed, you can use [[MUMmer]] to infer links between contigs. Assuming you have the reference genome in a file "ref.fasta" and all the contigs from the genome you are assembling in a file called "test.fasta", you can run MUMmer to align the test genome to the reference:

nucmer -maxmatch ref.fasta test.fasta

The output will be placed in a file called out.delta. You can then convert the alignment information into an XML file suitable for Bambus as follows:

show-tiling -x out.delta > test.mum.xml

And finally you can provide this information to Bambus:

goBambus -c test.contig -m test.mates -x test.mum.xml -o test

==== Using Makefiles ====

If you have multiple XML files, it becomes tedious to generate each file separately and then provide them to Bambus with multiple -x options. You can create a Makefile compatible with GNU make that generates each of the XML files. You must be careful that the targets for all the XML files end in ".xml", and no other targets (such as intermediate targets) have the same suffix. Bambus will assume that all .xml files produced by the Makefile must be added to the input.

An example of a simple Makefile that can generate the mummer information described above is:

test.mum.xml: ref.fasta test.fasta
nucmer -maxmatch ref.fasta test.fasta
show-tiling -x out.delta > test.mum.xml

==== Untangling scaffolds ====

By default, the scaffolds resulting from Bambus are potentially ambiguous as two or more contigs may occupy the same place in the genome. Such situations occur either due to misassembled repeats, or when assembling different haplotypes. Bambus contains a utility that does a best effort attempt to disambiguate the scaffolds, by breaking them into a set of non-ambiguous scaffolds. The algorithm is greedy and does not guarantee that an optimal solution (e.g. a minimum number of longest scaffolds) is produced.

To run the untangler type:

untangle -e test.evidence.xml -s test.out.xml -o test.untangle.xml

The test.untangle.xml file has the same format as the output file ([[#.out.xml|.out.xml]]) and for each scaffold (e.g. scaff_1) contains one or more unambiguous scaffolds called "scaff_1_1, scaff_1_2, etc". You can use the [[#Getting more (or less) information from the output|printScaff]] command to process the untangled file the same way you would process a normal Bambus output.

== File formats ==

=== The configuration file ===

Scaffolding parameters can be specified in a configuration file. Whenever you run Bambus a default configuration file is generated as default.conf. To create your own file you should rename this file then edit it. The following types of information can be specified:

* Comments. Lines starting with # are considered comments and are ignored by the software.

* Priority information. Specifies an order in which the libraries are considered. For example, the following line specifies priority 2 for all MUMmer links. The libraries are considered in the increasing order of priorities.

priority MUMmer 2

If no configuration file is provided, all libraries are assumed to have the same priority. If a configuration file is provided, only those libraries that have a priority record are used in scaffolding. In other words, if a library has no priority record associated with it, it will not be used in scaffolding.

A priority can also be specified for a specific redundancy parameter. For example, the following line specifies that those links between contigs supported by 4 or more sources of linking data should be processed at priority level 1.

priority redundancy 4 1

Note, however, that you need to also specify a set of libraries to be used at this priority level. A simple shortcut is specifying that all link classes should be processed at this redundancy.

priority ALL 1

If you are uncertain of which libraries your project uses, run Bambus without a configuration file. The libraries will be listed in a file ending in ".lib".

* Redundancy information. Specifies the minimum number of links required for a valid inter-contig link. You can specify this parameter as a global value:

redundancy 2

that is, all contigs must be linked by at least two links. You can also specify a per-library value like below which says a single MUMmer link is required to link two contigs.

redundancy MUMmer 1

* Minimum scaffold size. Specifies a scaffold size cutoff. For example, the following line implies that only scaffolds that contain more than 10 kbp are present in the output:

mingroupsize 10000

* Link size error. For some link types, determining the exact link length can only be done within a certain error. This parameter specifies, as a percentage, the estimated error in size determination. For example, since MUMmer links make an assumption of evolutionary closeness between two genomes, the error associated with them should be proportional to the estimated evolutionary distance (as nucleotide % similarity) between the two genomes. For an error of 5% use this parameter:

error MUMmer 0.05

* Overlapping contigs allowed. For each library you can control the link validation process by specifying whether adjacent contigs are allowed to overlap or not. By default mate-pair links assume adjacent contigs do not overlap, while MUMmer links allow contigs to overlap. This is equivalent to the configuration parameters:

overlaps lib_1 N
overlaps MUMmer Y

An example is provided in [[bambus.conf|data/sample_files/bambus.conf]].

=== The .mates file ===

The .mates file provides two types of information: library data, and mate-pair relationships between reads.

Library data can be described in two formats:
library <name> <min_size> <max_size>

and
library <name> <min_size> <max_size> <regexp>

Both formats require you to name each library and to provide a size range for the inserts belonging to it. The second format allows you to also provide a Perl regular expression that describes the naming convention for reads belonging to the library. The part of the sequence name that represents the library name must be placed within parentheses. As an example, the regular expression for TIGR sequences (where the library is specified by the first 4 characters) is:
(....).*

Mate-pair relationships can also be described in two ways:
pair <regexp_forw> <regexp_rev>

or
<seq_forw_1> <seq_rev_1> <library_name>
<seq_forw_2> <seq_rev_2> <library_name>
<seq_forw_3> <seq_rev_3> <library_name>
...

The first format requires two regular expressions corresponding to the forward and reverse mates of an insert. Just like the library record, the portion of the name corresponding to the insert name must be placed within parentheses. Two reads that match the two regular expressions, and have the exact same section matched within the parentheses will be considered mates. The library will be determined from the regular expression associated with a "library" record. As an example, at TIGR the first 7 characters represent the insert, then are followed by an optional T or P and the primer name (F or R for Forward or Reverse). The corresponding regular expression is:
pair (.......)[TP]?F (.......)[TP]?R

The second format is simply a list of sequence name pairs each followed by the name of the library they belong to. The corresponding library records must be listed in the file before the pairing data.

'''Note that fields in the .mates file must be separated by TAB characters otherwise the program will report an error.'''

An example of a .mates file using the phred/phrap naming convention is provided in [[bambus.mates|data/sample_files/bambus.mates]].

=== Repeat files ===

The repeat files used in screening linking data conform to the output standards of the repeatFinder program. The file contains 5 TAB-delimited values: contig ID, repeat name, left and right coordinates within the contig, and contig class. Only the first 4 values are essential as far as Bambus is concerned. Here is an example of repeat records:

AC009139.7.3 RPT1A 14554 96820 1
AC026498.3.2 RPT1B 124208 206475 1

=== .evidence.xml ===

This provides an XML representation of the input file. Below is a description of the data presented in this file (see also an [[bambus.evidence.xml|example]]):

The overall flow of the XML file is:

<EVIDENCE>
<LIBRARY>
<INSERT>
<SEQUENCE/>
<SEQUENCE/>
</INSERT>
...
</LIBRARY>
...
<CONTIG>
<SEQUENCE/>
...
</CONTIG>
...
<LINK>
<CONTIG/>
<CONTIG/>
</LINK>
...
</EVIDENCE>

<EVIDENCE this is the parent tag - the whole document occurs within <EVIDENCE> and </EVIDENCE>
ID = "1" some identifier for the file
DATE = "12/15/03" date when file was created
PROJECT = "MyProject" verbose description of the project
PARAMETERS = "" parameters used in creating this file
>

<LIBRARY for each shotgun library you need a separate library tag.
ID = "lib_1" some identifier for the library
NAME = "short" verbose name for the library
MIN = "1200" minimum size of inserts in this library
MAX = "3500" maximum size of inserts in this library
>

<INSERT the insert concept links together the two reads obtained from opposite ends of a clone insert
ID="ins_1" generic identifier for the insert
NAME="GALBZ92" verbose name for the insert
>

<SEQUENCE each insert contains two sequence reads obtained from the opposite ends
ID="seq_1" generic identifier for the sequence
NAME="GALBZ92TF" verbose name for the sequence
>

<CONTIG Each contig in the assembly needs to be represented in the file
ID="contig_1" contig identifier
NAME= "1" contig name (usually the same as the identifier)
LEN="12352" contig length in basepairs
>

<SEQUENCE each sequence in the contig is listed together with information about the position within the contig.
ID="seq_1" sequence identifier (must match one described in the library section)
ORI="BE" orientation of sequence: BE - forward, EB - reverse
ASM_LEND="0" coordinate of sequence's left end within the contig
ASM_REND="525" coordinate of sequence's right end within the contig
>

<LINK this is a generic link between two contigs (i.e. not inferred from mate-pair data)
ID = "link_1" link identifier
SIZE = "-800" size of gap between the two contigs
TYPE = "MUMmer" link type: links with the same type get grouped into a virtual "library" used in specifying priorities
>

<CONTIG each link contains two contigs
ID = "contig_1" contig identifier: must match one described in the contig section above
ORI = "EB" contig orientation: BE - forward, EB - reverse
> each contig record may contain free-form data specifying the evidence for linking: e.g. alignment data

=== .out.xml ===

Represents, together with the evidence file, a description of the layout of the contigs. For each scaffold, the order (given as a coordinate along a chromosome) and the orientation of each contig is presented, together with a list of all the links used to generate this layout. Some links are deemed invalid, being given a code of "LEN" in case the length was deemed incorrect, or "ORI" in case the link orientation was considered incorrect. The file format is specified by the [[bambus.dtd|DTD]]; an example is provided in [[bambus.out.xml|data/sample_files/bambus.out.xml]].

=== .stats file ===

Finally, BAMBUS outputs a summary of the scaffolds generated. This file is pretty much self-explanatory, as evidenced by this example ([[bambus.stats|data/sample_files/bambus.stats]]). Note that in the .stats file, the N50 sizes are computed with respect to the total span of the scaffolds unless a different genome size is specified in a file called genome.size.

=== .details file ===

The .details file contains detailed information about the linking information between adjacent contigs. The contigs' orientation, size, and coordinates are listed and then all the linking data grouped by validity and library. An example is provided in [[bambus.details|data/sample_files/bambus.details]].

=== .dot file ===

Contains a graphical representation of the scaffolds in GraphViz format. Please see the [http://www.research.att.com/sw/tools/graphviz/ AT&T GraphViz] website for more information on this file format.

=== .ps file ===

A postscript image generated from the GraphViz-formatted file. It can be obtained from the .dot file with the command:
dot -Tps -o prefix.ps prefix.dot

An example is given in the picture below.
[[Image:bambus-display.jpg]]

Each scaffold is placed in a box, labeled with some statistics on the scaffold size (number of contigs, number of bases and span). All the "gaps" - linking relationships between the contigs are represented as edges. Each edge is decorated with the number of links contributing to it, the number of links from each library type, and the number of links invalidated due to incorrect length (L) or orientation (O). Each contig's ID, size, and coordinates within the scaffold are also listed.

=== .oo file ===

This file contains a summary of the order and orientation of all contigs present in the data. Each scaffold starts with a FASTA-like header containing the identifier for the scaffold followed by the number of contigs, size and span. Within each scaffold, all contigs are listed in the scaffold order, followed by the string BE for those in the forward orientation and EB for those in the reverse orientation. An example file is shown in: [[bambus.oo|data/sample_files/bambus.oo]].

=== .sum file ===

This file contains one line for each scaffold. Each line contains the scaffold ID, number of contigs, scaffold size, and span. The four values are separated by TAB characters. See an example in [[bambus.sum|data/sample_files/bambus.sum]].

== Known problems ==
There is a small "off-by-one" error in computing contig coordinates. It should not really affect the usefulness of the output.

== Contact information ==

BAMBUS is currently provided AS-IS, in other words we do not provide any support for the software. We would, however, like to hear your comments and suggestions. For Bambus bug reports, support requests, or any other inquiries please browse our SourceForge project page or Email us at:

amos-help (at) lists (dot) sourceforge (dot) net

BAMBUS was written by Mihai Pop and Dan Kosack.

AMOS Getting Started

2010-12-16T14:04:27Z

Dmb000006: Adding this header to the download section should make the equivelent section in the installation section more visible... I guess...

{{TOC}}

"Is AMOS an assembler?" is one of the first questions we are asked. The short answer is no. AMOS is not an assembler, but rather a software infrastructure for developing assembly tools. If you are only interested in running an off-the-shelf assembler on your shotgun data, do not despair: AMOS provides two such assemblers: AMOScmp - a comparative assembler; and Minimus - a basic assembler for small datasets. However it is important to realize that, with a little bit of programming, you can use AMOS to put together your own shotgun assembler customized for the specific characteristics of your data.

This page will provide you with the basic information needed to get started using AMOS. Advanced AMOS users can go directly to in-depth resources from the main page [[AMOS]].

== Downloading AMOS ==
AMOS can be downloaded from Sourceforge using the following link: [http://sourceforge.net/project/showfiles.php?group_id=134326 http://sourceforge.net/project/showfiles.php?group_id=134326]

No need to remember this URL as you can easily reach it from the [[AMOS|AMOS main page]].

This link will bring you to the Sourceforge download page for our project. While older versions of our code are also available for download from this page we recommend you download the latest version to take advantage of the full functionality of the code.

AMOS is released as a source-code package, with the exception of the OSX version of the assembly viewer Hawkeye, that can be downloaded as a binary from the File Release section of the download page. Instructions for compiling and installing AMOS are provided below.

=== Downloading the development version ===

If you want the bleeding-edge of AMOS, e.g. to edit the source code, you should download the development version of AMOS using CVS following the directions here: [http://sourceforge.net/scm/?type=cvs&group_id=134326 http://sourceforge.net/scm/?type=cvs&group_id=134326]

Or in short:
cvs -z3 -d:pserver:anonymous@amos.cvs.sourceforge.net:/cvsroot/amos co -P AMOS

== Installing AMOS ==
After reading this section make sure you also read the INSTALL file distributed with AMOS. This file may contain information pertaining to the latest version of AMOS that is not included here.

=== Installing the development version ===

The first step to install the CVS version of AMOS is to type:
./bootstrap

Then proceed with the instructions for the normal installation below.

=== Normal installation ===
The AMOS source package has a name like: amos-1.4.5.tar.gz where 1.4.5 is the version of the code. Once you untar this file (using "tar -xzf amos-1.4.5.tar.gz" in Linux, or "gunzip -c amos-1.4.5.tar.gz | tar xf -" in other flavors of Unix) you will find the current AMOS distribution in a directory named amos-1.4.5. The next steps assume you have cd'd into this directory.

AMOS uses the [http://www.gnu.org/software/autoconf GNU autoconf] package to reduce cross-platform compatibility issues. Before compiling the code you will need to run the configure script that will probe your system for the locations of all software packages required by AMOS.

By simply running:

./configure

you will prepare AMOS to be installed in the directory hosting the source package. This is OK if you are just testing AMOS. We recommend, however, that you provide the configure script with a more permanent home for AMOS, e.g.:

./configure --prefix=/usr/local

will ultimately lead the AMOS directory hierarchy to be installed underneath /usr/local/.

After running configure, check the messages left on your screen to make sure no errors occurred. Errors during the configure step can lead to an incomplete build.

To compile the code you need to simply run:

make

followed by

make install

to install AMOS into the directory selected with the --prefix option to configure.

Normally, these steps are sufficient to install AMOS on most UNIX systems. If you encounter errors during configuration or compilation, or if you are trying to install AMOS on an OSX or Cygwin system, please read the following sub-sections.

=== Specifying the location of MUMmer ===
If the configure script gives you a message like:

WARNING! nucmer was not found but is required to run AMOScmp
install nucmer if planning on using AMOScmp

you either have not installed the [http://mummer.sourceforge.net/ MUMmer] package, or you have installed it in a location where the configure script cannot find it. MUMmer (the nucmer program in particular) is required by the comparative assembler [[AMOScmp]].

To remedy this situation, please install MUMmer following instructions found at [http://mummer.sourceforge.net http://mummer.sourceforge.net].

If MUMmer is already installed, but configure cannot find it, you can specify the location of the nucmer program by setting the environment variable NUCMER, e.g.:

NUCMER=/usr/local/bin/mummer/nucmer
export NUCMER

in a "traditional" shell (sh, bash, ksh, etc.), or

setenv NUCMER /usr/local/bin/mummer/nucmer

in csh or tcsh. Of course you'll need to replace /usr/local/bin/mummer/nucmer with the actual location of this program on your system.
=== Specifying the location of the QT library ===
On most Unix installations (see below for OSX and Cygwin), the QT library should be properly installed and AMOS will make without any problems. If, however, you notice a message like:

WARNING! Qt3 toolkit was not found but is required to run AMOS GUIs

the configure process was not able to find the QT library on your system. Check with your system administrator to have this toolkit installed on your system. If, however, you are certain the toolkit is installed, but AMOS still didn't find it, you can directly specify the location of the toolkit directory, or specifically the include, bin, and lib directories, where QT is installed, and the name of the library file, using the following options to the configure script:

--with_Qt_dir
--with_Qt_include_dir
--with_Qt_lib_dir
--with_Qt_bin_dir
--with_Qt_lib

=== Ubuntu installation ===
[[Ubuntu installation]]

=== Fedora installation ===
[[Fedora installation]]

=== OSX installation ===

[[OSX installation]]

=== Cygwin installation ===
[[Cygwin installation]]

== Running AMOS ==

=== Basic AMOS concepts ===
AMOS consists of a collection of modules that operate on a central data-structure called a bank. A bank is really just a directory that contains a database (organized as a collection of indexed files) comprising assembly related objects such as reads, contigs, scaffolds, etc. The modules thus communicate with each other by making changes to the bank. For example, an assembler might consist of three modules: an overlapper, a contigger, and a multi-aligner. The overlapper will first read the shotgun reads from the bank, compare them to each other and write back to the bank a list of overlaps, i.e. pairs of reads that match each other. The contigger then reads the collection of overlaps and makes sense out of it, by producing a layout of the reads that is consistent with most of the observed overlaps. The contigger then writes these contigs (contiguous chunks of the genome) to the bank. Finally, the multi-aligner reads from the bank both the reads and the contigs, builds a multiple alignment of the reads, using as a guide the layout of the reads produced by the contigger, then updates the contigs with the detailed alignment information. Thus, the three programs were able to communicate with each other using the bank as an intermediate storage space. If this little description didn't make much sense to you, check out our [http://www.cbcb.umd.edu/research/assembly_primer.shtml Genome Assembly Primer]. It also has pointers to further reading.

Objects in the bank may be identified by one, or both of the following identifiers: IID (internal identifier) - an integer identifier, internal to AMOS; and EID (external identifier) - a string representing some external identifier of the record, e.g. the original name of a sequencing read. Both identifiers must be unique for a specific object type, but may be shared by multiple objects. For example, there can only be one contig with an IID equal to 1, however there can be both a contig, and a read, and an overlap, all with the IID = 1.
==== Message files ====
The AMOS banks are not the only mechanism for AMOS modules to communicate with each other, and to the "outside world". AMOS also uses a flat-file format (AMOS message files) inspired by the format used in Celera Assembler. This format is generally used as an intermediate format for converting to and from external file formats. The AMOS message files are then used to populate the data-structures present in a bank.

For more details on the AMOS message file format check out the [[Infrastructure]] pages. The use of message files will be described in more detail in the remainder of this tutorial.

==== Reading and writing banks ====
To learn how to generate AMOS message files check out the section called Creating inputs for AMOS. Assuming you already have an AMOS message file, most of the modules will require that the information from this file be loaded into a bank. This section describes the commands used to transfer information between a bank and the message file.

The command bank-transact can be used to load a message file into a bank. In its simplest invocation:

bank-transact -b mybank -m mymessagefile

bank-transact loads the messages in mymessagefile into the bank mybank. Note that this invocation assumes the bank already exists, and bank-transact will fail otherwise. When creating a new bank you can run:

bank-transact -c -b mybank -m mymessagefile

The option -c stands for "create". By also providing the option -f (force), the bank will be overwritten if it already exists.

The contents of a bank can be output into a flat-file format with the command:

bank-report -b mybank

By default bank-report outputs all the data in the bank. The output can be restricted to certain message types by providing the 3 letter codes of the messages to be output, e.g:

bank-report -b mybank CTG RED

will output all the contigs (CTG) and read (RED) records. In addition bank-report allows the user to specify a list of EIDs (option -E) or a list of IIDs (option -I) that will be reported.

==== Bank locking ====
To allow concurrent access to the bank, AMOS programs lock the bank while they operate on it. There are two types of locks: for reading, and writing. If a bank is locked for reading, other read accesses are allowed but no writes. If a bank is locked for writing, no concurrent accesses are allowed. Some of the AMOS tools (such as the viewer Hawkeye), have an option to load a bank in "inspect" mode, i.e. the code ignores any locks placed on the bank.

In certain situations, if a program accessing the bank crashes, the bank may remain locked, prohibiting further access. All existing locks can be removed with the command (make sure that another user is not accessing the same bank):

bank-unlock mybank

==== Bank versions ====
The specific format of the AMOS bank is closely related to the current version of the AMOS software. The banks are not backward compatible, i.e., a bank produced by AMOS 1.0 will not be readable by AMOS 1.5. A simple solution for reading a bank created by an older version of AMOS is to output the contents of the bank using bank-report (the AMOS distribution contains old versions of the bank-report code, e.g. bank-report-1.1) , then reload the bank with the most recent bank-transact command.

==== Pipelines ====
As it has hopefully become clear from the introduction to AMOS above, most genome assembly tasks involve the sequential execution of several modules, in an assembly line (or pipeline) fashion. AMOS provides a mechanism for quickly putting together simple pipelines. By "simple" we mean situations where the specific assembly task involves running several programs in order, without the need for more complex control structures such as "if" statements or loops. To implement complex pipelines you will have to rely on Perl or another complex programming language.

AMOS pipelines are described in a simple interpreted language, and consist of a series of steps that are executed in order. The steps are meant to provide a logical breakdown of the individual assembly tasks, representing the execution of one or more programs. Each step in a pipeline is identified by a step number (a throw-back to the days of the Basic language) providing the user with a mechanism to execute only some of the steps of a pipeline.

To learn more about AMOS pipelines and how to write them, check out the documentation for [[runAmos]] (the pipeline executor), or check out one of the pipelines distributed with AMOS (AMOScmp and minimus are good starting points).

=== Creating inputs for AMOS ===
The inputs to most AMOS programs must be provided in the AMOS message format. For help converting non-AMOS file formats into message files see the [[File conversion utilities]].

=== Running AMOScmp ===
AMOScmp is a comparative assembler that can be used to assemble reads from one genome (called the target) using as a template the sequence of a related genome (called the reference). Read the AMOScmp documentation for a detailed description of this program.

By default, running AMOScmp as follows:

AMOScmp prefix

assumes that the target is provided in the AMOS message file prefix.afg, and the reference in the file prefix.1con. To use different file locations, you can set the variables TGT and REF, either directly within the AMOScmp script, or on the command line:

AMOScmp -D "TGT=mytarget.afg" -D "REF=myreference.1con" prefix

The prefix must still be provided as it is used to generate the name of the output files.

AMOScmp will populate a bank named prefix.bnk, and will load into it a set of contigs, as well as a scaffold, linking together contigs that are adjacent along the reference. In addition, AMOScmp outputs the set of contigs as both a multi-FASTA file prefix.fasta, and a TIGR .contig file prefix.contig. Note that the consensus of the contigs (reported in the FASTA file) is generated from the target genome, and may differ from the reference genome (after all, the goal of the assembler is to assemble the target). In fact, AMOScmp uses sophisticated algorithms for detecting differences between the target and reference in order to prevent misassemblies. For more information refer to:

M. Pop, A. Phillippy, A.L. Delcher and S.L. Salzberg. [http://www.cbcb.umd.edu/papers/Pop%20et%20al%20Comparative.pdf Comparative genome assembly]. Briefings in Bioinformatics. 5(3), pp. 237-248, 2004.

=== Running minimus ===
Minimus is a basic genome assembler that can be used for small assembly jobs (e.g. a single gene, or a viral genome). Minimus is currently used as a central component of the Influenza A sequencing pipeline at The Institute for Genomic Research. Read the [[minimus]] documentation for more information.

To run minimus you must provide a set of shotgun reads in an AMOS message file. Running:

minimus prefix

assumes the input is in file prefix.afg. After running, minimus populates the bank prefix.bnk with a set of contigs, furthermore it reports the contigs in both a FASTA file (prefix.fasta) and a TIGR .contig file (prefix.contig). Note that minimus does not use mate-pairs. In essence it is, in Celera Assembler terminology, a unitigger. Any mate-pair information provided in the .afg will be silently ignored.

=== Viewing the result of an assembly ===
The content of a bank can be viewed with a program called Hawkeye:

hawkeye mybank

For detailed information on how to use Hawkeye, refer to the [[Hawkeye]] documentation.

=== Validating assemblies ===
Even the best genome assemblers sometimes make mistakes. AMOS provides a mechanism to run several checks on the output of an assembler (assuming the data are already stored in a bank), through a script called amosvalidate. Amosvalidate runs through the assembly and identifies several types of inconsistencies, such as clusters of SNPs in the assembled reads, clusters of mate-pairs that are too close or too far from each other (with respect to the estimated library sizes), and unassembled reads that do not properly match the assembly. A full description of these measures is beyond the scope of this document. We are currently submitting a manuscript describing the tools included in amosvalidate and will update this page when it gets published.

All the potential assembly problems identified by amosvalidate are written back into the bank as features, i.e. ranges along the assembly. Each feature is tagged with the problem that was identified in that region. Typically, users then load the assembly in the Hawkeye viewer and examine the assembly in the tagged regions. Alternatively, the features may be extracted from the bank and processed automatically by specialized software (e.g. several assemblies of the same genome can be compared by the number of features identified in the assembly - the assembly with fewer features is likely "better").

Running amosvalidate is as simple as:

amosvalidate prefix

where prefix.bnk is the location of the bank.

== Getting help ==
To report bugs in AMOS, or to get help, email us at:

amos-help (at) lists (dot) sourceforge (dot) net

To receive information regarding new releases and developments, please [http://lists.sourceforge.net/lists/listinfo/amos-users subscribe] to our moderated, low-traffic users' mailing list:

amos-users (at) lists (dot) sourceforge (dot) net

AMOS Getting Started

2010-12-16T14:02:15Z

Dmb000006: Switching out the explicit right aligned TOC for the equivelent template call

{{TOC}}

Is AMOS an assembler? is one of the first questions we are asked. The short answer is no. AMOS is not an assembler, rather a software infrastructure for developing assembly tools. If you are only interested in running an off-the-shelf assembler on your shotgun data, do not despair, AMOS provides two such assemblers: AMOScmp - a comparative assembler; and Minimus - a basic assembler for small datasets. However it is important to realize that, with a little bit of programming, you can use AMOS to put together your own shotgun assembler customized for the specific characteristics of your data.

This page will provide you with the basic information needed to get started using AMOS. Advanced AMOS users can go directly to in-depth resources from the main page [[AMOS]].

== Downloading AMOS ==
AMOS can be downloaded from Sourceforge using the following link: [http://sourceforge.net/project/showfiles.php?group_id=134326 http://sourceforge.net/project/showfiles.php?group_id=134326]

No need to remember this URL as you can easily reach it from the [[AMOS|AMOS main page]].

This link will bring you to the Sourceforge download page for our project. While older versions of our code are also available for download from this page we recommend you download the latest version to take advantage of the full functionality of the code.

AMOS is released as a source-code package, with the exception of the OSX version of the assembly viewer Hawkeye, that can be downloaded as a binary from the File Release section of the download page. Instructions for compiling and installing AMOS are provided below.

If you want the bleeding-edge of AMOS, e.g. to edit the source code, you should download the development version of AMOS using CVS following the directions here: [http://sourceforge.net/scm/?type=cvs&group_id=134326 http://sourceforge.net/scm/?type=cvs&group_id=134326]

Or in short:
cvs -z3 -d:pserver:anonymous@amos.cvs.sourceforge.net:/cvsroot/amos co -P AMOS

== Installing AMOS ==
After reading this section make sure you also read the INSTALL file distributed with AMOS. This file may contain information pertaining to the latest version of AMOS that is not included here.

=== Installation of the development version ===

The first step to install the CVS version of AMOS is to type:
./bootstrap

Then proceed with the instructions for the normal installation below.

=== Normal installation ===
The AMOS source package has a name like: amos-1.4.5.tar.gz where 1.4.5 is the version of the code. Once you untar this file (using "tar -xzf amos-1.4.5.tar.gz" in Linux, or "gunzip -c amos-1.4.5.tar.gz | tar xf -" in other flavors of Unix) you will find the current AMOS distribution in a directory named amos-1.4.5. The next steps assume you have cd'd into this directory.

AMOS uses the [http://www.gnu.org/software/autoconf GNU autoconf] package to reduce cross-platform compatibility issues. Before compiling the code you will need to run the configure script that will probe your system for the locations of all software packages required by AMOS.

By simply running:

./configure

you will prepare AMOS to be installed in the directory hosting the source package. This is OK if you are just testing AMOS. We recommend, however, that you provide the configure script with a more permanent home for AMOS, e.g.:

./configure --prefix=/usr/local

will ultimately lead the AMOS directory hierarchy to be installed underneath /usr/local/.

After running configure, check the messages left on your screen to make sure no errors occurred. Errors during the configure step can lead to an incomplete build.

To compile the code you need to simply run:

make

followed by

make install

to install AMOS into the directory selected with the --prefix option to configure.

Normally, these steps are sufficient to install AMOS on most UNIX systems. If you encounter errors during configuration or compilation, or if you are trying to install AMOS on an OSX or Cygwin system, please read the following sub-sections.

=== Specifying the location of MUMmer ===
If the configure script gives you a message like:

WARNING! nucmer was not found but is required to run AMOScmp
install nucmer if planning on using AMOScmp

you either have not installed the [http://mummer.sourceforge.net/ MUMmer] package, or you have installed it in a location where the configure script cannot find it. MUMmer (the nucmer program in particular) is required by the comparative assembler [[AMOScmp]].

To remedy this situation, please install MUMmer following instructions found at [http://mummer.sourceforge.net http://mummer.sourceforge.net].

If MUMmer is already installed, but configure cannot find it, you can specify the location of the nucmer program by setting the environment variable NUCMER, e.g.:

NUCMER=/usr/local/bin/mummer/nucmer
export NUCMER

in a "traditional" shell (sh, bash, ksh, etc.), or

setenv NUCMER /usr/local/bin/mummer/nucmer

in csh or tcsh. Of course you'll need to replace /usr/local/bin/mummer/nucmer with the actual location of this program on your system.
=== Specifying the location of the QT library ===
On most Unix installations (see below for OSX and Cygwin), the QT library should be properly installed and AMOS will make without any problems. If, however, you notice a message like:

WARNING! Qt3 toolkit was not found but is required to run AMOS GUIs

the configure process was not able to find the QT library on your system. Check with your system administrator to have this toolkit installed on your system. If, however, you are certain the toolkit is installed, but AMOS still didn't find it, you can directly specify the location of the toolkit directory, or specifically the include, bin, and lib directories, where QT is installed, and the name of the library file, using the following options to the configure script:

--with_Qt_dir
--with_Qt_include_dir
--with_Qt_lib_dir
--with_Qt_bin_dir
--with_Qt_lib

=== Ubuntu installation ===
[[Ubuntu installation]]

=== Fedora installation ===
[[Fedora installation]]

=== OSX installation ===

[[OSX installation]]

=== Cygwin installation ===
[[Cygwin installation]]

== Running AMOS ==

=== Basic AMOS concepts ===
AMOS consists of a collection of modules that operate on a central data-structure called a bank. A bank is really just a directory that contains a database (organized as a collection of indexed files) comprising assembly related objects such as reads, contigs, scaffolds, etc. The modules thus communicate with each other by making changes to the bank. For example, an assembler might consist of three modules: an overlapper, a contigger, and a multi-aligner. The overlapper will first read the shotgun reads from the bank, compare them to each other and write back to the bank a list of overlaps, i.e. pairs of reads that match each other. The contigger then reads the collection of overlaps and makes sense out of it, by producing a layout of the reads that is consistent with most of the observed overlaps. The contigger then writes these contigs (contiguous chunks of the genome) to the bank. Finally, the multi-aligner reads from the bank both the reads and the contigs, builds a multiple alignment of the reads, using as a guide the layout of the reads produced by the contigger, then updates the contigs with the detailed alignment information. Thus, the three programs were able to communicate with each other using the bank as an intermediate storage space. If this little description didn't make much sense to you, check out our [http://www.cbcb.umd.edu/research/assembly_primer.shtml Genome Assembly Primer]. It also has pointers to further reading.

Objects in the bank may be identified by one, or both of the following identifiers: IID (internal identifier) - an integer identifier, internal to AMOS; and EID (external identifier) - a string representing some external identifier of the record, e.g. the original name of a sequencing read. Both identifiers must be unique for a specific object type, but may be shared by multiple objects. For example, there can only be one contig with an IID equal to 1, however there can be both a contig, and a read, and an overlap, all with the IID = 1.
==== Message files ====
The AMOS banks are not the only mechanism for AMOS modules to communicate with each other, and to the "outside world". AMOS also uses a flat-file format (AMOS message files) inspired by the format used in Celera Assembler. This format is generally used as an intermediate format for converting to and from external file formats. The AMOS message files are then used to populate the data-structures present in a bank.

For more details on the AMOS message file format check out the [[Infrastructure]] pages. The use of message files will be described in more detail in the remainder of this tutorial.

==== Reading and writing banks ====
To learn how to generate AMOS message files check out the section called Creating inputs for AMOS. Assuming you already have an AMOS message file, most of the modules will require that the information from this file be loaded into a bank. This section describes the commands used to transfer information between a bank and the message file.

The command bank-transact can be used to load a message file into a bank. In its simplest invocation:

bank-transact -b mybank -m mymessagefile

bank-transact loads the messages in mymessagefile into the bank mybank. Note that this invocation assumes the bank already exists, and bank-transact will fail otherwise. When creating a new bank you can run:

bank-transact -c -b mybank -m mymessagefile

The option -c stands for "create". By also providing the option -f (force), the bank will be overwritten if it already exists.

The contents of a bank can be output into a flat-file format with the command:

bank-report -b mybank

By default bank-report outputs all the data in the bank. The output can be restricted to certain message types by providing the 3 letter codes of the messages to be output, e.g:

bank-report -b mybank CTG RED

will output all the contigs (CTG) and read (RED) records. In addition bank-report allows the user to specify a list of EIDs (option -E) or a list of IIDs (option -I) that will be reported.

==== Bank locking ====
To allow concurrent access to the bank, AMOS programs lock the bank while they operate on it. There are two types of locks: for reading, and writing. If a bank is locked for reading, other read accesses are allowed but no writes. If a bank is locked for writing, no concurrent accesses are allowed. Some of the AMOS tools (such as the viewer Hawkeye), have an option to load a bank in "inspect" mode, i.e. the code ignores any locks placed on the bank.

In certain situations, if a program accessing the bank crashes, the bank may remain locked, prohibiting further access. All existing locks can be removed with the command (make sure that another user is not accessing the same bank):

bank-unlock mybank

==== Bank versions ====
The specific format of the AMOS bank is closely related to the current version of the AMOS software. The banks are not backward compatible, i.e., a bank produced by AMOS 1.0 will not be readable by AMOS 1.5. A simple solution for reading a bank created by an older version of AMOS is to output the contents of the bank using bank-report (the AMOS distribution contains old versions of the bank-report code, e.g. bank-report-1.1) , then reload the bank with the most recent bank-transact command.

==== Pipelines ====
As it has hopefully become clear from the introduction to AMOS above, most genome assembly tasks involve the sequential execution of several modules, in an assembly line (or pipeline) fashion. AMOS provides a mechanism for quickly putting together simple pipelines. By "simple" we mean situations where the specific assembly task involves running several programs in order, without the need for more complex control structures such as "if" statements or loops. To implement complex pipelines you will have to rely on Perl or another complex programming language.

AMOS pipelines are described in a simple interpreted language, and consist of a series of steps that are executed in order. The steps are meant to provide a logical breakdown of the individual assembly tasks, representing the execution of one or more programs. Each step in a pipeline is identified by a step number (a throw-back to the days of the Basic language) providing the user with a mechanism to execute only some of the steps of a pipeline.

To learn more about AMOS pipelines and how to write them, check out the documentation for [[runAmos]] (the pipeline executor), or check out one of the pipelines distributed with AMOS (AMOScmp and minimus are good starting points).

=== Creating inputs for AMOS ===
The inputs to most AMOS programs must be provided in the AMOS message format. For help converting non-AMOS file formats into message files see the [[File conversion utilities]].

=== Running AMOScmp ===
AMOScmp is a comparative assembler that can be used to assemble reads from one genome (called the target) using as a template the sequence of a related genome (called the reference). Read the AMOScmp documentation for a detailed description of this program.

By default, running AMOScmp as follows:

AMOScmp prefix

assumes that the target is provided in the AMOS message file prefix.afg, and the reference in the file prefix.1con. To use different file locations, you can set the variables TGT and REF, either directly within the AMOScmp script, or on the command line:

AMOScmp -D "TGT=mytarget.afg" -D "REF=myreference.1con" prefix

The prefix must still be provided as it is used to generate the name of the output files.

AMOScmp will populate a bank named prefix.bnk, and will load into it a set of contigs, as well as a scaffold, linking together contigs that are adjacent along the reference. In addition, AMOScmp outputs the set of contigs as both a multi-FASTA file prefix.fasta, and a TIGR .contig file prefix.contig. Note that the consensus of the contigs (reported in the FASTA file) is generated from the target genome, and may differ from the reference genome (after all, the goal of the assembler is to assemble the target). In fact, AMOScmp uses sophisticated algorithms for detecting differences between the target and reference in order to prevent misassemblies. For more information refer to:

M. Pop, A. Phillippy, A.L. Delcher and S.L. Salzberg. [http://www.cbcb.umd.edu/papers/Pop%20et%20al%20Comparative.pdf Comparative genome assembly]. Briefings in Bioinformatics. 5(3), pp. 237-248, 2004.

=== Running minimus ===
Minimus is a basic genome assembler that can be used for small assembly jobs (e.g. a single gene, or a viral genome). Minimus is currently used as a central component of the Influenza A sequencing pipeline at The Institute for Genomic Research. Read the [[minimus]] documentation for more information.

To run minimus you must provide a set of shotgun reads in an AMOS message file. Running:

minimus prefix

assumes the input is in file prefix.afg. After running, minimus populates the bank prefix.bnk with a set of contigs, furthermore it reports the contigs in both a FASTA file (prefix.fasta) and a TIGR .contig file (prefix.contig). Note that minimus does not use mate-pairs. In essence it is, in Celera Assembler terminology, a unitigger. Any mate-pair information provided in the .afg will be silently ignored.

=== Viewing the result of an assembly ===
The content of a bank can be viewed with a program called Hawkeye:

hawkeye mybank

For detailed information on how to use Hawkeye, refer to the [[Hawkeye]] documentation.

=== Validating assemblies ===
Even the best genome assemblers sometimes make mistakes. AMOS provides a mechanism to run several checks on the output of an assembler (assuming the data are already stored in a bank), through a script called amosvalidate. Amosvalidate runs through the assembly and identifies several types of inconsistencies, such as clusters of SNPs in the assembled reads, clusters of mate-pairs that are too close or too far from each other (with respect to the estimated library sizes), and unassembled reads that do not properly match the assembly. A full description of these measures is beyond the scope of this document. We are currently submitting a manuscript describing the tools included in amosvalidate and will update this page when it gets published.

All the potential assembly problems identified by amosvalidate are written back into the bank as features, i.e. ranges along the assembly. Each feature is tagged with the problem that was identified in that region. Typically, users then load the assembly in the Hawkeye viewer and examine the assembly in the tagged regions. Alternatively, the features may be extracted from the bank and processed automatically by specialized software (e.g. several assemblies of the same genome can be compared by the number of features identified in the assembly - the assembly with fewer features is likely "better").

Running amosvalidate is as simple as:

amosvalidate prefix

where prefix.bnk is the location of the bank.

== Getting help ==
To report bugs in AMOS, or to get help, email us at:

amos-help (at) lists (dot) sourceforge (dot) net

To receive information regarding new releases and developments, please [http://lists.sourceforge.net/lists/listinfo/amos-users subscribe] to our moderated, low-traffic users' mailing list:

amos-users (at) lists (dot) sourceforge (dot) net

Template:TOC

2010-12-16T14:01:44Z

Dmb000006:

<noinclude>Template for generating a right aligned TOC

== Template code ==
</noinclude>
{| align="right"
| __TOC__
|}

Template:TOC

2010-12-16T13:58:03Z

Dmb000006: Templates are good

<noinclude>Template for generating a right aligned TOC

== Template code ==
</noinclude><includeonly>
{| align="right"
| __TOC__
|}
</includeonly>

Bambus 2.0/quick start guide

2010-12-16T13:27:20Z

Dmb000006: /* Outputs */

This is a copy of the Bambus 2 user guide taken (and improved) from here: http://www.cbcb.umd.edu/software/bambus/doc/HowToBambus2.pdf

See also: http://www.cbcb.umd.edu/software/bambus

==How to run Bambus 2.0==
'''Caveat:''' Bambus is still being actively developed and the code is currently in the "user beware" and "for experts only" stage.

=== Step 1. Install the AMOS package - Bambus 2.0 is part of it. ===
See http://sourceforge.net/apps/mediawiki/amos/index.php?title=AMOS_Getting_Started

'''Note:''' since Bambus is still under active development you should pull the latest unofficial release of AMOS directly from the CVS repository - see instructions at: http://sourceforge.net/apps/mediawiki/amos/index.php?title=Programmer%27s_guide

=== Step 2. What information you need ===
Bambus needs to know about the contigs produced by the assembler and how these contigs are linked to each other. In AMOS terms, the basic information necessary is a list of contigs (http://amos.sourceforge.net/docs/api/classAMOS_1_1Contig__t.html) and a list of contig links (http://amos.sourceforge.net/docs/api/classAMOS_1_1ContigLink__t.html) or contig edges (http://amos.sourceforge.net/docs/api/classAMOS_1_1ContigEdge__t.html - bundles of consistent contig links) indicating the relative placement of pairs of contigs.

These data can either be provided to Bambus directly in the form of an AMOS message file (see http://sourceforge.net/apps/mediawiki/amos/index.php?title=Message_Types) or inferred from mate-pair information as described below.

== Running Bambus 2.0 ==
* First, add the .afg file built as described above (for other conversion utilities see: http://sourceforge.net/apps/mediawiki/amos/index.php?title=File_conversion_utilities) to an AMOS bank (flat-file database):
bank-transact -cf myproj.bnk -m myfile.afg

* Use the mate-pair information to construct a collection of contig links.
clk -b myproj.bnk

'''Note:''' that you can also construct these links with your own custom software and upload them into the bank in which case you would skip the "clk" command.

* Bundle the contig links into a collection of contig edges.
Bundler -b myproj.bnk

'''Note:''' as with the clk command you might want to build the contig edges separately and upload them into the bank using your own software.

'''Note:''' the Bundler command also accepts the command line parameter "-t" followed by a list of edge types as defined in src/AMOS/Link_AMOS.hh. Currently the following types are defined: M- mate-pair, O - overlap, P - physical, A - alignment, S - synteny, X - other.

* Identify genomic repeats and output them to std out
MarkRepeats -b myproj.bnk [-redundancy X -aggressive] > myRepeats

Optional parameters:
:"-redundancy X" only uses contig edges comprising X or more contig links
:"-aggressive" - aggressive repeat identification based on global depth of coverage statistics (default procedure relies on graph analysis rather than coverage statistics)

'''Note:''' this program requires the boost library

* Order and orient contigs according to repeat and link information

'''IMPORTANT:''' several of the operations performed by this program destructively modify the bank (changes cannot be undone). You should make a copy of the bank prior to running OrientContigs.

OrientContigs -b myproj.bnk -prefix myscaff

:"-prefix" specifies the prefix for all output files

Optional parameters:
:"-all" - output unlinked contigs as scaffolds
:"-noreduce" - turns off graph simplification routines (see below)
:"-redundancy X" - same as above - ignore edges with fewer than X links
:"-repeats filename" - ignores repeats listed in "filename" (one contig ID per line) as generated, e.g. by the MarkRepeats program described above.
:"-aggressive" - aggressive scaffolding - by default links that are stretched by more than 3 standard deviations are ignored. Aggressive option turns this feature off and tries to reconcile the scaffold as best possible.

* Linearize the scaffolds (if desired). By default Bambus 2 produces non-linear graph-based scaffolds. If fasta output is desired, it is necessary to linearize the scaffolds.
untangle -e myscaff.evidence.xml -s myscaff.out.xml -o myscaff.untangle.xml

* Output fasta result (if desired). This involves two steps: the first generates the fasta file representing the contigs and the second combines them, separated by Ns, into a scaffold fasta file.
bank2fasta -d -b myproj.bnk > contigs.fasta
printScaff -e myscaff.evidence.xml -s myscaff.untangle.xml -l myscaff.library -f contigs.fasta -merge -o myscaff

== Outputs ==
The output of the OrientContigs program is a collection of scaffolds stored in the bank. The program also generates several files starting with the specified prefix
*myScaff.agp
**The scaffolds generated by the OrientContigs program in NCBI AGP format
*myScaff.dot
**The scaffolds generated by the OrientContigs program in Graphviz dot format. It can be converted to a PostScript or PDF file using the dot program in the Graphviz package.
*myScaff.evidence.xml
*myScaff.library
*myScaff.out.xml
**The scaffolds generated by the OrientContigs program compatible with the Bambus 1 format.
*myScaff.fasta
**The fasta file of the scaffolds, joined by Ns
*myScaff.stats
**Statistics on the scaffolds generated, including N50 and total span.

=== Scaffold simplifications ===
By default (unless option "-noreduce" is provided) the OrientContigs program simplifies certain
graph patterns:
* simple paths
* bubbles
** These patterns are iteratively merged into single contigs until no additional simplifications can be made.

User talk:Dmb000006

2010-12-16T11:37:44Z

Dmb000006:

Get me at: [mailto:dan.bolser@gmail.com dan.bolser@gmail.com]

User talk:Dmb000006

2010-12-16T11:36:48Z

Dmb000006: Created page with 'Get me at: dan.bolser@gmail.com'

Get me at: dan.bolser@gmail.com

User:Dmb000006

2010-12-16T11:34:43Z

Dmb000006: Created page with 'http://openwetware.org/wiki/User:Dan_Bolser'

http://openwetware.org/wiki/User:Dan_Bolser

Bambus 2.0

2010-12-16T11:33:36Z

Dmb000006: Redirected page to Bambus2

#REDIRECT [[Bambus2]]

Bambus 2.0/quick start guide

2010-12-16T11:31:52Z

Dmb000006: Created page with 'This is a copy of the Bambus 2 user guide taken (and improved) from here: http://www.cbcb.umd.edu/software/bambus/doc/HowToBambus2.pdf See also: http://www.cbcb.umd.edu/software…'

This is a copy of the Bambus 2 user guide taken (and improved) from here: http://www.cbcb.umd.edu/software/bambus/doc/HowToBambus2.pdf

See also: http://www.cbcb.umd.edu/software/bambus

==How to run Bambus 2.0==
'''Caveat:''' Bambus is still being actively developed and the code is currently in the "user beware" and "for experts only" stage.

=== Step 1. Install the AMOS package - Bambus 2.0 is part of it. ===
See http://sourceforge.net/apps/mediawiki/amos/index.php?title=AMOS_Getting_Started

'''Note:''' since Bambus is still under active development you should pull the latest unofficial release of AMOS directly from the CVS repository - see instructions at: http://sourceforge.net/apps/mediawiki/amos/index.php?title=Programmer%27s_guide

=== Step 2. What information you need ===
Bambus needs to know about the contigs produced by the assembler and information about how these contigs are linked to each other. In AMOS terms, the basic information necessary are a list of contigs (http://amos.sourceforge.net/docs/api/classAMOS_1_1Contig__t.html) and a list of contig links (http://amos.sourceforge.net/docs/api/classAMOS_1_1ContigLink__t.html) or contig edges (http://amos.sourceforge.net/docs/api/classAMOS_1_1ContigEdge__t.html - bundles of consistent contig links) indicating the relative placement of pairs of contigs.

These data can either be provided to Bambus directly in the form of an AMOS message file (see http://sourceforge.net/apps/mediawiki/amos/index.php?title=Message_Types) or inferred from mate-pair information as described below.

== Running Bambus 2.0 ==
* First, add the .afg file built as described above (for other conversion utilities see: http://sourceforge.net/apps/mediawiki/amos/index.php?title=File_conversion_utilities) to an AMOS bank (flat-file database):
bank-transact -cf myproj.bnk -m myfile.afg

* Use the mate-pair information to construct a collection of contig links.
clk -b myproj.bnk

'''Note:''' that you can also construct these links with your own custom software and upload them into the bank in which case you would skip the "clk" command.

* Bundle the contig links into a collection of contig edges.
Bundler -b myproj.bnk

'''Note:''' as with the clk command you might want to build the contig edges separately and upload them into the bank using your own software.

'''Note:''' the Bundler command also accepts the command line parameter "-t" followed by a list of edge types as defined in src/AMOS/Link_AMOS.hh. Currently the following types are defined: M- mate-pair, O - overlap, P - physical, A - alignment, S - synteny, X - other.

* Identify genomic repeats and output them to std out
MarkRepeats -b myproj.bnk [-redundancy X -aggressive] > myRepeats

Optional parameters:
:"-redundancy X" only uses contig edges comprising X or more contig links
:"-aggressive" - aggressive repeat identification based on global depth of coverage statistics (default procedure relies on graph analysis rather than coverage statistics)

'''Note:''' this program requires the boost library

* Order and orient contigs according to repeat and link information

'''IMPORTANT:''' several of the operations performed by this program destructively modify the bank (changes cannot be undone). You should make a copy of the bank prior to running OrientContigs.

OrientContigs -b myproj.bnk -prefix myscaff

:"-prefix" specifies the prefix for all output files

Optional parameters:
:"-all" - output unlinked contigs as scaffolds
:"-noreduce" - turns off graph simplification routines (see below)
:"-redundancy X" - same as above - ignore edges with fewer than X links
:"-repeats filename" - ignores repeats listed in "filename" (one contig ID per line) as generated, e.g. by the MarkRepeats program described above.
:"-aggressive" - aggressive scaffolding - by default links that are stretched by more than 3 standard deviations are ignored. Aggressive option turns this feature off and tries to reconcile the scaffold as best possible.

* Linearize the scaffolds (if desired). By default Bambus 2 produces non-linear graph-based scaffolds. If fasta output is desired, it is necessary to linearize the scaffolds.
untangle -e myscaff.evidence.xml -s myscaff.out.xml -o myscaff.untangle.xml

* Output fasta result (if desired). This involves two steps: the first generates the fasta file representing the contigs and the second combines them, separated by Ns, into a scaffold fasta file.
bank2fasta -d -b myproj.bnk > contigs.fasta
printScaff -e myscaff.evidence.xml -s myscaff.untangle.xml -l myscaff.library -f contigs.fasta -merge -o myscaff

== Outputs ==
The output of the OrientContigs program is a collection of scaffolds stored in the bank. The program also generates several files starting with the specified prefix
*myScaff.agp
**The scaffolds generated by the OrientContigs program in NCBI AGP format
*myScaff.dot
**The scaffolds generated by the OrientContigs program in Graphviz dot format. It can be converted to a PostScript or PDF file using the dot program in the Graphviz package.
*myScaff.evidence.xml
*myScaff.library
*myScaff.out.xml
**The scaffolds generated by the OrientContigs program compatible with the Bambus 1 format.
*myScaff.fasta
**The fasta file of the scaffolds, joined by Ns
*myScaff.stats
**Statistics on the scaffolds generated, including N50 and total span.

=== Scaffold simplifications ===
By default (unless option "-noreduce" is provided) the OrientContigs program simplifies certain
graph patterns:
* simple paths
* bubbles
** These patterns are iteratively merged into single contigs until no additional simplifications can be made.

Bambus2

2010-12-16T11:20:47Z

Dmb000006:

Getting data into Bambus 2 can be a pain. Here is my recipe:

[[toAmos]] \
-s my.fa \
-c my.contig \
-m my.mates \
-o my.afg

You need the .fa to list the contigs within the GDE-like contig file (annoying but true). You don't need accurate sequences in the .fa; you just need something to make the format valid. The .contig and .mates are as expected for [[Bambus]].

The resulting .afg is then 'banked' with:

[[bank-transact]] -c \
-b my.bnk \
-m my.afg

For more details, see the info here: http://www.cbcb.umd.edu/software/bambus/, which I have dumped here [[Bambus 2.0/quick start guide]].

Bambus2

2010-12-16T09:14:02Z

Dmb000006: Created page with 'Getting data into Bambus 2 can be a pain. Here is my recipe: toAmos \ -s my.fa \ -c my.contig \ -m my.mates \ -o my.afg You need the .fa to list the contigs withi…'

ToAmos

2010-12-14T18:12:30Z

Dmb000006: /* Options */ link for gde-like format contig file

ToAmos

2010-12-14T18:11:26Z

Dmb000006:

toAmos: converter from various types of inputs to AMOS messages

== Overview ==

toAmos is primarily designed for converting the output of an assembly program into the AMOS format so that it can be stored in an AMOS bank. toAmos can be used as a replacement for tarchive2amos however the latter is more flexible when converting from Trace Archive or simple .seq and .qual inputs.

== Synopsis ==

toAmos -o out_file
(-s fasta_reads (-q qual_file) (-gq good_qual) (-bq bad_qual))
(-c tigr_contig | -a celera_asm [-S][-utg] | -ta tigr_asm | -ace phrap_ace [-phd])
(-m bambus_mates | -x trace_xml | -f celera_frg [-acc])
(-arachne arachne_links | -scaff bambus_scaff)
(-i insert_file | -map dst_map)
(-pos pos_file)
(-id min_id)

toAmos reads the inputs specified on the command line and converts the information into AMOS message format. The following types of information can be provided to toAmos:

* Sequence and quality data (options -f, -s, -q, -gq, or -bq)
* Library and mate-pair data (options -m, -x, -f, -i, or -map)
* Contig data (options -c, -a, -ta, or -ace)
* Scaffold data (option -a)

== Options ==
{| class="somecssclass" border="1"
|-
| -o <out_file> || output filename ('-' for standard output)
|-
| -s <fasta_reads> || sequence data file in FASTA format (reads names ending in .1 or /1 are taken as mate pairs)
|-
| -q <qual_file> || sequence quality score file in QUAL format
|-
| -gq <good_qual> || minimum quality score for high-quality bases (default: 30) - if no quality file is provided, bases within the clear range are assigned this quality value
|-
| -bq <bad_qual> || maximum quality score for low-quality bases (default: 10) - if no quality file is provided, bases outside the clear range are assigned this quality value
|-
| -c <tigr_contig> || provide TIGR .contig file
|-
| -a <celera_asm> || use Celera Assembler .asm contig file (contig and scaffold information)
|-
| -S || include the surrogate unitigs in the .asm file as AMOS contigs
|-
| -utg || include all UTG unitig messages in the .asm file as AMOS contigs
|-
| -ta <tigr_asm> || contig file in TIGR Assembler format (.tasm) [http://www.cbcb.umd.edu/research/contig_representation.shtml]
|-
| -ace <phrap_ace> || contig file in Phred ACE format (can be accompanied by -q)
|-
| -phd || read the content of PHD file referenced in ACE files
|-
| -m <bambus_mates> || library and mate-pair information file in Bambus format
|-
| -x <trace_xml> || ancillary data file (library, mate-pair, clear range) in Trace Archive XML format
|-
| -f <celera_frg> || library, mate-pair, sequence, quality, and clear range data file in Celera Assembler format
|-
| -acc || use accession numbers in FRG files
|-
| -arachne <arachne_links> || scaffold file in Arachne .links format
|-
| -scaff <bambus_scaff> || scaffold file in Bambus .scaff format
|-
| -map <dst_map> || read map information - mapping from internal library ID to external library ID useful in conjunction with the -f option. This file consists of space-separated records providing a mapping from the "acc:" field in "DST" records within the .frg file to an externally recognizable name for each library.
|-
| -pos <pos_file> || TIGR-style .pos position file
|-
| -id <min_id> || start numbering contigs at this number
|-
|}

== TIGR specific options (not too useful outside TIGR) ==

* -i <insert file> - use mapping from internal library ID to external library ID provided in a .insert file produced by pullfrag.

== Known issues ==

The -ta (TIGR Assembler input) and -ace (ACE formatted input) options have not been thoroughly tested and likely do not work properly. Contact us if either of these options is important to you.

== Errors ==
toAmos -c my.test.contig -m my.test.mates -o my.test.afg
Cannot find ID for sequence lid05.f

This problem is caused by...

Bank2contig

2010-12-14T18:10:44Z

Dmb000006: Link to contig formats and wp style

'''bank2contig''' is a general converter from AMOS banks into a variety of other contig formats.

For descriptions of format, see [http://www.cbcb.umd.edu/research/contig_representation.shtml]

== TIGR Assembler / GDE Contig Format ==

The .contig format is a simple text format for encoding read to contig alignments. This is the default output format for bank2contig. The layout format (-L) is the same as the contig format, except no sequence information is written. This is useful for listing the reads in each contig, their positions, clear ranges, etc.

Example:

##56487 19 1623 bases, 00000000 checksum.
TTAGACCCAGGAGAAG-CATAAAATTTTCAGAGCCATCTGATGTAGGAGGAAGTTATGAA
#000035230611N10F(0) [RC] 711 bases, 00000000 checksum. {720 10} <1 710>
TTAGACCCAGGAGAAG-CATAAAATTTTCAGAGCCATCTGATGTAGGAGGAAGTTATGAA

* Each contig is preceded by a header starting with ##, followed by the contig identifier, number of reads aligned to it, and the number of bases in the padded consensus. If generated by TIGR Assembler, these records also contain an 8-digit checksum, however most converters generate a blank checksum (it's not used by any code anyway).

* The contig sequence, listed after the "##" header, is padded with the gap character.

* Each read aligned to the consensus is preceded by a header starting with a single "#" character. Provided in parentheses is the 0-based offset of the read in the consensus. Within the square brackets the string "RC" indicates the read was reverse complemented, a fact also indicated in the representation of the clear range within the braces ({720 10}). The clear range is 1-based with respect to the unpadded/ungapped read sequence. Note the low number is 10, meaning the first 9 bases (1-9) have been trimmed from the beginning (5' end) of the read. There may also be bases trimmed at the end of the read (3' end) beyond base 720, but this format does not record how many bases there are. Next, the coordinates of the read along the ungapped 1-based consensus are provided within angle brackets (<1 710>). This header also contains a checksum (largely ignored) and information about the number of bases following it.

* After the read header, the aligned section of the read (the bases within the clear range alone) is provided in padded form, and in the correct orientation (complemented if necessary).

== SAM Conversion ==
The [http://samtools.sf.net SAM (Sequence Alignment/Map)] format is a generic format for storing large nucleotide sequence alignments, used in the 1000 genomes project and many others.

bank2contig is a basic converter from the AMOS assembly format into SAM format. It works from AMOS Banks (indexed binary format), and outputs the assembled reads with extended CIGAR strings compatible with the samtools library. At this time it does not convert mate or library information, but should be sufficient for analyzing & visualizing the read to contig alignments from a variety of assembly formats, including AMOS, Celera Assembler, phrap, velvet, etc.

The basic steps are:

1. Create AMOS AFG file: AMOScmp, Minimus, & velvet automatically create AFG files

# Or convert ACE File
$ toAmos -ace data.ace -o data.afg

# Or convert Celera Assembler
$ toAmos -frg data.frg -a data.asm -o data.afg

2. Create AMOS bank
$ bank-transact -m data.afg -b data.bnk -c

3. Create contig fasta & SAM alignment file
$ bank2fasta -i -b data.bnk > data.fa
$ bank2contig -i -s data.bnk > data.sam

4. Load with samtools and view alignments
$ samtools faidx data.fa # index the contig FASTA
$ samtools import data.fa.fai data.sam data.bam # SAM->BAM
$ samtools index data.bam # index BAM
$ samtools tview data.bam data.fa # view alignments

== DNPTrapper ==

[http://dnptrapper.sourceforge.net/ DNPTrapper] is an assembly editing and visualization tool specifically designed for manual analysis and finishing of repeated regions. It differs from previous tools by providing flexibility and an overview that greatly simplifies the finishing process, by allowing the user to view whole repeat regions at once and to edit assembly errors manually by drag and drop. The program implements and visualizes the results of a previously described statistical method that detects defined nucleotide positions (DNPs, representing single base differences between repeat units) in the presence of sequencing errors.

Usage:

bank2contig -T data.bnk > data.xml

== Simple Layout ==

The simple layout format (-S) is a simple tab-delimited file with the ids of the reads in the contig. The fields are:

1. contig id
2. contig status
3. read id
4. reverse complement flag (0/1)
5. read offset (0-based gapped offset)

ToAmos

2010-12-14T18:06:40Z

Dmb000006: The -i option is decribed in it's own section below

toAmos: converter from various types of inputs to AMOS messages

== Overview ==

toAmos is primarily designed for converting the output of an assembly program into the AMOS format so that it can be stored in an AMOS bank. toAmos can be used as a replacement for tarchive2amos however the latter is more flexible when converting from Trace Archive or simple .seq and .qual inputs.

== Synopsis ==

toAmos -o out_file
(-s fasta_reads (-q qual_file) (-gq good_qual) (-bq bad_qual))
(-c tigr_contig | -a celera_asm [-S][-utg] | -ta tigr_asm | -ace phrap_ace [-phd])
(-m bambus_mates | -x trace_xml | -f celera_frg [-acc])
(-arachne arachne_links | -scaff bambus_scaff)
(-i insert_file | -map dst_map)
(-pos pos_file)
(-id min_id)

toAmos reads the inputs specified on the command line and converts the information into AMOS message format. The following types of information can be provided to toAmos:

* Sequence and quality data (options -f, -s, -q, -gq, or -bq)
* Library and mate-pair data (options -m, -x, -f, -i, or -map)
* Contig data (options -c, -a, -ta, or -ace)
* Scaffold data (option -a)

== Options ==
{| class="somecssclass" border="1"
|-
| -o <out_file> || output filename ('-' for standard output)
|-
| -s <fasta_reads> || sequence data file in FASTA format (reads names ending in .1 or /1 are taken as mate pairs)
|-
| -q <qual_file> || sequence quality score file in QUAL format
|-
| -gq <good_qual> || minimum quality score for high-quality bases (default: 30) - if no quality file is provided, bases within the clear range are assigned this quality value
|-
| -bq <bad_qual> || maximum quality score for low-quality bases (default: 10) - if no quality file is provided, bases outside the clear range are assigned this quality value
|-
| -c <tigr_contig> || provide TIGR .contig file
|-
| -a <celera_asm> || use Celera Assembler .asm contig file (contig and scaffold information)
|-
| -S || include the surrogate unitigs in the .asm file as AMOS contigs
|-
| -utg || include all UTG unitig messages in the .asm file as AMOS contigs
|-
| -ta <tigr_asm> || contig file in TIGR Assembler format (.tasm)
|-
| -ace <phrap_ace> || contig file in Phred ACE format (can be accompanied by -q)
|-
| -phd || read the content of PHD file referenced in ACE files
|-
| -m <bambus_mates> || library and mate-pair information file in Bambus format
|-
| -x <trace_xml> || ancillary data file (library, mate-pair, clear range) in Trace Archive XML format
|-
| -f <celera_frg> || library, mate-pair, sequence, quality, and clear range data file in Celera Assembler format
|-
| -acc || use accession numbers in FRG files
|-
| -arachne <arachne_links> || scaffold file in Arachne .links format
|-
| -scaff <bambus_scaff> || scaffold file in Bambus .scaff format
|-
| -map <dst_map> || read map information - mapping from internal library ID to external library ID useful in conjunction with the -f option. This file consists of space-separated records providing a mapping from the "acc:" field in "DST" records within the .frg file to an externally recognizable name for each library.
|-
| -pos <pos_file> || TIGR-style .pos position file
|-
| -id <min_id> || start numbering contigs at this number
|-
|}

== TIGR specific options (not too useful outside TIGR) ==

* -i <insert file> - use mapping from internal library ID to external library ID provided in a .insert file produced by pullfrag.

== Known issues ==

The -ta (TIGR Assembler input) and -ace (ACE formatted input) options have not been thoroughly tested and likely do not work properly. Contact us if either of these options is important to you.

== Errors ==
toAmos -c my.test.contig -m my.test.mates -o my.test.afg
Cannot find ID for sequence lid05.f

This problem is caused by...

ToAmos

2010-12-14T18:04:05Z

Dmb000006: /* Known issues */

toAmos: converter from various types of inputs to AMOS messages

== Overview ==

toAmos is primarily designed for converting the output of an assembly program into the AMOS format so that it can be stored in an AMOS bank. toAmos can be used as a replacement for tarchive2amos however the latter is more flexible when converting from Trace Archive or simple .seq and .qual inputs.

== Synopsis ==

toAmos -o out_file
(-s fasta_reads (-q qual_file) (-gq good_qual) (-bq bad_qual))
(-c tigr_contig | -a celera_asm [-S][-utg] | -ta tigr_asm | -ace phrap_ace [-phd])
(-m bambus_mates | -x trace_xml | -f celera_frg [-acc])
(-arachne arachne_links | -scaff bambus_scaff)
(-i insert_file | -map dst_map)
(-pos pos_file)
(-id min_id)

toAmos reads the inputs specified on the command line and converts the information into AMOS message format. The following types of information can be provided to toAmos:

* Sequence and quality data (options -f, -s, -q, -gq, or -bq)
* Library and mate-pair data (options -m, -x, -f, -i, or -map)
* Contig data (options -c, -a, -ta, or -ace)
* Scaffold data (option -a)

== Options ==
{| class="somecssclass" border="1"
|-
| -o <out_file> || output filename ('-' for standard output)
|-
| -s <fasta_reads> || sequence data file in FASTA format (reads names ending in .1 or /1 are taken as mate pairs)
|-
| -q <qual_file> || sequence quality score file in QUAL format
|-
| -gq <good_qual> || minimum quality score for high-quality bases (default: 30) - if no quality file is provided, bases within the clear range are assigned this quality value
|-
| -bq <bad_qual> || maximum quality score for low-quality bases (default: 10) - if no quality file is provided, bases outside the clear range are assigned this quality value
|-
| -c <tigr_contig> || provide TIGR .contig file
|-
| -a <celera_asm> || use Celera Assembler .asm contig file (contig and scaffold information)
|-
| -S || include the surrogate unitigs in the .asm file as AMOS contigs
|-
| -utg || include all UTG unitig messages in the .asm file as AMOS contigs
|-
| -ta <tigr_asm> || contig file in TIGR Assembler format (.tasm)
|-
| -ace <phrap_ace> || contig file in Phred ACE format (can be accompanied by -q)
|-
| -phd || read the content of PHD file referenced in ACE files
|-
| -m <bambus_mates> || library and mate-pair information file in Bambus format
|-
| -x <trace_xml> || ancillary data file (library, mate-pair, clear range) in Trace Archive XML format
|-
| -f <celera_frg> || library, mate-pair, sequence, quality, and clear range data file in Celera Assembler format
|-
| -acc || use accession numbers in FRG files
|-
| -arachne <arachne_links> || scaffold file in Arachne .links format
|-
| -scaff <bambus_scaff> || scaffold file in Bambus .scaff format
|-
| -i <insert_file> || read insert information
|-
| -map <dst_map> || read map information - mapping from internal library ID to external library ID useful in conjunction with the -f option. This file consists of space-separated records providing a mapping from the "acc:" field in "DST" records within the .frg file to an externally recognizable name for each library.
|-
| -pos <pos_file> || TIGR-style .pos position file
|-
| -id <min_id> || start numbering contigs at this number
|-
|}

== TIGR specific options (not too useful outside TIGR) ==

* -i <insert file> - use mapping from internal library ID to external library ID provided in a .insert file produced by pullfrag.

== Known issues ==

The -ta (TIGR Assembler input) and -ace (ACE formatted input) options have not been thoroughly tested and likely do not work properly. Contact us if either of these options is important to you.

== Errors ==
toAmos -c my.test.contig -m my.test.mates -o my.test.afg
Cannot find ID for sequence lid05.f

This problem is caused by...

ToAmos

2010-12-14T18:02:47Z

Dmb000006: /* Options */

toAmos: converter from various types of inputs to AMOS messages

== Overview ==

toAmos is primarily designed for converting the output of an assembly program into the AMOS format so that it can be stored in an AMOS bank. toAmos can be used as a replacement for tarchive2amos however the latter is more flexible when converting from Trace Archive or simple .seq and .qual inputs.

== Synopsis ==

toAmos -o out_file
(-s fasta_reads (-q qual_file) (-gq good_qual) (-bq bad_qual))
(-c tigr_contig | -a celera_asm [-S][-utg] | -ta tigr_asm | -ace phrap_ace [-phd])
(-m bambus_mates | -x trace_xml | -f celera_frg [-acc])
(-arachne arachne_links | -scaff bambus_scaff)
(-i insert_file | -map dst_map)
(-pos pos_file)
(-id min_id)

toAmos reads the inputs specified on the command line and converts the information into AMOS message format. The following types of information can be provided to toAmos:

* Sequence and quality data (options -f, -s, -q, -gq, or -bq)
* Library and mate-pair data (options -m, -x, -f, -i, or -map)
* Contig data (options -c, -a, -ta, or -ace)
* Scaffold data (option -a)

== Options ==
{| class="somecssclass" border="1"
|-
| -o <out_file> || output filename ('-' for standard output)
|-
| -s <fasta_reads> || sequence data file in FASTA format (reads names ending in .1 or /1 are taken as mate pairs)
|-
| -q <qual_file> || sequence quality score file in QUAL format
|-
| -gq <good_qual> || minimum quality score for high-quality bases (default: 30) - if no quality file is provided, bases within the clear range are assigned this quality value
|-
| -bq <bad_qual> || maximum quality score for low-quality bases (default: 10) - if no quality file is provided, bases outside the clear range are assigned this quality value
|-
| -c <tigr_contig> || provide TIGR .contig file
|-
| -a <celera_asm> || use Celera Assembler .asm contig file (contig and scaffold information)
|-
| -S || include the surrogate unitigs in the .asm file as AMOS contigs
|-
| -utg || include all UTG unitig messages in the .asm file as AMOS contigs
|-
| -ta <tigr_asm> || contig file in TIGR Assembler format (.tasm)
|-
| -ace <phrap_ace> || contig file in Phred ACE format (can be accompanied by -q)
|-
| -phd || read the content of PHD file referenced in ACE files
|-
| -m <bambus_mates> || library and mate-pair information file in Bambus format
|-
| -x <trace_xml> || ancillary data file (library, mate-pair, clear range) in Trace Archive XML format
|-
| -f <celera_frg> || library, mate-pair, sequence, quality, and clear range data file in Celera Assembler format
|-
| -acc || use accession numbers in FRG files
|-
| -arachne <arachne_links> || scaffold file in Arachne .links format
|-
| -scaff <bambus_scaff> || scaffold file in Bambus .scaff format
|-
| -i <insert_file> || read insert information
|-
| -map <dst_map> || read map information - mapping from internal library ID to external library ID useful in conjunction with the -f option. This file consists of space-separated records providing a mapping from the "acc:" field in "DST" records within the .frg file to an externally recognizable name for each library.
|-
| -pos <pos_file> || TIGR-style .pos position file
|-
| -id <min_id> || start numbering contigs at this number
|-
|}

== TIGR specific options (not too useful outside TIGR) ==

* -i <insert file> - use mapping from internal library ID to external library ID provided in a .insert file produced by pullfrag.

== Known issues ==

The -ta (TIGR Assembler input) and -ace (ACE formatted input) options have not been thoroughly tested and likely do not work properly. Contact us if either of these options is important to you.

ToAmos

2010-12-14T18:02:01Z

Dmb000006: /* Options */ Updating and tabulating. please check!

toAmos: converter from various types of inputs to AMOS messages

== Overview ==

toAmos is primarily designed for converting the output of an assembly program into the AMOS format so that it can be stored in an AMOS bank. toAmos can be used as a replacement for tarchive2amos however the latter is more flexible when converting from Trace Archive or simple .seq and .qual inputs.

== Synopsis ==

toAmos -o out_file
(-s fasta_reads (-q qual_file) (-gq good_qual) (-bq bad_qual))
(-c tigr_contig | -a celera_asm [-S][-utg] | -ta tigr_asm | -ace phrap_ace [-phd])
(-m bambus_mates | -x trace_xml | -f celera_frg [-acc])
(-arachne arachne_links | -scaff bambus_scaff)
(-i insert_file | -map dst_map)
(-pos pos_file)
(-id min_id)

toAmos reads the inputs specified on the command line and converts the information into AMOS message format. The following types of information can be provided to toAmos:

* Sequence and quality data (options -f, -s, -q, -gq, or -bq)
* Library and mate-pair data (options -m, -x, -f, -i, or -map)
* Contig data (options -c, -a, -ta, or -ace)
* Scaffold data (option -a)

== Options ==
{|
|-
| -o <out_file> || output filename ('-' for standard output)
|-
| -s <fasta_reads> || sequence data file in FASTA format (read names ending in .1 or /1 are taken as mate pairs)
|-
| -q <qual_file> || sequence quality score file in QUAL format
|-
| -gq <good_qual> || minimum quality score for high-quality bases (default: 30) - if no quality file is provided, bases within the clear range are assigned this quality value
|-
| -bq <bad_qual> || maximum quality score for low-quality bases (default: 10) - if no quality file is provided, bases outside the clear range are assigned this quality value
|-
| -c <tigr_contig> || provide TIGR .contig file
|-
| -a <celera_asm> || use Celera Assembler .asm contig file (contig and scaffold information)
|-
| -S || include the surrogate unitigs in the .asm file as AMOS contigs
|-
| -utg || include all UTG unitig messages in the .asm file as AMOS contigs
|-
| -ta <tigr_asm> || contig file in TIGR Assembler format (.tasm)
|-
| -ace <phrap_ace> || contig file in Phred ACE format (can be accompanied by -q)
|-
| -phd || read the content of PHD file referenced in ACE files
|-
| -m <bambus_mates> || library and mate-pair information file in Bambus format
|-
| -x <trace_xml> || ancillary data file (library, mate-pair, clear range) in Trace Archive XML format
|-
| -f <celera_frg> || library, mate-pair, sequence, quality, and clear range data file in Celera Assembler format
|-
| -acc || use accession numbers in FRG files
|-
| -arachne <arachne_links> || scaffold file in Arachne .links format
|-
| -scaff <bambus_scaff> || scaffold file in Bambus .scaff format
|-
| -i <insert_file> || read insert information
|-
| -map <dst_map> || read map information - mapping from internal library ID to external library ID useful in conjunction with the -f option. This file consists of space-separated records providing a mapping from the "acc:" field in "DST" records within the .frg file to an externally recognizable name for each library.
|-
| -pos <pos_file> || TIGR-style .pos position file
|-
| -id <min_id> || start numbering contigs at this number
|-
|}

== TIGR specific options (not too useful outside TIGR) ==

* -i <insert file> - use mapping from internal library ID to external library ID provided in a .insert file produced by pullfrag.

== Known issues ==

The -ta (TIGR Assembler input) and -ace (ACE formatted input) options have not been thoroughly tested and likely do not work properly. Contact us if either of these options is important to you.

ToAmos

2010-12-14T17:48:13Z

Dmb000006: /* Synopsis */ Updating to latest synopsis... hope that's OK...

Bambus Manual

2010-12-13T11:40:28Z

Dmb000006:

{| align="right"
| __TOC__
|}

The output of most shotgun sequence assembly programs (such as TIGR Assembler, phrap, or CAP3) consists of a set of unrelated contigs, whose order and orientation along the chromosome are unknown. Scaffolding is the task of ordering and orienting these contigs by using additional information about their relative placement. Traditionally such information was identified from the pairing of reads from the opposite ends of an insert in double-barrelled shotgun experiments (see figure).

[[Image:scaffold image.gif]]

Besides clone mate information, homology data, physical maps, or gene synteny information can be used to derive relationships between contigs. BAMBUS is a program written to handle all such types of data in a generic fashion in order to build contig scaffolds. In order to account for the varied quality of linking information BAMBUS allows the user to specify a hierarchy that will consider the most reliable linking data first, then expand the scaffolds using less reliable data. This approach minimizes the effect of errors inherent to experimentally-derived linking information.

=== Algorithm overview ===

BAMBUS has two main modes of operation: hierarchical - when the links are considered in the order of their priorities, and standard - when all the links are considered at the same priority level. In the latter case, all links connecting two contigs are bundled together to form a "gap", or "edge" if you look at the problem from a graph theoretical point of view. Each link is checked for validity before being added to a bundle. Thus, a link is considered invalid due to a length constraint if it forces the two contigs to overlap. In other words, if we assume the largest possible size for the insert corresponding to the link, if the coordinates of the mate-pair within the contigs force the contigs to overlap, we discard the link assuming it is due to a misassembly (Note: this behaviour can be specified on a link-by-link basis). Links that pass this test are further checked for consistency. If they do not all agree in the relative orientations of the contigs they imply, a majority rule is used to retain only those links that agree both in orientation and length. Furthermore, the bundle, or edge, is retained if it contains at least two links (this parameter, redundancy, can actually be tuned by the user). After this step is complete, the resulting graph gets traversed twice, once to assign consistent orientations to all the contigs, and the second time to determine the order of the contigs along the chromosome. Note that inconsistencies in order are currently allowed since they can provide finishing teams with useful information. In a future version of BAMBUS we will add an option to allow the generation of unambiguous scaffolds as well. Currently you can generate such unambiguous scaffolds using the [[#Untangling scaffolds|untangle]] program.

== Installation ==

=== Prerequisites ===

* Install AMOS
* Perl 5.6 or later
* [http://www.cpan.org/ XML::Parser] perl module
* [http://www.cpan.org/ Config::IniFiles] perl module
* [http://www.research.att.com/sw/tools/graphviz/ GraphViz] package

=== Obtaining BAMBUS ===

BAMBUS 2.3 is available free of charge under the open-source Artistic License.

The Bambus source is freely available for download from the File Release Section of our SourceForge project page.

To receive information regarding new releases and developments, please subscribe to our moderated, low-traffic users' mailing list:

amos-users(at)lists(dot)sourceforge(dot)net

=== Actual installation ===

1. Find a directory where you want the software installed. For example /users/home/CoolUser

2. Place the tar file bambus-2.33.tar.gz in this directory

3. Unpack the distribution
% tar xvzf bambus-2.33.tar.gz

4. Go into the newly created directory
% cd bambus-2.33

5. Carefully read the documentation

6. Edit the file Makefile and change the value of BASEDIR to the correct installation path, for example:
BASEDIR = /users/home/CoolUser

7. Also check if the path for perl (in variable PERL) matches the one on your system. Several common options are:
PERL = /usr/local/bin/perl
PERL = /usr/bin/perl

8. Type 'gmake install' to have all the software installed. The executables will be in /users/home/CoolUser/bin/ and the documentation in /users/home/CoolUser/doc.

9. To get started you can try out the small test dataset provided in data/test_run by typing:

goBambus -c test.contig -m test.mates -o test-bambus

== Running BAMBUS ==

The input to Bambus consists of a set of links between contigs. These links can be inferred from the pairing of reads belonging to the same insert, from physical map data, or from alignment to another genome. To accommodate these types of sources of linking information, and to allow for further extensions, the input to Bambus is presented in a general purpose XML format.

The following sections describe how the XML file can be generated for each type of linking data. For more information see [[#.evidence.xml|.evidence.xml]].

Linking information is grouped into "libraries". All links within a library have similar parameters. In the case of mate-pair linking information the libraries match the standard definition. For other linking data, for example MUMmer links, the library grouping reflects specific characteristics of the linking data. For example all MUMmer links are grouped within a library called "MUMmer".

=== Command line options ===

Bambus consists of a collection of programs controlled by a script called goBambus. The execution is controlled by the following sets of parameters. To obtain a list of all parameters and basic help information you need to type:

goBambus -h
or
goBambus -help

==== Configuration options ====

* -C <conf_file> . This parameter specifies the configuration file used by Bambus. A configuration file is not required, however it is useful if you want to change the default parameters. See [[#The configuration file|the configuration file]] for a detailed description of the configuration information.

Whenever you run Bambus it will generate a default configuration file called default.conf. If you want to modify the parameters, you will need to rename this file and then edit it.

==== Input Options ====

* -a <asm_file> . Use a TIGR Assembler .asm file as source of linking information.
* -c <contig_file> . Use a GDE formatted .contig assembly file as source of linking information. The .contig files can be obtained either from TIGR Assembler output (by concatenating the contents of the .align directory) or by converting .ace files (created by phrap or Consed) using the [[ace2contig]] package.
* -x <xml_file>. Use additional XML formatted linking information. You can use the -x option multiple times. See [[#.evidence.xml|.evidence.xml]] for the XML file format.
* -mx <make_file>. Use a gmake compatible make file to specify how the additional XML information is generated. Each of the files created must end in ".xml". Moreover, only the final XML targets are allowed to end in ".xml" to prevent goBambus from getting confused. See [[#Using Makefiles|using Makefiles]] for a more detailed description on using makefiles.

==== Mate-pair Information ====

Assembly output (such as .asm and .contig files) does not generally contain any information about the pairing of reads coming from opposite ends of the same insert, nor about the membership of inserts to libraries. The following parameters specify how to obtain this mate-pair information.

* -m <mates_file> . The mates file allows you to specify both library information and mate pairing information, using a system based on Perl regular expressions. See [[#The .mates file|the .mates file]].
* -D <database> . At TIGR, instead of a mates file you can opt to use a database to obtain library and mate information. Note that the -m and -D options are complementary. The following three parameters specify database options:
* -S <server> . Select a specific database server
* -U <user> . Log in with the specified user name
* -P <passwd> . Log in with the specified password

==== Output Options ====

* -o <output_prefix> . All the output file names will be generated from the <output_prefix> prefix.

==== Repeat Screening ====

Misassembled repeats can confuse Bambus, therefore it makes sense to screen the known repeats that may be misassembled. Bambus can use two types of repeat screening information:

* -r . Screen against the repeats listed in the database (option -D must be provided)
* -r <repfile> . Screen against repeats described in [[#Repeat files|repeat files]].

==== Flow-control ====

* -start <num>
* -end <num>. The execution of Bambus consists of a set of steps. The user can choose to start or end at a specific step, for example in the case when she modifies the configuration file and wants to re-run just the final steps of the scaffolders. This feature should be used with caution.

=== Typical use cases ===
==== Using assembler output ====

For convenience, Bambus can use the output of an assembly program and automatically generate the required linking information. Two types of information are required:

* the tiling of reads in the assembly
* information about the pairing of reads

The first type of information can be obtained from the .asm or .contig output formats of TIGR Assembler. For more information about these file formats please refer to the [http://www.jcvi.org/cms/publications/listing/abstract/article/tigr-assembler-a-new-tool-for-assembling-large-shotgun-sequencing-projects/ TIGR Assembler documentation]. Note that the .contig file may be stripped of all sequence data as the only information used is that contained in the lines starting with #. In case you are using an assembler other than TIGR Assembler you may need to use the ta2ace package to convert the output into the .contig format.

The second type of information can be obtained from a [[#The .mates file|.mates file]], or from the database in case you are at TIGR.

A typical command line for someone outside of TIGR is:

goBambus -c test.contig -m test.mates -o test

and the execution will generate the following files:

* [[#.stats file|test.stats]] - statistics on scaffolds and libraries
* [[#.details file|test.details]] - detailed information about each contig pair (also called "gap").
* [[#.dot file|test.dot]] - GraphViz formatted description of the contig linkage information.
* [[#.evidence.xml|test.evidence.xml]] - XML representation of all the linking evidence provided.
* [[#.out.xml|test.out.xml]] - XML representation of the scaffolds.
* [[#.ps file|test.lib]] - list of the codes associated with each input library. These codes are reported on the links in the output file.
* [[#.sum file|test.sum]] - one-line summaries of all scaffolds.
* [[#.oo file|test.oo]] - order and orientation information for all the contigs.

Additionally, the program generates some "working" files that are currently not removed as they provide useful debugging information.

* test.detective.xml - linking information derived from assembler input alone. It's the same as test.evidence.xml unless additional XML files are provided in the input.
* test.inp - input to the core scaffolding engine.
* test.grommit.conf - configuration information for the scaffolding engine

To view the graphical output you need to convert it to Postscript with the command:

dot -Tps -o test.ps test.dot

Then you can view the postscript file with the command:

gv test.ps

At TIGR, instead of specifying a .mates file you can use the database:

goBambus -c test.contig -D gbx -U access -P access -o test

==== Getting more (or less) information from the output ====

You can modify the information you get in the output by directly calling the printScaff command. PrintScaff requires as inputs the .evidence.xml file, the .out.xml file and the .lib file produced by Bambus, the minimal invocation being:

printScaff -e test.evidence.xml -s test.out.xml -l test.lib

Optionally you may specify the output prefix with option -o (just like in the Bambus invocation). PrintScaff has parameters that control what gets reported, and in what format. In the first category you have:

* -dot - produce a .dot file
* -detail - produce a .details file
* -oo - produce a [[#.oo file|.oo]] file listing all the contigs in each scaffold
* -sum - produce a [[#.sum file|.sum]] tab delimited list of scaffold stats (#contigs, size, and span)
* -f <fasta_file> - generate a pseudo-molecule for each scaffold using the contig sequences listed in <fasta_file>.
Note that the contigs in the scaffold files are named "contig_<id>" while the contigs in the <fasta_file> must simply be called "<id>". Note that this option together with the default -merge option (see below) is meaningless unless you've [[#Untangling scaffolds|untangled]] the scaffold.
* -phys - create a .phys file listing all "gaps" spanned only by the specified libraries:

printScaff -e test.evidence.xml -s test.out.xml -l test.lib -phys MUMmer

will generate a file called test.phys that contains all contig pairs linked by nothing but MUMmer links.

In the second category you have:

* -page - produce .dot file formatted for printing on 8.5x11" paper
* -plot - produce .dot file formatted for printing on a plotter (36x48" paper)
* -unused - draw edges corresponding to unused links
* -merge - (default) when the -f option is given, produce a pseudo-molecule for each scaffold by adding 60 N characters between the contigs in the scaffold
* -nomerge - when the -f option is given, create a fasta file containing all contigs in each scaffold in the correct orientation

==== Adding additional XML linking information ====

You can easily add additional linking information as long as it's in a format similar to the [[#.evidence.xml file|.evidence.xml file]]. Assuming we have two such files "link1.xml" and "link2.xml" you can call Bambus as follows:

goBambus -c test.contig -m test.mates -x link1.xml -x link2.xml -o test

The result being the same as if you concatenated the XML file inferred from the .contig file together with the two additional XML files.

==== Using MUMmer links ====

In case you are sequencing a genome for which a close relative has been completed, you can use [[MUMmer]] to infer links between contigs. Assume you have the reference genome in a file "ref.fasta" and all the contigs from the genome you are assembling in a file called "test.fasta", you can run MUMmer to align the test genome to the reference:

nucmer -maxmatch ref.fasta test.fasta

The output will be placed in a file called out.delta. You can then convert the alignment information into an XML file suitable for Bambus as follows:

show-tiling -x out.delta > test.mum.xml

And finally you can provide this information to Bambus:

goBambus -c test.contig -m test.mates -x test.mum.xml -o test

==== Using Makefiles ====

If you have multiple XML files, it becomes tedious to generate each file separately and then provide them to Bambus with multiple -x options. You can create a Makefile compatible with GNU make that generates each of the XML files. You must be careful that the targets for all the XML files end in ".xml", and no other targets (such as intermediate targets) have the same suffix. Bambus will assume that all .xml files produced by the Makefile must be added to the input.

An example of a simple Makefile that can generate the mummer information described above is:

test.mum.xml: ref.fasta test.fasta
nucmer -maxmatch ref.fasta test.fasta
show-tiling -x out.delta > test.mum.xml

==== Untangling scaffolds ====

By default, the scaffolds resulting from Bambus are potentially ambiguous as two or more contigs may occupy the same place in the genome. Such situations occur either due to misassembled repeats, or when assembling different haplotypes. Bambus contains a utility that does a best effort attempt to disambiguate the scaffolds, by breaking them into a set of non-ambiguous scaffolds. The algorithm is greedy and does not guarantee that an optimal solution (e.g. a minimum number of longest scaffolds) is produced.

To run the untangler type:

untangle -e test.evidence.xml -s test.out.xml -o test.untangle.xml

The test.untangle.xml file has the same format as the output file ([[#.out.xml|.out.xml]]) and for each scaffold (e.g. scaff_1) contains one or more unambiguous scaffolds called "scaff_1_1, scaff_1_2, etc". You can use the [[#Getting more (or less) information from the output|printScaff]] command to process the untangled file the same way you would process a normal Bambus output.

== File formats ==

=== The configuration file ===

Scaffolding parameters can be specified in a configuration file. Whenever you run Bambus a default configuration file is generated as default.conf. To create your own file you should rename this file then edit it. The following types of information can be specified:

* Comments. Lines starting with # are considered comments and are ignored by the software.

* Priority information. Specifies an order in which the libraries are considered. For example, the following line specifies priority 2 for all MUMmer links. The libraries are considered in the increasing order of priorities.

priority MUMmer 2

If no configuration file is provided, all libraries are assumed to have the same priority. If a configuration file is provided, only those libraries that have a priority record are used in scaffolding. In other words, if a library has no priority record associated with it, it will not be used in scaffolding.

A priority can also be specified for a specific redundancy parameter. For example, the following line specifies that those links between contigs supported by 4 or more sources of linking data should be processed at priority level 1.

priority redundancy 4 1

Note, however, that you need to also specify a set of libraries to be used at this priority level. A simple shortcut is specifying that all link classes should be processed at this redundancy.

priority ALL 1

If you are uncertain of which libraries your project uses, run Bambus without a configuration file. The libraries will be listed in a file ending in ".lib".

* Redundancy information. Specifies the minimum number of links required for a valid inter-contig link. You can specify this parameter as a global value:

redundancy 2

that is, all contigs must be linked by at least two links. You can also specify a per-library value like below which says a single MUMmer link is required to link two contigs.

redundancy MUMmer 1

* Minimum scaffold size. Specifies a scaffold size cutoff. For example, the following line implies that only scaffolds that contain more than 10kbp are present in the output:

mingroupsize 10000

* Link size error. For some link types, determining the exact link length can only be done within a certain error. This parameter specifies, as a percentage, the estimated error in size determination. For example, since MUMmer links make an assumption of evolutionary closeness between two genomes, the error associated with them should be proportional to the estimated evolutionary distance (as nucleotide % similarity) between the two genomes. For an error of 5% use this parameter:

error MUMmer 0.05

* Overlapping contigs allowed. For each library you can control the link validation process by specifying whether adjacent contigs are allowed to overlap or not. By default mate-pair links assume adjacent contigs do not overlap, while MUMmer links allow contigs to overlap. This is equivalent to the configuration parameters:

overlaps lib_1 N
overlaps MUMmer Y

An example is provided in [[bambus.conf|data/sample_files/bambus.conf]].

=== The .mates file ===

The .mates file provides two types of information: library data, and mate-pair relationships between reads.

Library data can be described in two formats:

library <name> <min_size> <max_size>

and

library <name> <min_size> <max_size> <regexp>

Both formats require you to name each library and to provide a size range for the inserts belonging to it. The second format allows you to also provide a Perl regular expression that describes the naming convention for reads belonging to the library. The part of the sequence name that represents the library name must be placed within parentheses. As an example, the regular expression for TIGR sequences (where the library is specified by the first 4 characters) is:

(....).*

Mate-pair relationships can also be described in two ways:

pair <regexp_forw> <regexp_rev>

or

<seq_forw> <seq_rev> <library_name>

The first format requires two regular expressions corresponding to the forward and reverse mates of an insert. Just like the library record, the portion of the name corresponding to the insert name must be placed within parentheses. Two reads that match the two regular expressions, and have the exact same section matched within the parentheses, will be considered mates. The library will be determined from the regular expression associated with a "library" record. As an example, at TIGR the first 7 characters represent the insert, then are followed by an optional T or P and the primer name (F/R). The corresponding regular expression is:

pair (.......)[TP]?F (.......)[TP]?R

The second format is simply a list of sequence name pairs each followed by the name of the library they belong to. The corresponding library records must be listed in the file before the pairing data.

Note that fields in the .mates file must be separated by TAB characters otherwise the program will report an error.

An example of a .mates file using the phred/phrap naming convention is provided in [[bambus.mates|data/sample_files/bambus.mates]].

=== Repeat files ===

The repeat files used in screening linking data conform to the output standards of the repeatFinder program. The file contains 5 TAB-delimited values: contig ID, repeat name, left and right coordinates within the contig, and contig class. Only the first 4 values are essential as far as Bambus is concerned. Here is an example of repeat records:

AC009139.7.3 RPT1A 14554 96820 1
AC026498.3.2 RPT1B 124208 206475 1

=== .evidence.xml ===

This provides an XML representation of the input file. Below is a description of the data presented in this file (see also an [[bambus.evidence.xml|example]]):

The overall flow of the XML file is:

<EVIDENCE>
<LIBRARY>
<INSERT>
<SEQUENCE/>
<SEQUENCE/>
</INSERT>
...
</LIBRARY>
...
<CONTIG>
<SEQUENCE/>
...
</CONTIG>
...
<LINK>
<CONTIG/>
<CONTIG/>
</LINK>
...
</EVIDENCE>

<EVIDENCE this is the parent tag - the whole document occurs within <EVIDENCE> and </EVIDENCE>
ID = "1" some identifier for the file
DATE = "12/15/03" date when file was created
PROJECT = "MyProject" verbose description of the project
PARAMETERS = "" parameters used in creating this file
>

<LIBRARY for each shotgun library you need a separate library tag.
ID = "lib_1" some identifier for the library
NAME = "short" verbose name for the library
MIN = "1200" minimum size of inserts in this library
MAX = "3500" maximum size of inserts in this library
>

<INSERT the insert concept links together the two reads obtained from opposite ends of a clone insert
ID="ins_1" generic identifier for the insert
NAME="GALBZ92" verbose name for the insert
>

<SEQUENCE each insert contains two sequence reads obtained from the opposite ends
ID="seq_1" generic identifier for the sequence
NAME="GALBZ92TF" verbose name for the sequence
>

<CONTIG Each contig in the assembly needs to be represented in the file
ID="contig_1" contig identifier
NAME= "1" contig name (usually the same as the identifier)
LEN="12352" contig length in basepairs
>

<SEQUENCE each sequence in the contig is listed together with information about the position within the contig.
ID="seq_1" sequence identifier (must match one described in the library section)
ORI="BE" orientation of sequence: BE - forward, EB - reverse
ASM_LEND="0" coordinate of sequence's left end within the contig
ASM_REND="525" coordinate of sequence's right end within the contig
>

<LINK this is a generic link between two contigs (i.e. not inferred from mate-pair data)
ID = "link_1" link identifier
SIZE = "-800" size of gap between the two contigs
TYPE = "MUMmer" link type: links with the same type get grouped into a virtual "library" used in specifying priorities
>

<CONTIG each link contains two contigs
ID = "contig_1" contig identifier: must match one described in the contig section above
ORI = "EB" contig orientation: BE - forward, EB - reverse
> each contig record may contain free-form data specifying the evidence for linking: e.g. alignment data

=== .out.xml ===

Represents, together with the evidence file, a description of the layout of the contigs. For each scaffold, the order (given as a coordinate along a chromosome) and the orientation of each contig are presented, together with a list of all the links used to generate this layout. Some links are deemed invalid, being given a code of "LEN" in case the length was deemed incorrect, or "ORI" in case the link orientation was considered incorrect. The file format is specified by the [[bambus.dtd|DTD]]; see also the example ([[bambus.out.xml|data/sample_files/bambus.out.xml]]).

=== .stats file ===

Finally, BAMBUS outputs a summary of the scaffolds generated. This file is pretty much self-explanatory, as evidenced by this example ([[bambus.stats|data/sample_files/bambus.stats]]). Note that in the .stats file, the N50 sizes are computed with respect to the total span of the scaffolds unless a different genome size is specified in a file called genome.size.

=== .details file ===

The .details file contains detailed information about the linking information between adjacent contigs. The contigs' orientation, size, and coordinates are listed and then all the linking data grouped by validity and library. An example is provided in [[bambus.details|data/sample_files/bambus.details]].

=== .dot file ===

Contains a graphical representation of the scaffolds in GraphViz format. Please see the [http://www.research.att.com/sw/tools/graphviz/ AT&T GraphViz] website for more information on this file format.

=== .ps file ===

A postscript image generated from the GraphViz-formatted file. It can be obtained from the .dot file with the command:
dot -Tps -o prefix.ps prefix.dot

An example is given in the picture below.
[[Image:bambus-display.jpg]]

Each scaffold is placed in a box, labeled with some statistics on the scaffold size (number of contigs, number of bases and span). All the "gaps" - linking relationships between the contigs are represented as edges. Each edge is decorated with the number of links contributing to it, the number of links from each library type, and the number of links invalidated due to incorrect length (L) or orientation (O). Each contig's ID, size, and coordinates within the scaffold are also listed.

=== .oo file ===

This file contains a summary of the order and orientation of all contigs present in the data. Each scaffold starts with a FASTA-like header containing the identifier for the scaffold followed by the number of contigs, size and span. Within each scaffold, all contigs are listed in the scaffold order, followed by the string BE for those in the forward orientation and EB for those in the reverse orientation. An example file is shown in: [[bambus.oo|data/sample_files/bambus.oo]].

=== .sum file ===

This file contains one line for each scaffold. Each line contains the scaffold ID, number of contigs, scaffold size, and span. The four values are separated by TAB characters. See an example in [[bambus.sum|data/sample_files/bambus.sum]].

== Known problems ==
There is a small "off-by-one" error in computing contig coordinates. It should not really affect the usefulness of the output.

== Contact information ==

BAMBUS is currently provided AS-IS, in other words we do not provide any support for the software. We would, however, like to hear your comments and suggestions. For Bambus bug reports, support requests, or any other inquiries please browse our SourceForge project page or Email us at:

amos-help (at) lists (dot) sourceforge (dot) net

BAMBUS was written by Mihai Pop and Dan Kosack.

Minimus2

2009-12-03T18:46:41Z

Dmb000006:

minimus2 is a modified version of the minimus pipeline designed for merging one or two sequence sets (S1,S2). It uses a nucmer based overlap detector which is much faster than the Smith-Waterman hash-overlap program used by minimus.

Usage:

minimus2 prefix \
-D REFCOUNT=n \ # Number of sequences in the first set
-D OVERLAP=n \ # Minimum overlap (Default 40bp)
-D CONSERR=f \ # Maximum consensus error (0..1) (Def 0.06)
-D MINID=n \ # Minimum overlap %id for align. (Def 94)
-D MAXTRIM=n # Maximum sequence trimming length (Def 20bp)

prefix is the base name of an [[AFG format]] file.

REFCOUNT should be set to the number of sequences in the first set in order to align one set against the other (S1:S2). By default REFCOUNT=0 and an all vs all alignment is run (S1+S2:S1+S2 - same as minimus).
Example:
Let's say we have 2 sets (S1 & S2). There are 917 sequences in S1 and 1668 in S2.

grep -c "^>" S1.seq S2.seq
S1.seq:917
S2.seq:1668

The sets should be merged and converted to AMOS format:

cat S1.seq S2.seq > S1-S2.seq
toAmos -s S1-S2.seq -o S1-S2.afg

Then minimus2 should be run on the merged set:

minimus2 S1-S2 -D REFCOUNT=917

Input:

S1-S2.afg : AMOS message file that contains RED/FRG messages for all the reads in the two datasets.

Output:

S1-S2.fasta : contig sequences
S1-S2.singletons.seq : singleton sequences

Note: This pipeline has been introduced to the AMOS package starting with the release 2.0.8. If you have an older version of the AMOS package installed, it is highly recommended to upgrade it to the latest version.

Alternatively, the new file could be manually downloaded and installed from the following location: [http://amos.cvs.sourceforge.net/*checkout*/amos/AMOS/src/Pipeline/minimus2.acf minimus2]

Figaro

2009-10-28T17:35:33Z

Dmb000006: Added a link to the download site. Hello again btw :-)

{| align="right"
| [[Image:FigaroLogo.png]]
|}

Figaro is a software tool for identifying and removing the vector from raw DNA sequence data without prior knowledge of the vector sequence. By statistically modeling short oligonucleotide frequencies within a set of reads, Figaro is able to determine which DNA words are most likely associated with vector sequence. For a description of Figaro's algorithms please see our [http://bioinformatics.oxfordjournals.org/cgi/content/full/24/4/462 paper]. You may download Figaro individually, or as part of the [http://sourceforge.net/project/showfiles.php?group_id=134326 AMOS package at SourceForge].

== Contributors. ==
* [http://www.cbcb.umd.edu/~whitej james robert white]
* michael roberts
* [http://www.cbcb.umd.edu/~mpop mihai pop]
* [http://yorke.umd.edu/ james yorke]

== Requirements ==

Figaro is released as C++ and Perl source code and should work on any Unix system. We strongly encourage users to quality trim their data as well using a program such as Lucy. Lucy can be downloaded [http://sourceforge.net/project/showfiles.php?group_id=134326 here].

== Documentation and Data ==
* [[Figaro User Manual]] - In depth description of how to run
* [[Figaro Simulated Data]] - Simulated data discussed in our paper.

== Keywords. ==
vector trimmer, vector clipping, vector trimming, open source, AMOS.

Minimus/README

2009-07-31T09:30:10Z

Dmb000006: /* Output */ Fixed a documentaion bug reported by Mark Miller

minimus - The AMOS Lightweight Assembler

== Brief Summary ==
minimus is an assembly pipeline designed specifically for small
data-sets, such as the set of reads covering a specific gene. Note that
the code will work for larger assemblies (we have used it to assemble
bacterial genomes), however, due to its stringency, the resulting assembly
will be highly fragmented. For large and/or complex assemblies the execution
of Minimus should be followed by additional processing steps, such as
scaffolding.

Minimus follows the Overlap-Layout-Consensus paradigm and consists of
three main modules:

* overlapper - computes the overlaps between the reads using a modified version of the Smith-Waterman local alignment algorithm

* tigger - uses the read overlaps to generate the layouts of reads representing individual contigs

* make-consensus - refines the layouts produced by the tigger to generate accurate multiple alignments within the reads

==Dependencies==
None.

==Running==
Either execute the minimus configuration script directly from
$bindir OR copy it to your local directory, edit it, and run it with
the `runAmos' command interpreter. The following variables must be set
on the command line or added to the script for the pipeline to operate
properly:

TGT - The target genome sequences in AMOS message format

`minimus -D TGT=<target> <prefix>'
OR
`runAmos -C minimus -D TGT=<target> <prefix>'

Where <prefix> will be the output file prefix, and <target> is the
input AMOS message file. Check the `runAmos' documentation or type
`runAmos --help' for details on operating an AMOS pipeline. The
minimus pipeline config file can be easily modified by the user to add
additional processing steps.

In order to run minimus you need to provide an AMOS formatted file
of the reads. Such a file (commonly with extension .afg) can be
generated from a combination of sequence (.seq), quality (.qual), and
Trace Archive XML (.xml) files using the `tarchive2amos' program which
will appear in the $bindir directory upon installation.

The default TGT file is <prefix>.afg, thus if our input file is
<prefix>.afg we can run minimus simply by typing:

`minimus <prefix>'

== Output ==
Output will be a TIGR .contig file and a FastA .fasta file. The
TIGR contig file contains the gapped consensus and multi-alignment
information for the assembly. Each contig sequence is preceded by a
header line which starts with '##', followed by the gapped consensus
sequence with gaps represented as a '-' character. Following the
consensus is the gapped read sequence preceded by a header line
beginning with '#'. The .fasta file contains all the contigs produced
by minimus in a multi-FastA formatted file. These sequences will match
the sequences in the .contig file, but without the gaps.

To obtain an ACE format representation of the assembly, we can run
the following to obtain a <prefix>.ace file:

`bank-report -b <prefix>.bnk CTG > <prefix>.ctg'
`amos2ace <prefix>.afg <prefix>.ctg'

Where <prefix> is the same as was used in the above section and
<prefix>.afg is the original input to the assembly pipeline. We can
simply add these commands to the runAmos config file to produce an ACE
file every time we run minimus.

==Example==
Assume we have a set of Trace Archive data with the names
`target.seq', `target.qual' and `target.xml' which contain the
sequence information for a small assembly task. To run the minimus
pipeline and generate the default output, we would type the following:

`tarchive2amos -o target target.seq'
`minimus -D TGT=target.afg target'

This will generate the default output named `target.contig' and
`target.fasta'. We could then generate an ACE assembly format file by
following the instructions in the above section, substituting "target"
for "<prefix>".

Minimus is now packaged with two example assemblies. The two examples
are an Influenza A assembly and a Zebra Fish Gene assembly under the 'test'
directory. The 'test' directory is located in the main AMOS directory after you untar
the AMOS tarball.

Minimus

2009-07-31T08:56:12Z

Dmb000006: /* Examples */ For me, having a simple example to quickly refer to while working at the cmd line is essential. I added a simple example here, along with a few questions!

== Overview ==

minimus is an assembly pipeline designed specifically for small data-sets, such as the set of reads covering a specific gene. Note that the code will work for larger assemblies (we have used it to assemble bacterial genomes), however, due to its stringency, the resulting assembly will be highly fragmented. For large and/or complex assemblies the execution of Minimus should be followed by additional processing steps, such as scaffolding.

minimus follows the Overlap-Layout-Consensus paradigm and consists of three main modules:

* [[hash-overlap]] - computes the overlaps between the reads using a modified version of the Smith-Waterman local alignment algorithm
* [[tigger]] - uses the read overlaps to generate the layouts of reads representing individual contigs
* [[make-consensus]] - refines the layouts produced by the tigger to generate accurate multiple alignments within the reads

minimus uses AMOS messages as both its inputs and its outputs. Please see the [[File conversion utilities]] documentation for more information.

[[minimus2]] is a modified version of the minimus pipeline designed for merging two sequence sets. Instead of hash-overlap it uses a nucmer based overlap detector which is much faster.

== Documentation ==

Documentation on running minimus is included with the distribution in the /docs subdirectory.

See [[Minimus/README]].

== Examples ==

Examples of a flu assembly and a Zebrafish gene can be found in the test/minimus directory created when the AMOS distribution is untarred. Documentation on the examples is included with the distribution in /docs/minimus.README.

== Basic usage example ==

Assume you have a set of reads in fasta format called '''my_reads.fasta''', and an associated set of read quality scores (in the same order?) called '''my_reads.qual'''. (Note that, in general, Amos expects Phred style quality scores ????)

toAmos \
-s my_reads.fasta \
-q my_reads.qual \
-o my_reads.afg

runAmos -C $AMOSBASE/src/Pipeline/minimus.acf my_reads

...

hawkeye my_reads.bnk/

== Publication ==

[http://www.biomedcentral.com/1471-2105/8/64 Minimus: a fast, lightweight genome assembler]

Sommer, DD, Delcher, AL, Salzberg, SL, and Pop, M. (2007) BMC Bioinformatics, 8:64. doi:10.1186/1471-2105-8-64.

== Acknowledgements ==
The development of minimus was supported by the National Institutes of Health under grants R01-LM06845 and R01-LM007938 to SLS and by Department of Homeland Security cooperative agreement W81XWH-05-2-0051.

Minimus/README

2009-07-31T08:47:46Z

Dmb000006: /* Example */ Hey hey!

minimus - The AMOS Lightweight Assembler

== Brief Summary ==
minimus is an assembly pipeline designed specifically for small
data-sets, such as the set of reads covering a specific gene. Note that
the code will work for larger assemblies (we have used it to assemble
bacterial genomes), however, due to its stringency, the resulting assembly
will be highly fragmented. For large and/or complex assemblies the execution
of Minimus should be followed by additional processing steps, such as
scaffolding.

Minimus follows the Overlap-Layout-Consensus paradigm and consists of
three main modules:

* overlapper - computes the overlaps between the reads using a modified version of the Smith-Waterman local alignment algorithm

* tigger - uses the read overlaps to generate the layouts of reads representing individual contigs

* make-consensus - refines the layouts produced by the tigger to generate accurate multiple alignments within the reads

==Dependencies==
None.

==Running==
Either execute the minimus configuration script directly from
$bindir OR copy it to your local directory, edit it, and run it with
the `runAmos' command interpreter. The following variables must be set
on the command line or added to the script for the pipeline to operate
properly:

TGT - The target genome sequences in AMOS message format

`minimus -D TGT=<target> <prefix>'
OR
`runAmos -C minimus -D TGT=<target> <prefix>'

Where <prefix> will be the output file prefix, and <target> is the
input AMOS message file. Check the `runAmos' documentation or type
`runAmos --help' for details on operating an AMOS pipeline. The
minimus pipeline config file can be easily modified by the user to add
additional processing steps.

In order to run minimus you need to provide an AMOS formatted file
of the reads. Such a file (commonly with extension .afg) can be
generated from a combination of sequence (.seq), quality (.qual), and
Trace Archive XML (.xml) files using the `tarchive2amos' program which
will appear in the $bindir directory upon installation.

The default TGT file is <prefix>.afg, thus if our input file is
<prefix>.afg we can run minimus simply by typing:

`minimus <prefix>'

== Output ==
Output will be a TIGR .contig file and a FastA .fasta file. The
TIGR contig file contains the gapped consensus and multi-alignment
information for the assembly. Each contig sequence is preceded by a
header line which starts with '##', followed by the gapped consensus
sequence with gaps represented as a '-' character. Following the
consensus is the gapped read sequence preceded by a header line
beginning with '#'. The .fasta file contains all the contigs produced
by minimus in a multi-FastA formatted file. These sequences will match
the sequences in the .contig file, but without the gaps.

To obtain an ACE format representation of the assembly, we can run
the following to obtain a <prefix>.ace file:

`bank-report -b <prefix>.bnk CTG > <prefix>.ctg'
`amos2ace <prefix>.afg <prefix>.ctg'

Where <prefix> is the same as was used in the above section and
<prefix>.afg is the original input to the assembly pipeline. We can
simply add these commands to the runAmos config file to produce an ACE
file every time we run minimus.

==Example==
Assume we have a set of Trace Archive data with the names
`target.seq', `target.qual' and `target.xml' which contain the
sequence information for a small assembly task. To run the minimus
pipeline and generate the default output, we would type the following:

`tarchive2amos -o target target.seq'
`minimus -D TGT=target.afg target'

This will generate the default output named `target.contig' and
`target.fasta'. We could then generate an ACE assembly format file by
following the instructions in the above section, substituting "target"
for "<prefix>".

Minimus is now packaged with two example assemblies. The two examples
are an Influenza A assembly and a Zebra Fish Gene assembly under the 'test'
directory. The 'test' directory is located in the main AMOS directory after you untar
the AMOS tarball.