#!/usr/bin/ruby -w
#
# Composition Profile
#
# Vladimir Vacic, University of California, Riverside
# Predrag Radivojac, Indiana University, Bloomington
# A. Keith Dunker, Indiana University School of Medicine, Indianapolis
# Stefano Lonardi, University of California, Riverside
#
# Nov-15-2006


require "./AminoAcid.rb"
require "./Fasta.rb"
require "./Util.rb"


#
# Reports a fatal problem and terminates the script.
#
# Writes "ERROR: <message>" followed by the usage text to standard error and
# then exits with status 1, so that shells and calling scripts can tell the
# run failed (a bare `exit` would report success).
#
# message - text describing what went wrong.
#
# Never returns; raises SystemExit.
#
def error(message)
    $stderr.print "ERROR: #{message}\n"

usage =<<END

Usage: cdiscover.rb -Q <query file> [options]
Looks for statistically significant composition differences between two sets.

Mandatory arguments:
  -Q <query file>

Optional arguments:
  -B <background file>
     or
  -D <known distribution>    One of the following:
                             disprot       Disordered regions from DisProt 3.4
                             pdbs25        PDB Select 25
                             sprot         Proteins from SwissProt
                             surface       Surface residues of monomers from PDB
                             Defaults to sprot.

  -A <alpha value>           Significance value for the statistical test.
                             Defaults to 0.05. 

  -b                         Bonferroni correction.
                             Off by default.

END
    $stderr.print usage
    # Non-zero status: this path is only taken on failure.
    exit 1
end


#
# Reads the command-line options through Util.getopts and validates them.
#
# Every failed check is reported through error(). On success returns the
# option hash with:
#   'D' set to 'sprot' when no distribution was named,
#   'A' holding the alpha value as a Float (0.05 when not given),
#       divided by 40 when -b was supplied.
#
def init
    opts = Util.getopts('Q:B:D:A:b')

    query_file      = opts['Q']
    background_file = opts['B']
    distribution    = opts['D']
    alpha           = opts['A']

    # The query sample is mandatory and has to exist on disk.
    if query_file.nil?
        error("Query file not specified, terminating.")
    elsif !File.exist?(query_file)
        error("Could not open query sample file #{query_file}.")
    end

    # -B and -D are alternative background sources; naming both is rejected.
    unless background_file.nil? || distribution.nil?
        error("Both background file and background distribution selected.")
    end

    # A background file, when named, has to exist as well.
    unless background_file.nil? || File.exist?(background_file)
        error("Could not open background distribution file #{background_file}.")
    end

    # Either default the distribution or make sure the given name is known.
    if distribution.nil?
        opts['D'] = 'sprot'
    elsif !AminoAcid::COMPOSITION.keys.include?(distribution)
        error("Unknown background distribution #{distribution}.")
    end

    # Alpha must convert to a strictly positive number.
    if alpha.nil?
        opts['A'] = 0.05
    elsif alpha.to_f > 0
        opts['A'] = alpha.to_f
    else
        error("Alpha value #{alpha} has to be a positive number.")
    end

    # Bonferroni correction with a fixed divisor of 40.
    # NOTE(review): 40 is presumably the number of tests reported -- confirm.
    opts['A'] = opts['A'] / 40 unless opts['b'].nil?

    opts
end


#
# Utility function for determining and displaying significance.
#
# Prints one report line for a set of amino acids.
#
# property    - label printed in the first column (left-justified to 30).
# pattern     - string whose characters are the amino acid symbols to pool.
# alpha_value - significance threshold the P-value is compared against.
#
# Pools the counts of the pattern's symbols from $query_counts and
# $back_counts, passes the pooled counts and both totals to the external
# ./pvalue program, and prints the verdict together with the P-value.
def print_significance(property, pattern, alpha_value)
    query_hits = 0
    back_hits  = 0

    pattern.each_char do |symbol|
        query_hits += $query_counts[symbol]
        back_hits  += $back_counts[symbol]
    end

    # The program's output is converted with to_f.
    p_value = `./pvalue #{query_hits} #{$query_total} #{back_hits} #{$back_total}`.to_f

    print property.ljust(30)

    unless p_value <= alpha_value
        printf("Not significant. P-value=%6f (> %6f)\n", p_value, alpha_value)
        return
    end

    # Significant: the direction follows from comparing the two fractions.
    query_share = 1.0 * query_hits / $query_total
    back_share  = 1.0 * back_hits / $back_total

    verdict = if query_share > back_share
                  "Enriched.     "
              elsif query_share < back_share
                  "Depleted.     "
              else
                  "No difference."
              end

    print verdict
    printf("   P-value=%6f (<=%6f)\n", p_value, alpha_value)
end


############################################################

opts = init()

#
# Counts how often each amino acid listed in AminoAcid::ALL_1 occurs across
# all sequences of a FASTA file. Symbols not listed in AminoAcid::ALL_1 are
# skipped.
#
# fasta_path - path of the FASTA file, handed to Fasta.read.
#
# Returns a Hash mapping every entry of AminoAcid::ALL_1 to its count.
#
def count_amino_acids(fasta_path)
    counts = Hash.new
    AminoAcid::ALL_1.each { |aa| counts[aa] = 0 }

    Fasta.read(fasta_path).each do |record|
        # NOTE(review): assumes record.sequence is a String -- confirm in Fasta.rb.
        record.sequence.each_char do |symbol|
            # Explicit membership test; unknown symbols are ignored rather
            # than triggering (and rescuing) a failed nil + 1.
            counts[symbol] += 1 if counts.key?(symbol)
        end
    end

    counts
end

#
# QUERY
#
$query_counts = count_amino_acids(opts['Q'])
$query_total  = AminoAcid::ALL_1.inject(0) { |sum, aa| sum + $query_counts[aa] }

#
# BACKGROUND
#
# Without a background file, the counts stored under the selected
# distribution name are used; otherwise the background file is counted the
# same way as the query.
$back_counts = if opts['B'].nil?
                   AminoAcid::COUNTS[opts['D']]
               else
                   count_amino_acids(opts['B'])
               end

# Totals are taken over AminoAcid::ALL_1 only.
$back_total = AminoAcid::ALL_1.inject(0) { |sum, aa| sum + $back_counts[aa] }


#
# Individual amino-acids
#
alpha = opts['A']

# One report line per amino acid, labelled via AminoAcid::CODE.
AminoAcid::ALL_1.each { |residue| print_significance(AminoAcid::CODE[residue], residue, alpha) }

# Two blank lines separate the two report sections.
2.times { puts }

#
# Amino-acids grouped by properties
#
AminoAcid::SPLIT_ORDER.each { |group| print_significance(AminoAcid::SPLIT_NAME[group], AminoAcid::SPLIT[group], alpha) }



