Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .tt_skip
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ tools/rmap
tools/weightedaverage
tools/annotation_profiler
tools/megablast_xml_parser
tools/correlation
tools/merge_cols
tools/microsats_alignment_level
tools/bowtie_color_wrappers
Expand Down
49 changes: 31 additions & 18 deletions tools/correlation/cor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
"""

import sys
from rpy import *
import rpy2.robjects as robjects
r = robjects.r


def stop_err(msg):
sys.stderr.write(msg)
Expand All @@ -17,17 +19,25 @@ def main():
assert method in ( "pearson", "kendall", "spearman" )

try:
columns = map( int, sys.argv[3].split( ',' ) )
column_string = sys.argv[3]
columns = list()
for col in column_string.split(','):
if '-' in col:
s, e = col.split('-')
col = list(range(int(s), int(e) + 1))
columns.extend(col)
else:
columns.append(int(col))
except:
stop_err( "Problem determining columns, perhaps your query does not contain a column of numerical data." )

matrix = []
skipped_lines = 0
first_invalid_line = 0
invalid_value = ''
invalid_column = 0

for i, line in enumerate( file( sys.argv[1] ) ):
for i, line in enumerate( open( sys.argv[1] ) ):
valid = True
line = line.rstrip('\n\r')

Expand Down Expand Up @@ -60,29 +70,32 @@ def main():
first_invalid_line = i+1

if valid:
matrix.append( row )
matrix += row

if skipped_lines < i:
try:
out = open( sys.argv[2], "w" )
except:
stop_err( "Unable to open output file" )

# Run correlation
try:
value = r.cor( array( matrix ), use="pairwise.complete.obs", method=method )
except Exception, exc:
out.close()
stop_err("%s" %str( exc ))
for row in value:
print >> out, "\t".join( map( str, row ) )
out.close()
fv = robjects.FloatVector(matrix)
m = r['matrix'](fv, ncol=len(columns),byrow=True)
rslt_mat = r.cor(m, use="pairwise.complete.obs", method=method )
value = []
for ri in range(1, rslt_mat.nrow + 1):
row = []
for ci in range(1, rslt_mat.ncol + 1):
row.append(rslt_mat.rx(ri,ci)[0])
value.append(row)
except Exception as exc:
stop_err("%s" % str( exc ))

with open( sys.argv[2], "w" ) as out:
for row in value:
out.write("%s\n" % "\t".join( map( str, row ) ))

if skipped_lines > 0:
msg = "..Skipped %d lines starting with line #%d. " %( skipped_lines, first_invalid_line )
if invalid_value and invalid_column > 0:
msg += "Value '%s' in column %d is not numeric." % ( invalid_value, invalid_column )
print msg
print(msg)

if __name__ == "__main__":
main()
77 changes: 46 additions & 31 deletions tools/correlation/cor.xml
Original file line number Diff line number Diff line change
@@ -1,33 +1,48 @@
<tool id="cor2" name="Correlation" version="1.0.0">
<description>for numeric columns</description>
<requirements>
<requirement type="package" version="1.0.3">rpy</requirement>
</requirements>
<command interpreter="python">cor.py $input1 $out_file1 $numeric_columns $method</command>
<inputs>
<param format="tabular" name="input1" type="data" label="Dataset" help="Dataset missing? See TIP below"/>
<param name="numeric_columns" label="Numerical columns" type="data_column" numerical="True" multiple="True" data_ref="input1" help="Multi-select list - hold the appropriate key while clicking to select multiple columns" />
<param name="method" type="select" label="Method">
<option value="pearson">Pearson</option>
<option value="kendall">Kendall rank</option>
<option value="spearman">Spearman rank</option>
</param>
</inputs>
<outputs>
<data format="txt" name="out_file1" />
</outputs>
<tests>
<!--
Test a tabular input with the first line being a comment without a # character to start
-->
<test>
<param name="input1" value="cor.tabular" />
<param name="numeric_columns" value="2,3" />
<param name="method" value="pearson" />
<output name="out_file1" file="cor_out.txt" />
</test>
</tests>
<help>
<tool id="cor2" name="Correlation" version="1.0.1">
<description>for numeric columns</description>
<requirements>
<requirement type="package" version="2.9.4">rpy2</requirement>
</requirements>
<command>
python '$__tool_directory__/cor.py'
'$input1'
'$out_file1'
$numeric_columns
$method
</command>
<inputs>
<param format="tabular" name="input1" type="data" label="Dataset" help="Dataset missing? See TIP below"/>
<param name="numeric_columns" label="Numerical columns" type="text" multiple="True"
data_ref="input1" help="Multi-select list - hold the appropriate key while clicking to select multiple columns" />
<!--param name="numeric_columns" label="Numerical columns" type="data_column" numerical="True" multiple="True"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@guerler do you remember that we once talked about the data_column parameter, and that it would be nice to allow an overwrite mode like the one in the workflow form? That way a user could choose to enter numbers instead of selecting the fields.
I now get this request a lot from communities that do more statistics on large files (>30 columns).

This PR demonstrates a simple tool that can take 2-3 as an indicator of the columns instead of 2,3. Tests are passing; maybe you can use this to explore some UI options? The idea is to offer the user some more advanced selections like 2,5,10-30,40 in an advanced mode next to the traditional data_column view.

data_ref="input1" help="Multi-select list - hold the appropriate key while clicking to select multiple columns" /-->
<param name="method" type="select" label="Method">
<option value="pearson">Pearson</option>
<option value="kendall">Kendall rank</option>
<option value="spearman">Spearman rank</option>
</param>
</inputs>
<outputs>
<data format="tabular" name="out_file1" />
</outputs>
<tests>
<!--
Test a tabular input with the first line being a comment without a # character to start
-->
<test>
<param name="input1" value="cor.tabular" />
<param name="numeric_columns" value="2,3" />
<param name="method" value="pearson" />
<output name="out_file1" file="cor_out.txt" />
</test>
<test>
<param name="input1" value="cor.tabular" />
<param name="numeric_columns" value="2-3" />
<param name="method" value="pearson" />
<output name="out_file1" file="cor_out.txt" />
</test>
</tests>
<help>

.. class:: infomark

Expand Down Expand Up @@ -97,5 +112,5 @@ This tool computes the matrix of correlation coefficients between numeric column
0.730635686279 1.0

So the correlation for our twenty cases is .73, which is a fairly strong positive relationship.
</help>
</help>
</tool>
2 changes: 1 addition & 1 deletion tools/correlation/test-data/cor.tabular
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ Person Height Self Esteem
1 65 4.1
1 67 3.8
1 63 3.4
2 61 3.6
2 61 3.6
4 changes: 2 additions & 2 deletions tools/correlation/test-data/cor_out.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
1.0 0.730635686279
0.730635686279 1.0
1.0 0.7306356862792351
0.7306356862792351 1.0
6 changes: 0 additions & 6 deletions tools/correlation/tool_dependencies.xml

This file was deleted.