-
Notifications
You must be signed in to change notification settings - Fork 1
/
makeFileForSubDirsAndReadmes.pl
135 lines (87 loc) · 3.95 KB
/
makeFileForSubDirsAndReadmes.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#! /usr/local/bin/perl
# Script for taking a tab-delimited mysql dump and generating the following:
#
# 1. A tab-delimited file for the CMS with one line per subdirectory
#    (author_year_PMID_#): relative path, citation text, and README URL
# 2. A README for each author_year_PMID_# subdirectory
#
# For each author_year_PMID_# the script will:
# 1. create a subdirectory named author_year_PMID_# under the output directory
# 2. cp the appropriate PCL files from $dataDir into the subdirectory
# 3. add a README file with the following info:
#    at the top: citation, full description, PMID, GEO ID
#    one line for each PCL file - filename, short description, #conditions, tags
#
# NOTE: compressing / zipping the README and PCL files together is NOT done
# by the code in this file.
#
# Usage: makeFileForSubDirsAndReadmes.pl <input file> <output directory>
#
# Expected input columns (tab separated, in this order):
#   pmid, PCL filename, short description, full description, # conditions,
#   first author, all authors, title, journal, year, tags
#########################################
use strict;
use warnings;
#use Getopt::Mixed;

# Set to a value > 0 to stop after the first subdirectory has been processed.
use constant TEST => 0;

# Number of columns read from every input line.
use constant NUM_COLUMNS => 11;

#Getopt::Mixed::init('i=s input>i', 'o=s output>o');
#Getopt::Mixed::getOptions();

my %fileDataHash;    # subdir name  => { pmid, desc, cit, pcl_files => [...] }
my %pclDataHash;     # PCL filename => { short_desc, num_conds, tags }

#our $opt_i;
#our $opt_o;

# Directory the PCL files are copied from.
my $dataDir = "/var/www/html/data/expression/";

# Test definedness/length rather than truth so that a file or directory
# literally named "0" is still accepted.
die("Need input filename and output directory\n")
    unless (defined $ARGV[0] && length $ARGV[0]
         && defined $ARGV[1] && length $ARGV[1]);    #$opt_i || ! $opt_o);

my $opt_i = $ARGV[0];
my $opt_o = $ARGV[1];

# Normalise the output directory so it always ends in exactly one "/".
# Every output path below is built from this, so the CMS file and the
# subdirectories land in the same place whether or not the caller
# supplied a trailing slash.
my $outDir = $opt_o;
$outDir .= "/" unless ($outDir =~ m{/\z});

my $fileForCMS = $outDir."microarray_expression_file_for_CMS_PROD.txt";

die("Can't open $opt_i, file doesn't exist\n") if (! -e $opt_i);

open(my $inputFh, '<', $opt_i)
    or die("Can't open $opt_i for reading: $!\n");

# make hashes of data #
while (my $line = <$inputFh>) {

    chomp $line;

    # A blank line carries no dataset; without this guard it would create a
    # bogus "__PMID_" entry and abort later when its (empty) PCL file is copied.
    next if ($line !~ /\S/);

    # LIMIT of -1 keeps trailing empty columns (e.g. an empty tags field),
    # and short lines are padded, so no column is ever undef.
    my @fields = split(/\t/, $line, -1);
    push(@fields, ('') x (NUM_COLUMNS - @fields)) if (@fields < NUM_COLUMNS);

    my ($pmid, $pclFilename, $shortDesc, $fullDesc, $numConds, $auth,
        $allAuths, $title, $journal, $year, $tags) = @fields;

    # The first author is expected as "Lastname [more lastname words] Initials".
    # NOTE(review): an author consisting of a single word yields an empty
    # last name (subdir starts with "_") -- confirm whether that can occur.
    my @firstAuthArray = split(" ", $auth);
    pop @firstAuthArray;    # remove first and middle initial

    my $lastName = join("_", @firstAuthArray);
    my $subdir   = $lastName."_".$year."_PMID_".$pmid;

    # The first non-empty value seen for a subdirectory wins.
    $fileDataHash{$subdir}{'pmid'} ||= $pmid;
    $fileDataHash{$subdir}{'desc'} ||= $fullDesc;
    $fileDataHash{$subdir}{'cit'}  ||= $allAuths.". ".$year.". ".$title." ".$journal;

    push(@{$fileDataHash{$subdir}{'pcl_files'}}, $pclFilename);

    $pclDataHash{$pclFilename}{'short_desc'} = $shortDesc;
    $pclDataHash{$pclFilename}{'num_conds'}  = $numConds;
    $pclDataHash{$pclFilename}{'tags'}       = $tags;
}

close($inputFh) or die("Can't close $opt_i: $!\n");

# for each key in %fileDataHash, make a subdirectory and then cp pcl files over #
# make a readme for that directory #
# check to see if the directory exists or make it

open(my $cmsFh, '>', $fileForCMS)
    or die("Can't open $fileForCMS for writing: $!\n");

my $readmeURL = "http://downloads.yeastgenome.org/expression/microarray/";

# Sort case-insensitively; fall back to a plain comparison so that names
# differing only in case still come out in a deterministic order.
for my $dir (sort { lc($a) cmp lc($b) or $a cmp $b } keys %fileDataHash) {

    my $fullPath   = $outDir.$dir."/";
    my $readMePath = $readmeURL.$dir."/README";

    print {$cmsFh} "../expression/microarray/".$dir."\tDatasets (PCL file format) loaded into SPELL from ".$fileDataHash{$dir}{'cit'}."\t".$readMePath."\n";

    unless (-d $fullPath) {
        mkdir($fullPath) or die("Can't make subdirectory: $fullPath ($!)\n");
    }

    ## make README FILE ##

    my $readme = $fullPath."README";

    ## get GEO num ##
    # The GEO ID is taken to be the part of the first PCL filename before
    # the first underscore; it is only reported when it starts with "GSE".
    my ($geo) = split("_", $fileDataHash{$dir}{'pcl_files'}[0], 2);

    open(my $readmeFh, '>', $readme)
        or die("Can't open $readme for writing: $!\n");

    print {$readmeFh} "Citation: ".$fileDataHash{$dir}{'cit'}."\n\n";
    print {$readmeFh} "Full Description: ".$fileDataHash{$dir}{'desc'}."\n\n";
    print {$readmeFh} "PMID: ".$fileDataHash{$dir}{'pmid'}."\n";

    if (defined $geo && $geo =~ /^GSE/) {
        print {$readmeFh} "GEO ID: ".$geo."\n\n";
    } else {
        print {$readmeFh} "GEO ID: N/A\n\n";
    }

    print {$readmeFh} join("\t", ('PCL filename', 'short description', '# conditions', 'tags'))."\n";

    for my $filename (@{$fileDataHash{$dir}{'pcl_files'}}) {

        my $origFile = $dataDir.$filename;
        my $newFile  = $fullPath.$filename;

        # List form of system() bypasses the shell, so odd characters in
        # file names cannot be interpreted as shell syntax.
        system("cp", "$origFile", "$newFile") == 0 or die("Can't copy $origFile to $newFile: $?\n");

        print {$readmeFh} join("\t",
            $filename,
            $pclDataHash{$filename}{'short_desc'},
            $pclDataHash{$filename}{'num_conds'},
            $pclDataHash{$filename}{'tags'})."\n";
    }

    # Buffered write errors only surface at close, so check it.
    close($readmeFh) or die("Can't close $readme: $!\n");

    die if (TEST > 0);
}

close($cmsFh) or die("Can't close $fileForCMS: $!\n");

###
1; ## to keep perl happy