forked from wang-q/withncbi
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerge_csv.pl
111 lines (90 loc) · 2.54 KB
/
merge_csv.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/perl
use strict;
use warnings;
use autodie;
use Getopt::Long;
use YAML::Syck;
use Path::Tiny;
#----------------------------------------------------------#
# GetOpt section
#----------------------------------------------------------#
=head1 NAME
merge_csv.pl - Merge csv files based on @fields
=head1 SYNOPSIS
cat input.csv another.csv | perl merge_csv.pl [options]
Options:
--help -? brief help message
--outfile -o STR output filename. Default is [stdout] for screen
--fields -f @INT fields used as identifiers. Default is [0], first column
--concat -c concatenate rows instead of merging. Keep first ID fields
cat 1.csv 2.csv | perl merge_csv.pl -f 0 -f 1
=cut
# Parse command-line switches. --fields may be repeated; each value is a
# zero-based column index, so it is declared as an integer option.
GetOptions(
    'help|?'      => sub { Getopt::Long::HelpMessage(0) },
    'outfile|o=s' => \( my $outfile = 'stdout' ),
    'fields|f=i'  => \my @fields,
    'concat|c'    => \my $concat,
) or Getopt::Long::HelpMessage(1);

# Without any --fields the first column identifies a row.
if ( !@fields ) {
    @fields = (0);
}

# Ascending numeric order with no repeats makes array splicing happier:
# columns can then be removed from the highest index down without shifting
# the ones still to be removed, and no column is removed twice.
@fields = _normalize_fields(@fields);

# Return the given column indexes de-duplicated and sorted numerically.
# A plain `sort` compares as strings and would put 10 before 2.
sub _normalize_fields {
    my @indexes = map { $_ + 0 } @_;
    my %seen;
    my @sorted = sort { $a <=> $b } grep { !$seen{$_}++ } @indexes;
    return @sorted;
}
#----------------------------------------------------------#
# apply
#----------------------------------------------------------#
my $index_of  = {};    # row id => position of that id's first row in @lines
my @lines;             # kept rows, in order of first appearance
my $count_all = 0;     # number of non-empty input lines read

# Order in which id columns are stripped from a row before concatenation:
# highest index first, so removing one column never shifts another that is
# still to be removed. Sorted numerically and de-duplicated here so the
# splice below is correct whatever order @fields arrives in.
my @strip_order = do {
    my %seen;
    sort { $b <=> $a } grep { !$seen{$_}++ } @fields;
};

while ( my $line = <> ) {
    chomp $line;
    next unless length $line;    # skip empty lines, but keep a row that is just "0"
    $count_all++;

    # A negative LIMIT keeps trailing empty columns, so "a,b," is three
    # columns and concatenated rows stay aligned.
    my @columns = split /,/, $line, -1;

    # Join the key with "," : columns were split on commas, so no column can
    # contain one and two different composite keys can never collide.
    my $id = join ",", (@columns)[@fields];

    # First time this id is seen: remember where its row is stored.
    if ( !exists $index_of->{$id} ) {
        $index_of->{$id} = scalar @lines;
        push @lines, $line;
        next;
    }

    # Repeated id: in merge mode the later row is simply dropped.
    next unless $concat;

    # In concat mode, append the row's non-id columns to the first row.
    splice @columns, $_, 1 for @strip_order;
    my $first = $index_of->{$id};
    $lines[$first] = join ",", $lines[$first], @columns;
}
#----------------------------#
# check
#----------------------------#
# Warn when the kept rows do not all have the same number of columns.
{
    my %seen;    # column count => number of rows with that count
    for my $row (@lines) {

        # Columns = commas + 1. Counting commas (tr with an empty
        # replacement list only counts) also counts trailing empty
        # columns, which split without a LIMIT would silently drop.
        my $number = 1 + ( $row =~ tr/,// );
        $seen{$number}++;
    }
    if ( keys(%seen) > 1 ) {
        warn "*** Fields not identical, be careful.\n";
        warn YAML::Syck::Dump( { fields => \%seen } );
    }
}
#----------------------------#
# write outputs
#----------------------------#
# Pick the destination: the literal name "stdout" (any letter case) means
# the screen, anything else is a file to create or truncate. A failing
# open or close is raised by autodie.
my $out_fh
    = lc($outfile) eq "stdout"
    ? *STDOUT
    : do { open my $file_fh, ">", $outfile; $file_fh };

# One kept row per output line.
print {$out_fh} $_ . "\n" for @lines;
close $out_fh;

# Summary goes to STDERR so it never mixes with the CSV output.
printf STDERR "Total lines [%d]; Result lines [%d].\n", $count_all,
    scalar @lines;
exit;
__END__