forked from zvelo/ngrams
-
Notifications
You must be signed in to change notification settings - Fork 0
/
text2wfreq.cpp
132 lines (106 loc) · 3.38 KB
/
text2wfreq.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
/*******************************************************************
C++ Package of Ternary Search Tree
Copyright (C) 2006 Zheyuan Yu
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
Read full GPL at http://www.gnu.org/copyleft/gpl.html
Email me at [email protected] if you have any questions or comments.
WebSite: http://www.cs.dal.ca/~zyu
*************************************************************************/
#include "text2wfreq.h"
#include <iostream>
#include <time.h>
using namespace std;
/**
 * Parse the command line and store the recognised settings in this object.
 *
 * Options looked up through Config: "-type" ("character" or "word"),
 * "-n", "-in" and "-out". An empty value for "-type" or "-n" leaves the
 * corresponding member untouched; "-in" and "-out" are always assigned
 * whatever Config::getOptionValue returns.
 *
 * @param argc number of entries in argv, as received by main
 * @param argv command line arguments, as received by main
 * @return true when the options were accepted; false when no arguments
 *         were given, when "--help" / "-help" is present, or when the
 *         value of "-type" or "-n" is invalid.
 */
bool Text2wfreq::getOptions( int argc, char * argv[] )
{
    // Nothing to parse, or the user explicitly asked for help.
    if ( argc < 2
        || Config::hasOption( "--help", argc, argv )
        || Config::hasOption( "-help", argc, argv ) )
    {
        return false;
    }

    String value = Config::getOptionValue( "-type", argc, argv );
    if ( value == "character" )
    {
        ngramType = Config::CHAR_NGRAM;
    }
    else if ( value == "word" )
    {
        ngramType = Config::WORD_NGRAM;
    }
    else if ( value != "" )
    {
        printf( "wrong type option!\n" );
        return false;
    }

    value = Config::getOptionValue( "-n", argc, argv );
    if ( value != "" )
    {
        // Parse into a local first so that a malformed or non-positive
        // value is reported instead of being silently ignored.
        int parsedN = 0;
        if ( sscanf( value.c_str(), "%d", &parsedN ) != 1 || parsedN < 1 )
        {
            printf( "wrong n option!\n" );
            return false;
        }
        ngramN = parsedN;
    }

    inFileName = Config::getOptionValue( "-in", argc, argv );
    outFileName = Config::getOptionValue( "-out", argc, argv );
    return true;
}
/**
 * Print the usage summary to stdout, including the compiled-in defaults
 * for the n-gram size and the n-gram type taken from Config.
 */
void Text2wfreq::showHelp()
{
    // Human-readable name of the default n-gram type.
    const char * defaultTypeName = "character";
    if ( static_cast<int>( Config::DEFAULT_NGRAM_TYPE ) == static_cast<int>( Config::WORD_NGRAM ) )
    {
        defaultTypeName = "word";
    }

    printf( "\nUsage: ngrams [options]\n"
            "Compute all the word/char frequencies for the given text file.\n"
            "Options:\n" );
    printf( "--n=N Number of ngrams, the default is %d-grams.\n", Config::DEFAULT_NGRAM_N );
    printf( "--type=T character or word, the default is %s.\n", defaultTypeName );
    printf( "--in=training files default to stdin.\n"
            "--out=output file default to stdout. ( currently stdout only )\n\n" );
}
/**
 * Program entry point.
 *
 * Parses the command line, builds a word or character n-gram object for
 * the requested input/output files, writes its output, and reports the
 * elapsed wall-clock time of each phase on stderr.
 *
 * @param argc number of command line arguments
 * @param argv command line arguments
 * @return 0 in all cases, including when only the help text is shown
 */
int main( int argc, char * argv[] )
{
    time_t startTime;
    time( &startTime );

    Text2wfreq tf;
    if ( !tf.getOptions( argc, argv ) )
    {
        // Missing/invalid options or an explicit help request.
        tf.showHelp();
        return 0;
    }

    // NOTE(review): the progress message below implies the constructors
    // do the actual n-gram counting - confirm against WordNgrams/CharNgrams.
    INgrams * ngrams;
    if ( tf.getNgramType() == Config::WORD_NGRAM )
    {   // word ngrams
        ngrams = new WordNgrams( tf.getNgramN(), tf.getInFileName().c_str(), tf.getOutFileName().c_str() );
    }
    else
    {   // char ngrams
        ngrams = new CharNgrams( tf.getNgramN(), tf.getInFileName().c_str(), tf.getOutFileName().c_str() );
    }

    time_t midTime;
    time( &midTime );
    fprintf( stderr, "ngrams have been generated, start outputing.\n" );

    ngrams->output();

    // No null check needed: the pointer was already dereferenced above.
    delete ngrams;
    ngrams = NULL;

    time_t endTime;
    time( &endTime );

    // time_t is an implementation-defined arithmetic type; cast explicitly
    // so the arguments always match the %ld conversion.
    fprintf( stderr, "\nSubtotal: %ld seconds for generating ngrams.\n", static_cast<long>( midTime - startTime ) );
    fprintf( stderr, "Subtotal: %ld seconds for outputing ngrams.\n", static_cast<long>( endTime - midTime ) );
    fprintf( stderr, "Total %ld seconds.\n", static_cast<long>( endTime - startTime ) );
    return 0;
}