-
Notifications
You must be signed in to change notification settings - Fork 1
/
headall
executable file
·95 lines (73 loc) · 2.89 KB
/
headall
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/local/bin/perl
# Author: Jason Eisner, University of Pennsylvania
# Usage: headall [-l] [files ...]
#
# Filters parses that are in the format produced by headify.
# Ensures that all constituents have a head marked.
#
# Ordinarily this is done by discarding any parses that
# contain headless constituents. However, if -l is specified,
# we "fix" headless constituents by making the last child the head.
require("stamp.inc"); &stamp; # modify $0 and @INC, and print timestamp
$headlast = 1, shift(@ARGV) if $ARGV[0] eq "-l";
die "$0: bad command line flags" if @ARGV && $ARGV[0] =~ /^-./;
$token = "[^ \t\n()]+"; # anything but parens or whitespace can be a token
while (<>) { # for each sentence
chop;
s/^(\S+:[0-9]+:\t)?//, $location = $&;
unless (/^\#/) { # unless a comment
$tree = &constit; # eat a constit (sentence)
die "$0:$location more than one sentence on line ending $_" if $_;
$sent++;
if ($tree) {
$_ = $tree;
$goodsent++;
} else {
$_ = "# DISCARDED by $0: missing head"; # should probably improve this error message
}
}
print "$location$_\n";
}
if ($headlast) {
print STDERR "$0: $constit constituents (possibly trivial), had to assign default heads to $marks\n";
} else {
print STDERR "$0: $sent sentences in, $goodsent survived\n";
}
# -------------------------
# Reads in the next constit, and following whitespace, from the front of $_.
# Any constit may start with @.
#
# input: none
# output: bracketed version with heads fixed, or "" if we must discard sentence
# Discipline: each regexp that eats text is required to eat
# any following whitespace, too.
sub constit {
local($headmark, $tag, $subtree, @subtrees); # "tag" denotes input tag
local($foundhead, $badkid);
$constit++;
$headmark = "@" if s/^@//; # delete initial @ if any, but remember it;
s/^\(\s*// || die "$0:$location open paren expected to start $_"; # eat open paren
s/^($token)\s*//o || die "$0:$location no tag"; # eat tag
$tag = $1;
$foundhead = 0;
$badkid = 0;
if (/^@?\(/) { # if tag is followed by at least one subconstituent (possibly marked with @)
until (/^\)/) { # eat all the subconstits recursively and remember what they were
$subtree = &constit;
$foundhead = 1 if $subtree =~ /^@/;
$badkid = 1 if $subtree eq "";
push (@subtrees, $subtree);
}
if ($headlast && !$foundhead) {
$subtrees[$#subtrees] = "@" . $subtrees[$#subtrees];
$foundhead = 1;
$marks++;
}
} else { # if tag is followed by just a lexical item
s/^([@]$token)\s*//o || die "$0:$location no lex item, or lex item not marked as head";
$foundhead = 1;
@subtrees = ($1);
}
s/^\)\s*// || die "$0:$location close paren expected to start $_";
($badkid || !$foundhead) ? "" : "$headmark($tag @subtrees)";
}