diff --git a/parser/AMRGraph.py b/parser/AMRGraph.py
index f2f5603..f2a4fb5 100644
--- a/parser/AMRGraph.py
+++ b/parser/AMRGraph.py
@@ -4,7 +4,7 @@
 import random
 from collections import defaultdict
 
-from parser.amr import AMR
+from amr import AMR
 
 number_regexp = re.compile(r'^-?(\d)+(\.\d+)?$')
 abstract_regexp0 = re.compile(r'^([A-Z]+_)+\d+$')
diff --git a/parser/parser.py b/parser/amr_parser.py
similarity index 96%
rename from parser/parser.py
rename to parser/amr_parser.py
index 448186e..059e071 100644
--- a/parser/parser.py
+++ b/parser/amr_parser.py
@@ -3,12 +3,12 @@
 import torch.nn.functional as F
 import math
 
-from parser.encoder import WordEncoder, ConceptEncoder
-from parser.decoder import DecodeLayer
-from parser.transformer import Transformer, SinusoidalPositionalEmbedding, SelfAttentionMask
-from parser.data import ListsToTensor, ListsofStringToTensor, DUM, NIL, PAD
-from parser.search import Hypothesis, Beam, search_by_batch
-from parser.utils import move_to_device
+from encoder import WordEncoder, ConceptEncoder
+from decoder import DecodeLayer
+from transformer import Transformer, SinusoidalPositionalEmbedding, SelfAttentionMask
+from data import ListsToTensor, ListsofStringToTensor, DUM, NIL, PAD
+from search import Hypothesis, Beam, search_by_batch
+from utils import move_to_device
 
 class Parser(nn.Module):
     def __init__(self, vocabs, word_char_dim, word_dim, pos_dim, ner_dim,
diff --git a/parser/data.py b/parser/data.py
index 6614f49..aa76330 100644
--- a/parser/data.py
+++ b/parser/data.py
@@ -2,8 +2,8 @@
 import torch
 from torch import nn
 import numpy as np
-from parser.AMRGraph import AMRGraph
-from parser.extract import read_file
+from AMRGraph import AMRGraph
+from extract import read_file
 
 PAD, UNK, DUM, NIL, END, CLS = '<PAD>', '<UNK>', '<DUMMY>', '<NULL>', '<END>', '<CLS>'
 GPU_SIZE = 12000 # okay for 8G memory
@@ -14,7 +14,7 @@ def __init__(self, filename, min_occur_cnt, specials = None):
         self._priority = dict()
         num_tot_tokens = 0
         num_vocab_tokens = 0
-        for line in open(filename).readlines():
+        for line in open(filename, encoding='utf8').readlines():
             try:
                 token, cnt = line.rstrip('\n').split('\t')
                 cnt = int(cnt)
diff --git a/parser/decoder.py b/parser/decoder.py
index 23ad99a..135511c 100644
--- a/parser/decoder.py
+++ b/parser/decoder.py
@@ -2,11 +2,11 @@
 from torch import nn
 import torch.nn.functional as F
 from torch.nn import Parameter
-from parser.data import NIL, PAD
-from parser.utils import compute_f_by_tensor
-from parser.transformer import MultiheadAttention, Transformer, TiedTransformer
+from data import NIL, PAD
+from utils import compute_f_by_tensor
+from transformer import MultiheadAttention, Transformer, TiedTransformer
 
-from parser.utils import label_smoothed_nll_loss
+from utils import label_smoothed_nll_loss
 
 class ArcGenerator(nn.Module):
     def __init__(self, vocabs, embed_dim, ff_embed_dim, num_heads, dropout):
diff --git a/parser/encoder.py b/parser/encoder.py
index 21d914d..188647b 100644
--- a/parser/encoder.py
+++ b/parser/encoder.py
@@ -4,7 +4,7 @@
 from torch import nn
 import torch.nn.functional as F
 import re
-from parser.transformer import Embedding
+from transformer import Embedding
 
 def AMREmbedding(vocab, embedding_dim, pretrained_file=None, amr=False, dump_file=None):
     if pretrained_file is None:
diff --git a/parser/extract.py b/parser/extract.py
index cc3f24b..e344d1b 100644
--- a/parser/extract.py
+++ b/parser/extract.py
@@ -1,11 +1,12 @@
 #!/usr/bin/env python
 # coding: utf-8
+import os
 from collections import Counter
 import json, re
 
-from parser.amr import AMR
-from parser.AMRGraph import AMRGraph, number_regexp
-from parser.AMRGraph import _is_abs_form
+from amr import AMR
+from AMRGraph import AMRGraph, number_regexp
+from AMRGraph import _is_abs_form
 
 class AMRIO:
     def __init__(self):
@@ -19,7 +20,7 @@ def read(file_path):
                 tokens = amr_json['tokens']
                 lemmas = amr_json['lemmas']
                 pos_tags = amr_json['pos']
-                ner_tags = ner_tags['ner']
+                ner_tags = amr_json['ner']
                 myamr = AMRGraph.parse_json(line)
                 yield tokens, lemmas, pos_tags, ner_tags, myamr
 
@@ -81,7 +82,7 @@ def make_vocab(batch_seq, char_level=False):
 
 
 def write_vocab(vocab, path):
-    with open(path, 'w') as fo:
+    with open(os.path.join('vocab',path), 'w', encoding='utf8') as fo:
         for x, y in vocab.most_common():
             fo.write('%s\t%d\n'%(x,y))
diff --git a/parser/postprocess.py b/parser/postprocess.py
index 55a96f0..f613651 100644
--- a/parser/postprocess.py
+++ b/parser/postprocess.py
@@ -3,12 +3,11 @@
 import networkx as nx
 import numpy as np
 
-from parser.AMRGraph import is_attr_or_abs_form, need_an_instance
+from AMRGraph import is_attr_or_abs_form, need_an_instance
 
 
 class PostProcessor(object):
     def __init__(self, rel_vocab):
-        self.amr = penman.AMRCodec()
         self.rel_vocab = rel_vocab
 
     def to_triple(self, res_concept, res_relation):
@@ -60,7 +59,7 @@ def to_triple(self, res_concept, res_relation):
         return ret
 
     def get_string(self, x):
-        return self.amr.encode(penman.Graph(x), top=x[0][0])
+        return penman.encode(penman.Graph(x), top=x[0][0])
 
     def postprocess(self, concept, relation):
         mstr = self.get_string(self.to_triple(concept, relation))
diff --git a/parser/preprocess.py b/parser/preprocess.py
index 988fbef..bd1fe13 100644
--- a/parser/preprocess.py
+++ b/parser/preprocess.py
@@ -5,7 +5,7 @@
 
 
 def main():
-    amr_file = r'C:\Users\austi\Desktop\Shared Task\mrp\2020\cf\training\amr.mrp'
+    amr_file = r'../../mrp/2020/cf/training/amr.mrp'
 
     nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,ner')
 
diff --git a/parser/search.py b/parser/search.py
index 20f4bc3..950f8ac 100644
--- a/parser/search.py
+++ b/parser/search.py
@@ -1,6 +1,6 @@
 import torch
-from parser.data import END, UNK
-from parser.AMRGraph import is_attr_or_abs_form
+from data import END, UNK
+from AMRGraph import is_attr_or_abs_form
 
 """
 Beam search by batch
 need model has two functions:
diff --git a/parser/test_data.py b/parser/test_data.py
index 82ea016..7b3511f 100644
--- a/parser/test_data.py
+++ b/parser/test_data.py
@@ -13,7 +13,7 @@ def main():
     amrs = []
     with open(amr_file, 'r', encoding='utf8') as f:
         for line in f:
-            amr = AMR.parse_json(line)
+            amr = json.loads(line)
             amrs.append(amr)
 
     drgs = []
diff --git a/parser/train.py b/parser/train.py
index 9e4740d..9b54a36 100644
--- a/parser/train.py
+++ b/parser/train.py
@@ -3,74 +3,86 @@
 import torch.multiprocessing as mp
 import argparse, os, random
 
-from parser.data import Vocab, DataLoader, DUM, END, CLS, NIL
-from parser.parser import Parser
-from parser.work import show_progress
-from parser.extract import LexicalMap
-from parser.adam import AdamWeightDecayOptimizer
-from parser.utils import move_to_device
-from parser.bert_utils import BertEncoderTokenizer, BertEncoder
-from parser.postprocess import PostProcessor
-from parser.work import parse_data
+from data import Vocab, DataLoader, DUM, END, CLS, NIL
+from amr_parser import Parser
+from work import show_progress
+from extract import LexicalMap
+from adam import AdamWeightDecayOptimizer
+from utils import move_to_device
+from bert_utils import BertEncoderTokenizer, BertEncoder
+from postprocess import PostProcessor
+from work import parse_data
 
 def parse_config():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--tok_vocab', type=str)
-    parser.add_argument('--lem_vocab', type=str)
-    parser.add_argument('--pos_vocab', type=str)
-    parser.add_argument('--ner_vocab', type=str)
-    parser.add_argument('--concept_vocab', type=str)
-    parser.add_argument('--predictable_concept_vocab', type=str)
-    parser.add_argument('--rel_vocab', type=str)
-    parser.add_argument('--word_char_vocab', type=str)
-    parser.add_argument('--concept_char_vocab', type=str)
+    # --lem_vocab ${dataset}/vocab/lem_vocab\
+    # --pos_vocab ${dataset}/vocab/pos_vocab\
+    # --ner_vocab ${dataset}/vocab/ner_vocab\
+    # --concept_vocab ${dataset}/vocab/concept_vocab\
+    # --predictable_concept_vocab ${dataset}/vocab/predictable_concept_vocab\
+    # --rel_vocab ${dataset}/vocab/rel_vocab\
+    # --word_char_vocab ${dataset}/vocab/word_char_vocab\
+    # --concept_char_vocab ${dataset}/vocab/concept_char_vocab\
+    # --train_data ${dataset}/amr.extended.mrp \
+    # --dev_data ${dataset}/amr.extended.mrp \
+    # --with_bert \
+    # --bert_path ../bert-base-cased \
+    # -
+    parser.add_argument('--tok_vocab', type=str, default='vocab/tok_vocab')
+    parser.add_argument('--lem_vocab', type=str, default='vocab/lem_vocab')
+    parser.add_argument('--pos_vocab', type=str, default='vocab/pos_vocab')
+    parser.add_argument('--ner_vocab', type=str, default='vocab/ner_vocab')
+    parser.add_argument('--concept_vocab', type=str, default='vocab/concept_vocab')
+    parser.add_argument('--predictable_concept_vocab', type=str, default='vocab/predictable_concept_vocab')
+    parser.add_argument('--rel_vocab', type=str, default='vocab/rel_vocab')
+    parser.add_argument('--word_char_vocab', type=str, default='vocab/word_char_vocab')
+    parser.add_argument('--concept_char_vocab', type=str, default='vocab/concept_char_vocab')
 
     parser.add_argument('--pretrained_file', type=str, default=None)
     parser.add_argument('--with_bert', dest='with_bert', action='store_true')
     parser.add_argument('--bert_path', type=str, default=None)
 
-    parser.add_argument('--word_char_dim', type=int)
-    parser.add_argument('--word_dim', type=int)
-    parser.add_argument('--pos_dim', type=int)
-    parser.add_argument('--ner_dim', type=int)
-    parser.add_argument('--concept_char_dim', type=int)
-    parser.add_argument('--concept_dim', type=int)
-    parser.add_argument('--rel_dim', type=int)
+    parser.add_argument('--word_char_dim', type=int, default=32)
+    parser.add_argument('--word_dim', type=int, default=300)
+    parser.add_argument('--pos_dim', type=int, default=32)
+    parser.add_argument('--ner_dim', type=int, default=16)
+    parser.add_argument('--concept_char_dim', type=int, default=32)
+    parser.add_argument('--concept_dim', type=int, default=300)
+    parser.add_argument('--rel_dim', type=int, default=100)
 
-    parser.add_argument('--cnn_filters', type=int, nargs = '+')
-    parser.add_argument('--char2word_dim', type=int)
-    parser.add_argument('--char2concept_dim', type=int)
+    parser.add_argument('--cnn_filters', type=int, nargs = '+', default=(3,256))
+    parser.add_argument('--char2word_dim', type=int, default=128)
+    parser.add_argument('--char2concept_dim', type=int, default=128)
+    # --cnn_filter 3 256\
+    parser.add_argument('--embed_dim', type=int, default=512)
+    parser.add_argument('--ff_embed_dim', type=int, default=1024)
+    parser.add_argument('--num_heads', type=int, default=8)
+    parser.add_argument('--snt_layers', type=int, default=4)
+    parser.add_argument('--graph_layers', type=int, default=2)
+    parser.add_argument('--inference_layers', type=int, default=4)
 
-    parser.add_argument('--embed_dim', type=int)
-    parser.add_argument('--ff_embed_dim', type=int)
-    parser.add_argument('--num_heads', type=int)
-    parser.add_argument('--snt_layers', type=int)
-    parser.add_argument('--graph_layers', type=int)
-    parser.add_argument('--inference_layers', type=int)
 
+    parser.add_argument('--dropout', type=float, default=0.2)
+    parser.add_argument('--unk_rate', type=float, default=0.33)
 
-    parser.add_argument('--dropout', type=float)
-    parser.add_argument('--unk_rate', type=float)
-
-    parser.add_argument('--epochs', type=int)
+    parser.add_argument('--epochs', type=int, default=1000)
     parser.add_argument('--train_data', type=str)
     parser.add_argument('--dev_data', type=str)
 
-    parser.add_argument('--train_batch_size', type=int)
-    parser.add_argument('--batches_per_update', type=int)
-    parser.add_argument('--dev_batch_size', type=int)
-    parser.add_argument('--lr_scale', type=float)
-    parser.add_argument('--warmup_steps', type=int)
+    parser.add_argument('--train_batch_size', type=int, default=4444)
+    parser.add_argument('--batches_per_update', type=int, default=4)
+    parser.add_argument('--dev_batch_size', type=int, default=4444)
+    parser.add_argument('--lr_scale', type=float, default=1)
+    parser.add_argument('--warmup_steps', type=int, default=2000)
 
     parser.add_argument('--resume_ckpt', type=str, default=None)
-    parser.add_argument('--ckpt', type=str)
-    parser.add_argument('--print_every', type=int)
-    parser.add_argument('--eval_every', type=int)
-
+    parser.add_argument('--ckpt', type=str, default='ckpt')
+    parser.add_argument('--print_every', type=int, default=100)
+    parser.add_argument('--eval_every', type=int, default=1000)
-    parser.add_argument('--world_size', type=int)
-    parser.add_argument('--gpus', type=int)
-    parser.add_argument('--MASTER_ADDR', type=str)
-    parser.add_argument('--MASTER_PORT', type=str)
-    parser.add_argument('--start_rank', type=int)
+    parser.add_argument('--world_size', type=int, default=1)
+    parser.add_argument('--gpus', type=int, default=1)
+    parser.add_argument('--MASTER_ADDR', type=str, default='localhost')
+    parser.add_argument('--MASTER_PORT', type=str, default='29505')
+    parser.add_argument('--start_rank', type=int, default=0)
 
     return parser.parse_args()
@@ -124,10 +136,15 @@ def main(local_rank, args):
         p.requires_grad = False
 
     torch.manual_seed(19940117)
-    torch.cuda.manual_seed_all(19940117)
    random.seed(19940117)
 
-    torch.cuda.set_device(local_rank)
-    device = torch.device('cuda', local_rank)
+    use_gpu = True
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(19940117)
+        torch.cuda.set_device(local_rank)
+        device = torch.device('cuda', local_rank)
+    else:
+        device = torch.device('cpu')
+        use_gpu = False
 
     model = Parser(vocabs, args.word_char_dim, args.word_dim, args.pos_dim, args.ner_dim,
@@ -143,7 +160,8 @@ def main(local_rank, args):
         torch.cuda.manual_seed_all(19940117 + dist.get_rank())
         random.seed(19940117+dist.get_rank())
 
-    model = model.cuda(local_rank)
+    if use_gpu:
+        model = model.cuda(local_rank)
 
     dev_data = DataLoader(vocabs, lexical_mapping, args.dev_data, args.dev_batch_size, for_train=False)
     pp = PostProcessor(vocabs['rel'])
diff --git a/parser/work.py b/parser/work.py
index 976696f..2a35210 100644
--- a/parser/work.py
+++ b/parser/work.py
@@ -1,12 +1,12 @@
 import torch
 
-from parser.data import Vocab, DataLoader, DUM, END, CLS, NIL
-from parser.parser import Parser
-from parser.postprocess import PostProcessor
-from parser.extract import LexicalMap
-from parser.utils import move_to_device
-from parser.bert_utils import BertEncoderTokenizer, BertEncoder
-from parser.match import match
+from data import Vocab, DataLoader, DUM, END, CLS, NIL
+from amr_parser import Parser
+from postprocess import PostProcessor
+from extract import LexicalMap
+from utils import move_to_device
+from bert_utils import BertEncoderTokenizer, BertEncoder
+from match import match
 
 import argparse, os, re
 
diff --git a/prepare.sh b/prepare.sh
index a51b952..e23bee8 100755
--- a/prepare.sh
+++ b/prepare.sh
@@ -1,5 +1,5 @@
 dataset=$1
-python3 -u -m parser.extract --train_data ${dataset}/train.txt.features.preproc
+python3 -u -m parser.extract --train_data ${dataset}
 mv *_vocab ${dataset}/
 # python3 encoder.py
 # cat ${dataset}/*embed | sort | uniq > ${dataset}/glove.embed.txt
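
Note on the CPU fallback added in parser/train.py above: CUDA seeding, torch.cuda.set_device, and model.cuda() now run only when a GPU is actually present. A minimal standalone sketch of the same pattern, runnable on a CPU-only machine (the seed 19940117 and local_rank come from the diff; select_device and the nn.Linear stand-in for Parser are illustrative, not part of the repository):

    import torch

    def select_device(local_rank, seed=19940117):
        # Seed and select CUDA only when it is available; otherwise fall back to CPU.
        use_gpu = torch.cuda.is_available()
        if use_gpu:
            torch.cuda.manual_seed_all(seed)
            torch.cuda.set_device(local_rank)
            device = torch.device('cuda', local_rank)
        else:
            device = torch.device('cpu')
        return device, use_gpu

    device, use_gpu = select_device(local_rank=0)
    model = torch.nn.Linear(4, 2)  # stand-in for the real Parser(...)
    if use_gpu:
        model = model.cuda(device.index)  # mirrors `model = model.cuda(local_rank)` in train.py

Carrying the use_gpu flag around, rather than re-querying torch.cuda.is_available() at each call site, matches what the diff does and keeps the later .cuda() call consistent with the device chosen at startup.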