Trying to add doc.sents to rel_component example's get_instances function #12680

yofayed · 2023-05-26T16:01:19Z

yofayed
May 26, 2023

Hello,
I am trying to improve the rel_component by using doc.sents from the sentencizer or parser component. I followed the instructions in the "predictions from preceding components" section of https://spacy.io/usage/training and modified my config file to include the parser in annotating_components, as shown below, but I keep getting this error when I call doc.sents in the get_instances function:

File "C:\rel-model\scripts\rel_model.py", line 79, in get_instances
for sent in doc.sents:
File "spacy\tokens\doc.pyx", line 890, in sents
ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: nlp.add_pipe('sentencizer'). Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting doc[i].is_sent_start.

Here are my modifications to the config file
...
pipeline = ["tok2vec", "parser","relation_extractor"]
...
[components.parser]
source = "en_core_web_lg"
...
[training]
...
annotating_components = ["parser"]

What do I have to do to include the sents from the parser or sentencizer in training this rel_component model so that I can access them in the model's get_instances function?

Thank You!

Answered by shadeMe

May 31, 2023

I'd suggest doing the following:

Use replace_listeners for the sourced parser so that it gets its own copy of the tok2vec layer.
Remove tok2vec from the frozen and annotating components so that it's just used for relation_extractor.
Ensure that tok2vec has include_static_vectors = true so that it's using the vectors that have been included from en_core_web_lg.

View full answer

vinbo8 · 2023-05-30T08:53:23Z

vinbo8
May 30, 2023

Could you paste your full config?

2 replies

yofayed May 30, 2023
Author

Below is my config file. Do I have to modify the parser_data.py to include the "sent_starts"? Thanks

[paths]
train = null
dev = null
raw = null
init_tok2vec = null

[system]
seed = 342
gpu_allocator = null

[nlp]
lang = "en"
pipeline = ["tok2vec","parser","relation_extractor"]
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
batch_size = 1000

[components]

[components.tok2vec]
#factory = "tok2vec"
source = "en_core_web_lg"

[components.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = "en_core_web_lg"
width = 96
depth = 2
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true

[components.parser]
source = "en_core_web_lg"

[components.relation_extractor]
factory = "relation_extractor"
threshold = 0.5

[components.relation_extractor.model]
@architectures = "rel_model.v1"

[components.relation_extractor.model.create_instance_tensor]
@architectures = "rel_instance_tensor.v1"

[components.relation_extractor.model.create_instance_tensor.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = 96

[components.relation_extractor.model.create_instance_tensor.pooling]
@layers = "reduce_mean.v1"

[components.relation_extractor.model.create_instance_tensor.get_instances]
@misc = "rel_instance_generator.v1"
max_length = 15

[components.relation_extractor.model.classification_layer]
@architectures = "rel_classification_layer.v1"
nI = null
nO = null

[initialize]
vocab_data = null
vectors = null
init_tok2vec = ${paths.init_tok2vec}
after_init = {"@callbacks":"sentinel_customize_tokenizer"}

[initialize.before_init]
@callbacks = "spacy.copy_from_base_model.v1"
tokenizer = "en_core_web_lg"
vocab = "en_core_web_lg"

[initialize.components]

[corpora]

[corpora.dev]
@readers = "Gold_ents_Corpus.v1"
file = ${paths.dev}

[corpora.train]
@readers = "Gold_ents_Corpus.v1"
file = ${paths.train}

[training]
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600000
max_epochs = 0
max_steps = 10000
eval_frequency = 500
frozen_components = ["tok2vec","parser"]
annotating_components = ["tok2vec","parser"]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
before_to_disk = null
logger = {"@loggers":"spacy.ConsoleLogger.v1"}

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]
rel_micro_p = 0.0
rel_micro_r = 0.0
rel_micro_f = 1.0

shadeMe May 31, 2023

I'd suggest doing the following:

Use replace_listeners for the sourced parser so that it gets its own copy of the tok2vec layer.
Remove tok2vec from the frozen and annotating components so that it's just used for relation_extractor.
Ensure that tok2vec has include_static_vectors = true so that it's using the vectors that have been included from en_core_web_lg.

Answer selected by yofayed

yofayed · 2023-05-31T23:42:31Z

yofayed
May 31, 2023
Author

Thanks for your response @shadeMe. I will try these suggestions and let you know how it turns out.
I added tok2vec to the frozen list in an attempt to fix the following error, which seems to happen only when I loop through doc.sents in the get_instances function.

ValueError: operands could not be broadcast together with shapes (1122,36) (150,36)

This happens in the "get_loss" function when calculating the gradient
gradient = scores - truths

0 replies

yofayed · 2023-06-01T16:36:03Z

yofayed
Jun 1, 2023
Author

Hello @shadeMe, I modified the config as shown below according to your suggestions. The model trains fine as long as I don't loop through doc.sents in the get_instances function; otherwise I get a "ValueError: operands could not be broadcast together with shapes (200,36) (196,36)"

Config:

[paths]
train = null
dev = null
raw = null
init_tok2vec = null

[system]
seed = 342
gpu_allocator = null

[nlp]
lang = "en"
pipeline = ["tok2vec","parser","relation_extractor"]
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
#tokenizer = {"@Tokenizers":"spacy.Tokenizer.v1"}
batch_size = 1000

[components]

[components.tok2vec]
#factory = "tok2vec"
source = "en_core_web_lg"
#Ensure that tok2vec has include_static_vectors = true so that it's using the vectors that have been included from en_core_web_lg.
include_static_vectors = true

[components.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = "en_core_web_lg"
width = 96
depth = 2
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true

[components.parser]
source = "en_core_web_lg"
#Use replace_listeners for the sourced parser so that it gets its own copy of the tok2vec layer.
replace_listeners = ["model.tok2vec"]