diff --git a/Clarifying_questions/LitModel_trainer.py b/Clarifying_questions/LitModel_trainer.py new file mode 100644 index 0000000..e57edb0 --- /dev/null +++ b/Clarifying_questions/LitModel_trainer.py @@ -0,0 +1,123 @@
+import transformers
+from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, Dataset
+import pandas as pd
+import numpy as np
+import torch.nn.functional as F
+import pytorch_lightning as pl
+import torch
+from pytorch_lightning.callbacks import ModelCheckpoint
+import math
+import random
+import re
+import argparse
+import wandb
+from rouge import Rouge
+from nltk.translate.bleu_score import sentence_bleu
+from bert_score import score
+from Clarifying_questions.Mischallenaous import shift_tokens_right
+import logging
+
+class LitModel(pl.LightningModule):
+    # Instantiate the model
+    def __init__(self, learning_rate, tokenizer, model, hparams):
+        super().__init__()
+        self.tokenizer = tokenizer
+        self.model = model
+        self.learning_rate = learning_rate
+        # self.freeze_encoder = freeze_encoder
+        # self.freeze_embeds_ = freeze_embeds
+        self.save_hyperparameters(hparams)
+
+        # .get_encoder() returns the encoder sub-module of the seq2seq model
+        if self.hparams.freeze_encoder:
+            checker = self.freeze_params(self.model.get_encoder())
+
+        if self.hparams.freeze_embeds:
+            checker = self.freeze_embeds()
+
+    def freeze_embeds(self):
+        # freezes the shared, positional and token embeddings
+        ''' freeze the embedding parameters of the model; adapted from finetune.py '''
+        checker = self.freeze_params(self.model.model.shared)
+        for d in [self.model.model.encoder, self.model.model.decoder]:
+            checker = self.freeze_params(d.embed_positions)
+            checker = self.freeze_params(d.embed_tokens)
+
+        # try:
+        #     checker = self.freeze_params(self.model.model.shared)
+        #     for d in [self.model.model.encoder, self.model.model.decoder]:
+        #         checker = self.freeze_params(d.embed_positions)
+        #         checker = self.freeze_params(d.embed_tokens)
+        # except AttributeError:
+        #     checker = self.freeze_params(self.model.shared)
+        #     for d in [self.model.encoder, self.model.decoder]:
+        #         checker = self.freeze_params(d.embed_tokens)
+
+    # Do a forward pass through the model
+    def forward(self, input_ids, **kwargs):
+        return self.model(input_ids, **kwargs)
+
+    def configure_optimizers(self):
+        optimizer = torch.optim.Adam(self.parameters(), lr = self.learning_rate)
+        return optimizer
+
+    def training_step(self, batch, batch_idx):
+        # Load the data into variables
+        src_ids, src_mask = batch[0], batch[1]
+        tgt_ids = batch[2]
+        # Shift the decoder tokens right (but NOT the tgt_ids)
+        decoder_input_ids = shift_tokens_right(tgt_ids, self.tokenizer.pad_token_id)
+
+        # Run the model and get the logits
+        outputs = self(src_ids, attention_mask=src_mask, decoder_input_ids=decoder_input_ids, use_cache=False)
+        lm_logits = outputs[0]
+        # Create the loss function
+        ce_loss_fct = torch.nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
+        # Calculate the loss on the un-shifted tokens
+        loss = ce_loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), tgt_ids.view(-1))
+
+        return {'loss':loss}
+
+    def validation_step(self, batch, batch_idx):
+        src_ids, src_mask = batch[0], batch[1]
+        tgt_ids = batch[2]
+
+        decoder_input_ids = shift_tokens_right(tgt_ids, self.tokenizer.pad_token_id)
+
+        # Run the model and get the logits
+        outputs = self(src_ids, attention_mask=src_mask, decoder_input_ids=decoder_input_ids, use_cache=False)
+        lm_logits = outputs[0]
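+        # Same objective as training_step: token-level cross-entropy on the
+        # un-shifted targets, with padded positions excluded via ignore_index.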
+        ce_loss_fct = torch.nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
+        val_loss = ce_loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), tgt_ids.view(-1))
+
+        wandb.log({"val_loss": val_loss})
+        return {'loss': val_loss}
+
+    # Method that generates text using the BartForConditionalGeneration's generate() method
+    def generate_text(self, text, eval_beams, early_stopping = True, max_len = 40):
+        ''' Function to generate text '''
+        generated_ids = self.model.generate(
+            input_ids= text["input_ids"].to(torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")),
+            attention_mask=text["attention_mask"].to(torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")),
+            use_cache=True,
+            decoder_start_token_id = self.tokenizer.pad_token_id,
+            num_beams= eval_beams,
+            max_length = max_len,
+            early_stopping = early_stopping
+        )
+        return [self.tokenizer.decode(w, skip_special_tokens=True, clean_up_tokenization_spaces=True) for w in generated_ids]
+
+    def freeze_params(self,model):
+        ''' Function that takes a model as input (or part of a model) and freezes the layers for faster training
+            adapted from finetune.py '''
+        for layer in model.parameters():
+            layer.requires_grad = False
+        return 1
\ No newline at end of file
diff --git a/Clarifying_questions/Main_trainer.py b/Clarifying_questions/Main_trainer.py new file mode 100644 index 0000000..ea88c62 --- /dev/null +++ b/Clarifying_questions/Main_trainer.py @@ -0,0 +1,161 @@
+import transformers
+from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, Dataset
+import pandas as pd
+import numpy as np
+import torch.nn.functional as F
+import pytorch_lightning as pl
+import torch
+from pytorch_lightning.callbacks import ModelCheckpoint
+import math
+import random
+import re
+import argparse
+import wandb
+from rouge import Rouge
+from nltk.translate.bleu_score import sentence_bleu
+from bert_score import score
+from pytorch_lightning.callbacks import TQDMProgressBar
+from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig, AutoTokenizer
+from SummaryDataModule import SummaryDataModule
+from LitModel_trainer import LitModel
+import logging
+from evaluate import load
+torch.cuda.empty_cache()
+
+bertscore = load("bertscore")
+
+model_name = "facebook/bart-large"
+# model_path = "Models/facebook-bart-large.ckpt"
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+def generate_prediction(seed_line, model_):
+    # Put the model in eval mode
+    model_.to(device)
+    model_.eval()
+
+    prompt_line_tokens = tokenizer(seed_line, max_length = 192, return_tensors = "pt", padding=True,truncation = True)
+
+    line = model_.generate_text(prompt_line_tokens, eval_beams = 8)
+
+    return line
+
+
+def run_tests(testfile,model_loaded):
+    df = pd.read_csv(testfile)
+    df[['predicted', 'Blue', 'Blue_1gram', 'Blue_2gram', 'Blue_3gram']] = ''
+    df[['rouge_1_r', 'rouge_1_p', 'rouge_1_f', 'rouge_2_r','rouge_2_p', 'rouge_2_f', 'rouge_l_r', 'rouge_l_p' ,'rouge_l_f']] = ''
+    df[['bert_p', 'bert_r', 'bert_f1']] = ''
+    for i in range(len(df)):
+        line = df.iloc[i]['source']
+        question_pred = generate_prediction(seed_line = line, model_ = model_loaded)
+        question_true = df.iloc[i]['target']
+        Blue_score = sentence_bleu([question_true.split()], question_pred[0].split())
+        Blue_score_1n = sentence_bleu([question_true.split()], question_pred[0].split(), weights=(1, 0, 0, 0))
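+        # sentence_bleu(references, hypothesis): the ground-truth question is the
+        # reference and the generated question is the hypothesis; the weights pick
+        # which n-gram precisions count, e.g. (1, 0, 0, 0) scores unigram overlap only.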
+        Blue_score_2n = sentence_bleu([question_true.split()], question_pred[0].split(), weights=(0, 1, 0, 0))
+        Blue_score_3n = sentence_bleu([question_true.split()], question_pred[0].split(), weights=(0, 0, 1, 0))
+
+        rouge = Rouge()
+        # logging.info(question_pred[0])
+        # logging.info(question_true)
+        # print(question_pred[0])
+        # print(question_true)
+        rouge_res = rouge.get_scores(question_pred[0], question_true)
+
+        df.loc[i, 'predicted'] = question_pred[0]
+        df.loc[i, 'Blue'] = Blue_score
+        df.loc[i, 'Blue_1gram'] = Blue_score_1n
+        df.loc[i, 'Blue_2gram'] = Blue_score_2n
+        df.loc[i, 'Blue_3gram'] = Blue_score_3n
+        df.loc[i, 'rouge_1_r'] = rouge_res[0]["rouge-1"]['r']
+        df.loc[i, 'rouge_1_p'] = rouge_res[0]["rouge-1"]['p']
+        df.loc[i, 'rouge_1_f'] = rouge_res[0]["rouge-1"]['f']
+        df.loc[i, 'rouge_2_r'] = rouge_res[0]["rouge-2"]['r']
+        df.loc[i, 'rouge_2_p'] = rouge_res[0]["rouge-2"]['p']
+        df.loc[i, 'rouge_2_f'] = rouge_res[0]["rouge-2"]['f']
+        df.loc[i, 'rouge_l_r'] = rouge_res[0]["rouge-l"]['r']
+        df.loc[i, 'rouge_l_p'] = rouge_res[0]["rouge-l"]['p']
+        df.loc[i, 'rouge_l_f'] = rouge_res[0]["rouge-l"]['f']
+        print("iteration")
+
+    output_file = "facebook_bart-large"+'_metrics.csv'
+
+    results = bertscore.compute(predictions = df['predicted'].values.tolist(), references = df['target'].values.tolist(), lang="en", verbose=True)
+    df['bert_p'] = results['precision']
+    df['bert_r'] = results['recall']
+    df['bert_f1'] = results['f1']
+    df.to_csv(output_file,index=False)
+
+
+logging.basicConfig(filename = "logss.log", level =logging.INFO, filemode = "w")
+base_dir = ''
+wandb.init()
+wandb.config = {
+    "learning_rate": 0.00002,
+    "epochs": 10,
+    "batch_size": 64
+}
+logging.info("The run name on wandb is {}".format(wandb.run.name))
+
+hparams = argparse.Namespace()
+hparams.freeze_encoder = True
+hparams.freeze_embeds = True
+hparams.eval_beams = 4
+
+
+tokenizer = BartTokenizer.from_pretrained(model_name, add_prefix_space=True)
+# tokenizer = BartTokenizer.from_pretrained(model_path)
+
+
+bart_model = BartForConditionalGeneration.from_pretrained(model_name)
+
+
+summary_data = SummaryDataModule(tokenizer, base_dir + '6-non_generic_train_snippets_cluster.csv',
+                                 batch_size = 4)
+
+model = LitModel(learning_rate = 2e-5, tokenizer = tokenizer, model = bart_model, hparams = hparams)
+
+
+checkpoint = ModelCheckpoint(dirpath=base_dir)
+
+trainer = pl.Trainer(gpus = 1,
+                     max_epochs = 1,
+                     min_epochs = 1,
+                     auto_lr_find = True,
+                     callbacks=[checkpoint,TQDMProgressBar(refresh_rate=100)])
+
+trainer.fit(model, summary_data)
+
+trainer.save_checkpoint(base_dir + "./Models/facebook-bart-large_2.ckpt")
+
+
+#run_tests("test.csv",model)
+# run_tests("7-openAI-clustered.csv",model)
+
+
+line_pred = generate_prediction(seed_line = ["Samsung | Television , Smartphone, Soundbox , Computer , Vaccum ",
+                                             "Samsung | Stockmarket, CEO, Devices, Headquarter",
+                                             "mercedes cla class convertible | exterior , interior , engine , prices , competition",
+                                             "Selena Gomez | Age , Birthday , Albums , Livingplace",
+                                             "Weather | wind , temperature, precipitation, humidity , visibility | Weather is controlled by many factors, "],
+                                model_ = model)
+
+print(line_pred)
diff --git a/Clarifying_questions/Main_trainer_T5.py b/Clarifying_questions/Main_trainer_T5.py new file mode 100644 index 0000000..a6943ec --- /dev/null +++ b/Clarifying_questions/Main_trainer_T5.py @@ -0,0 +1,123 @@
+import transformers
+from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, Dataset
+import pandas as pd
+import numpy as np
+import torch.nn.functional as F
+import pytorch_lightning as pl
+import torch
+from pytorch_lightning.callbacks import ModelCheckpoint
+import math
+import random
+import re
+import argparse
+import wandb
+from rouge import Rouge
+from nltk.translate.bleu_score import sentence_bleu
+from bert_score import score
+from pytorch_lightning.callbacks import TQDMProgressBar
+from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig, T5ForConditionalGeneration, AutoTokenizer
+import logging
+from evaluate import load
+from simpletransformers.t5 import T5Model, T5Args
+
+
+
+def make_dataset(input_file):
+    data_df = pd.read_csv(input_file)
+    data_df.rename(columns = {'source':'input_text'}, inplace = True)
+    data_df.rename(columns = {'target':'target_text'}, inplace = True)
+    data_df['prefix'] = 'generate question'
+    train_df, evaluation_df = np.split(data_df.sample(frac=1), [int(.7*len(data_df))])
+    return (train_df,evaluation_df)
+
+
+def generate_prediction(seed_line, model_):
+    # Prepend the T5 task prefix before predicting
+    seed_line = ['generate question: {0}'.format(line) for line in seed_line]
+    print(seed_line[:10])
+    line = model_.predict(seed_line)
+    return line
+
+
+def run_tests(testfile,model_loaded):
+    df = pd.read_csv(testfile)
+    df[['predicted', 'Blue', 'Blue_1gram', 'Blue_2gram', 'Blue_3gram']] = ''
+    df[['rouge_1_r', 'rouge_1_p', 'rouge_1_f', 'rouge_2_r','rouge_2_p', 'rouge_2_f', 'rouge_l_r', 'rouge_l_p' ,'rouge_l_f']] = ''
+    df[['bert_p', 'bert_r', 'bert_f1']] = ''
+    question_predictions = generate_prediction(seed_line = df["source"].values.tolist(), model_ = model_loaded)
+    for i in range(len(df)):
+        # T5Model.predict returns one string per input
+        question_pred = question_predictions[i]
+
+        question_true = df.iloc[i]['target']
+        Blue_score = sentence_bleu([question_true.split()], question_pred.split())
+        Blue_score_1n = sentence_bleu([question_true.split()], question_pred.split(), weights=(1, 0, 0, 0))
+        Blue_score_2n = sentence_bleu([question_true.split()], question_pred.split(), weights=(0, 1, 0, 0))
+        Blue_score_3n = sentence_bleu([question_true.split()], question_pred.split(), weights=(0, 0, 1, 0))
+
+        rouge = Rouge()
+        rouge_res = rouge.get_scores(question_pred, question_true)
+
+        df.loc[i, 'predicted'] = question_pred
+        df.loc[i, 'Blue'] = Blue_score
+        df.loc[i, 'Blue_1gram'] = Blue_score_1n
+        df.loc[i, 'Blue_2gram'] = Blue_score_2n
+        df.loc[i, 'Blue_3gram'] = Blue_score_3n
+        df.loc[i, 'rouge_1_r'] = rouge_res[0]["rouge-1"]['r']
+        df.loc[i, 'rouge_1_p'] = rouge_res[0]["rouge-1"]['p']
+        df.loc[i, 'rouge_1_f'] = rouge_res[0]["rouge-1"]['f']
+        df.loc[i, 'rouge_2_r'] = rouge_res[0]["rouge-2"]['r']
+        df.loc[i, 'rouge_2_p'] = rouge_res[0]["rouge-2"]['p']
+        df.loc[i, 'rouge_2_f'] = rouge_res[0]["rouge-2"]['f']
+        df.loc[i, 'rouge_l_r'] = rouge_res[0]["rouge-l"]['r']
+        df.loc[i, 'rouge_l_p'] = rouge_res[0]["rouge-l"]['p']
+        df.loc[i, 'rouge_l_f'] = rouge_res[0]["rouge-l"]['f']
+        print("iteration")
+
+    output_file = 't5-base'+'_metrics.csv'
+
+    results = bertscore.compute(predictions = df['predicted'].values.tolist(), references = df['target'].values.tolist(), lang="en", verbose=True)
+    df['bert_p'] = results['precision']
+    df['bert_r'] = results['recall']
+    df['bert_f1'] = results['f1']
+    df.to_csv(output_file,index=False)
+
+def main(memory_limit):
+    logging.basicConfig(filename = "logss.log", level =logging.INFO, filemode = "w")
+    train_df,evaluation_df = make_dataset('6-non_generic_train_snippets_cluster.csv')
+    model_args = T5Args()
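+    # simpletransformers T5Args (values set below): cap inputs at 200 tokens, use
+    # small train/eval batches, run one epoch with evaluation every 1000 steps;
+    # gradient checkpointing trades extra compute for lower GPU memory.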
+ model_args.max_seq_length = 200 + model_args.train_batch_size = 8 + model_args.eval_batch_size = 8 + model_args.num_train_epochs = 1 + model_args.evaluate_during_training = True + model_args.evaluate_during_training_steps = 1000 + model_args.use_multiprocessing = True + model_args.save_eval_checkpoints = False + model_args.gradient_checkpointing=True + model_args.optimizer_class="AdamW" + model_args.overwrite_output_dir = True + model = T5Model("t5", "t5-base", args=model_args) + + model.train_model(train_df, eval_data=evaluation_df,output_dir='t5_base_1', show_running_loss=True) + # model = T5Model("t5", 't5_base/checkpoint-910-epoch-1') + run_tests("test.csv",model) + +if __name__ == "__main__": + torch.cuda.empty_cache() + bertscore = load("bertscore") + main(memory_limit=48000) + + + + + + + + + + + diff --git a/Clarifying_questions/Main_trainer_load_Faspect.py b/Clarifying_questions/Main_trainer_load_Faspect.py new file mode 100644 index 0000000..092030d --- /dev/null +++ b/Clarifying_questions/Main_trainer_load_Faspect.py @@ -0,0 +1,67 @@ +import transformers +from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, Dataset +import pandas as pd +import numpy as np +import torch.nn.functional as F +import pytorch_lightning as pl +import torch +from pytorch_lightning.callbacks import ModelCheckpoint +import math +import random +import re +import argparse +import wandb +from rouge import Rouge +from nltk.translate.bleu_score import sentence_bleu +from bert_score import score +from pytorch_lightning.callbacks import TQDMProgressBar +from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig, AutoTokenizer,AutoConfig +from Clarifying_questions.SummaryDataModule import SummaryDataModule +from Clarifying_questions.LitModel_trainer import LitModel +from huggingface_hub import hf_hub_download +import logging +from rouge import Rouge +from nltk.translate.bleu_score import sentence_bleu +from evaluate import load + + +torch.cuda.empty_cache() + +bertscore = load("bertscore") + +model_name = "facebook/bart-base" +model_path = "Models/facebook-bart-large.ckpt" + +class Clarifying_question: + + def __init__(self, model_path="umass/bart-base-mimics-question-generation"): + self.tokenizer = BartTokenizer.from_pretrained(model_name, add_prefix_space=True) + self.bart_model = BartForConditionalGeneration.from_pretrained(model_name) + self.hparams = argparse.Namespace() + self.hparams.freeze_encoder = True + self.hparams.freeze_embeds = True + self.hparams.eval_beams = 4 + # self.model_loaded = LitModel.load_from_checkpoint("Clarifying_questions/Models/facebook-bart-base.ckpt", learning_rate = 2e-5, tokenizer = self.tokenizer, model = self.bart_model, hparams = self.hparams) + self.model_path = hf_hub_download(repo_id="umass/bart-base-mimics-question-generation", filename="facebook-bart-base.ckpt") + self.model_loaded = LitModel.load_from_checkpoint(self.model_path, learning_rate = 2e-5, tokenizer = self.tokenizer, model = self.bart_model, hparams = self.hparams) + + def generate_prediction(self,seed_line): + self.model_loaded.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) + self.model_loaded.eval() + + prompt_line_tokens = self.tokenizer(seed_line, max_length = 192, return_tensors = "pt", truncation = True) + + line = self.model_loaded.generate_text(prompt_line_tokens, eval_beams = 8) + + return line + + + + + +model = Clarifying_question(model_path) +seed_line = "Samsung | Stockmarket, CEO, Devices, Headquarter" +line_pred = 
model.generate_prediction(seed_line = seed_line)
+
+print(seed_line)
+print(line_pred)
\ No newline at end of file
diff --git a/Clarifying_questions/Mischallenaous.py b/Clarifying_questions/Mischallenaous.py new file mode 100644 index 0000000..5d0f2fb --- /dev/null +++ b/Clarifying_questions/Mischallenaous.py @@ -0,0 +1,30 @@
+import transformers
+from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, Dataset
+import pandas as pd
+import numpy as np
+import torch.nn.functional as F
+import pytorch_lightning as pl
+import torch
+from pytorch_lightning.callbacks import ModelCheckpoint
+import math
+import random
+import re
+import argparse
+import wandb
+from rouge import Rouge
+from nltk.translate.bleu_score import sentence_bleu
+from bert_score import score
+from pytorch_lightning.callbacks import TQDMProgressBar
+
+from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig
+import logging
+
+def shift_tokens_right(input_ids, pad_token_id):
+    """ Shift input ids one token to the right, and wrap the last non pad token (usually <eos>).
+        This is taken directly from modeling_bart.py
+    """
+    prev_output_tokens = input_ids.clone()
+    index_of_eos = (input_ids.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
+    prev_output_tokens[:, 0] = input_ids.gather(1, index_of_eos).squeeze()
+    prev_output_tokens[:, 1:] = input_ids[:, :-1]
+    return prev_output_tokens
\ No newline at end of file
diff --git a/Clarifying_questions/SummaryDataModule.py b/Clarifying_questions/SummaryDataModule.py new file mode 100644 index 0000000..56192b4 --- /dev/null +++ b/Clarifying_questions/SummaryDataModule.py @@ -0,0 +1,59 @@
+import transformers
+from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, Dataset
+import pandas as pd
+import numpy as np
+import torch.nn.functional as F
+import pytorch_lightning as pl
+import torch
+from pytorch_lightning.callbacks import ModelCheckpoint
+import math
+import random
+import re
+import argparse
+import wandb
+from rouge import Rouge
+from nltk.translate.bleu_score import sentence_bleu
+from bert_score import score
+import logging
+
+from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig
+from Clarifying_questions.encode_sentences_noise_sentence import encode_sentences, noise_sentence
+linebreak = "*"*100
+
+class SummaryDataModule(pl.LightningDataModule):
+    def __init__(self, tokenizer, data_file, batch_size, num_examples = 20000):
+        super().__init__()
+        self.tokenizer = tokenizer
+        self.data_file = data_file
+        self.batch_size = batch_size
+        self.num_examples = num_examples
+
+    # Loads and splits the data into training, validation and test sets with a 70/15/15 split
+    def prepare_data(self):
+        # self.data = pd.read_csv(self.data_file)[:self.num_examples]
+        self.data = pd.read_csv(self.data_file)
+        self.train, self.validate, self.test = np.split(self.data.sample(frac=1), [int(.7*len(self.data)), int(.85*len(self.data))])
+
+    # encode the sentences using the tokenizer
+    def setup(self, stage):
+        # tokenize each split's source/target columns into tensors
+        self.train = encode_sentences(self.tokenizer, self.train['source'], self.train['target'])
+        self.validate = encode_sentences(self.tokenizer, self.validate['source'], self.validate['target'])
+        self.test = encode_sentences(self.tokenizer, self.test['source'], self.test['target'])
+
+    # Load the training, validation and test sets in Pytorch Dataset objects
+    def train_dataloader(self):
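+        # Each split is a TensorDataset of (input_ids, attention_mask, labels);
+        # LitModel.training_step/validation_step unpack batches positionally in
+        # exactly this order (batch[0], batch[1], batch[2]).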
+        dataset = TensorDataset(self.train['input_ids'], self.train['attention_mask'], self.train['labels'])
+        train_data = DataLoader(dataset, sampler = RandomSampler(dataset), batch_size = self.batch_size)
+        return train_data
+
+    def val_dataloader(self):
+        dataset = TensorDataset(self.validate['input_ids'], self.validate['attention_mask'], self.validate['labels'])
+        val_data = DataLoader(dataset, batch_size = self.batch_size)
+        return val_data
+
+    def test_dataloader(self):
+        dataset = TensorDataset(self.test['input_ids'], self.test['attention_mask'], self.test['labels'])
+        test_data = DataLoader(dataset, batch_size = self.batch_size)
+        return test_data
\ No newline at end of file
diff --git a/Clarifying_questions/encode_sentences_noise_sentence.py b/Clarifying_questions/encode_sentences_noise_sentence.py new file mode 100644 index 0000000..b4a17ca --- /dev/null +++ b/Clarifying_questions/encode_sentences_noise_sentence.py @@ -0,0 +1,102 @@
+import transformers
+from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, Dataset
+import pandas as pd
+import numpy as np
+import torch.nn.functional as F
+import pytorch_lightning as pl
+import torch
+from pytorch_lightning.callbacks import ModelCheckpoint
+import math
+import random
+import re
+import argparse
+import wandb
+from rouge import Rouge
+from nltk.translate.bleu_score import sentence_bleu
+from bert_score import score
+from pytorch_lightning.callbacks import TQDMProgressBar
+import logging
+
+from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig
+def encode_sentences(tokenizer, source_sentences, target_sentences, max_length=32, pad_to_max_length=True, return_tensors="pt"):
+    ''' Function that tokenizes a sentence
+        Args: tokenizer - the BART tokenizer; source and target sentences are the source and target sentences
+        Returns: Dictionary with keys: input_ids, attention_mask, labels
+    '''
+
+    input_ids = []
+    attention_masks = []
+    target_ids = []
+    tokenized_sentences = {}
+
+    for sentence in source_sentences:
+        encoded_dict = tokenizer(
+            sentence,
+            max_length=max_length,
+            padding="max_length" if pad_to_max_length else None,
+            truncation=True,
+            return_tensors=return_tensors,
+            add_prefix_space = True
+        )
+
+        input_ids.append(encoded_dict['input_ids'])
+        attention_masks.append(encoded_dict['attention_mask'])
+
+    input_ids = torch.cat(input_ids, dim = 0)
+    attention_masks = torch.cat(attention_masks, dim = 0)
+    for sentence in target_sentences:
+        encoded_dict = tokenizer(
+            sentence,
+            max_length=max_length,
+            padding="max_length" if pad_to_max_length else None,
+            truncation=True,
+            return_tensors=return_tensors,
+            add_prefix_space = True
+        )
+        # Shift the target ids to the right
+        # shifted_target_ids = shift_tokens_right(encoded_dict['input_ids'], tokenizer.pad_token_id)
+        target_ids.append(encoded_dict['input_ids'])
+
+    target_ids = torch.cat(target_ids, dim = 0)
+
+
+    batch = {
+        "input_ids": input_ids,
+        "attention_mask": attention_masks,
+        "labels": target_ids,
+    }
+
+    return batch
+
+
+def noise_sentence(sentence_, percent_words, replacement_token = "<mask>"):
+    '''
+    Function that noises a sentence by adding <mask> tokens
+    Args: sentence - the sentence to noise
+          percent_words - the percent of words to replace with <mask> tokens; the number is rounded up using math.ceil
+    Returns a noised sentence
+    '''
+    # Create a list item and copy
+    sentence_ = sentence_.split(' ')
+    sentence = sentence_.copy()
+
+    num_words = math.ceil(len(sentence) * percent_words)
+    # Create an array of token positions to sample from; don't include the last
+    # word as an option because in the case of lyrics that word is often a rhyming
+    # word and plays an important role in song construction
+    sample_tokens = set(np.arange(0, np.maximum(1, len(sentence)-1)))
+
+    words_to_noise = random.sample(list(sample_tokens), num_words)
+
+    # Swap out words, but not full stops
+    for pos in words_to_noise:
+        if sentence[pos] != '.':
+            sentence[pos] = replacement_token
+
+    # Remove redundant spaces
+    sentence = re.sub(r' {2,5}', ' ', ' '.join(sentence))
+
+    # Combine concurrent <mask> tokens into a single token; this just does two rounds of this; more could be done
+    sentence = re.sub(r'<mask> <mask>', "<mask>", sentence)
+    sentence = re.sub(r'<mask> <mask>', "<mask>", sentence)
+    return sentence
\ No newline at end of file
diff --git a/Clustering/clustering.py b/Clustering/clustering.py new file mode 100644 index 0000000..856168b --- /dev/null +++ b/Clustering/clustering.py @@ -0,0 +1,285 @@
+import logging
+import math
+import pickle
+import json
+import torch
+import numpy as np
+import random
+import collections
+from typing import List, Union
+from sentence_transformers import SentenceTransformer, InputExample
+from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
+from sentence_transformers.losses import MultipleNegativesRankingLoss
+from sentence_transformers.models import Pooling, Transformer
+from sentence_transformers.util import dot_score
+from torch.utils.data import DataLoader
+from sklearn.cluster import DBSCAN, KMeans
+import wandb
+import os
+os.environ["NCCL_DEBUG"] = "INFO"
+from collections import defaultdict
+from torch.cuda.amp import GradScaler, autocast
+from torch.utils.data import DataLoader
+from torch.utils.tensorboard import SummaryWriter
+from tqdm import tqdm
+from transformers import AdamW, get_linear_schedule_with_warmup
+from facet_ranker import FacetRanker
+from utils_ranking_model import PairwiseRankingDataset, RankingDataset, collate_batch, PairwiseRankingDatasetRandom, PointwiseRankingDataset
+from utils_ranking_model import convert_example_to_features,collate_batch
+from matplotlib import pyplot as plt
+from scipy.special import softmax
+
+
+model_path = 'weights_microsoft/mpnet-base'
+line_break = '*'*50
+
+
+
+class Clustering:
+
+    def __init__(self, model_path=None):
+        self.model = SentenceTransformer(model_path)
+        self.model_type = "distilroberta-base"
+        self.ranking_model = FacetRanker(model_path="weights_ranker_5",
+                                         model_type=self.model_type,
+                                         use_gpu=True,
+                                         parallel=True,
+                                         max_seq_length=512)
+        self.tokenizer = self.ranking_model.tokenizer
+
+
+    def generate_the_training_data(self,input_file):
+        queryMap = {}
+        with open(input_file, encoding="utf-8") as f:
+            for query_id, line in enumerate(f):
+                d = json.loads(line.rstrip("\n"))
+                queryMap[d['query']] = d['snippets']
+        return queryMap
+
+    def generate_the_training_data_2(self,input_file):
+        queryMap = {}
+        with open(input_file, encoding="utf-8") as f:
+            for query_id, line in enumerate(f):
+                d = json.loads(line.rstrip("\n"))
+                queryMap[d['query']] = {}
+                queryMap[d['query']]['snippets'] = d['snippets']
+                queryMap[d['query']]['groundtruth'] = d['groundtruth']
+                queryMap[d['query']]['negatives'] = d['negatives']
+        return queryMap
+
+
+    def extract_facets(self,facet_extractor,query,documents):
+        facets = facet_extractor.extract_facets(query,
+                                                documents,
+                                                aggregation="round-robin",
+                                                mmr_lambda=0.5,
+                                                classification_threshold=0.05,
+                                                classification_topk=0)
+        return facets
+
+
+    def cluster_facets(self,facets_embeddings):
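+        # DBSCAN over the facet embeddings: eps is the neighbourhood radius in
+        # embedding space, and min_samples=1 makes every facet a core point, so
+        # nothing is labelled noise and singleton clusters are allowed.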
+        clustering = DBSCAN(eps=0.485,min_samples=1).fit(facets_embeddings)
+        return clustering.labels_
+
+    def get_average_scores_positives(self,input_file):
+        total_score = 0
+        total_snippet = 0
+        minscore = 300000
+        max_score = -300000
+        i = 0
+        with open(input_file, encoding="utf-8") as f:
+            for query_id, line in enumerate(f):
+                d = json.loads(line.rstrip("\n"))
+                logging.info(line_break)
+                scores_ = self.ranking_cluster(d['query'],d['snippets'],d['groundtruth'])
+                for j, true in enumerate(d['groundtruth']):
+                    logging.info(true+" {}".format(scores_[j]))
+                logging.info(line_break)
+                scores_ = self.ranking_cluster(d['query'],d['snippets'],d['negatives'])
+                for j, true in enumerate(d['negatives']):
+                    logging.info(true+" {}".format(scores_[j]))
+
+                for each in scores_:
+                    total_score+=each
+                    total_snippet+=1
+                    if(each<minscore):
+                        minscore = each
+                    if(each>max_score):
+                        max_score = each
+                i+=1
+                if(i>10):
+                    break
+        logging.info("the average score is {}".format(total_score/total_snippet))
+        logging.info("the min score is {}".format(minscore))
+        logging.info("the max score is {}".format(max_score))
+
+
+    def batch_generator(self,query,snippets,facet_list):
+        batch = []
+        for facet in facet_list:
+            text = "{} || {} || {}".format(query,facet,snippets)
+            features = convert_example_to_features(
+                text,
+                512,
+                self.tokenizer,
+                cls_token_at_end=bool(self.model_type in ["xlnet"]),
+                cls_token=self.tokenizer.cls_token,
+                cls_token_segment_id=2 if self.model_type in ["xlnet"] else 0,
+                sep_token=self.tokenizer.sep_token,
+                sep_token_extra=False,
+                pad_on_left=bool(self.tokenizer.padding_side == "left"),
+                pad_token=self.tokenizer.pad_token_id,
+                pad_token_segment_id=self.tokenizer.pad_token_type_id)
+            batch.append(features)
+        return collate_batch(batch, all_features=True)
+
+    # def precision(self,facet_extractor,input_file):
+    def ranking_cluster_histogram(self,facet_extractor,input_file):
+        query_snippets = self.generate_the_training_data_2(input_file)
+        positives_scores = np.empty(0)
+        negatives_scores = np.empty(0)
+        for query in query_snippets:
+            # for positives
+            snippets = query_snippets[query]['snippets']
+            groundtruth_facet_list = query_snippets[query]['groundtruth']
+            data = self.batch_generator(query,snippets,groundtruth_facet_list)
+            input_ids, attention_mask = data
+            scores_ = self.ranking_model.forward(input_ids, attention_mask).data.cpu().numpy().flatten()
+            positives_scores = np.concatenate((positives_scores,1/(1 + np.exp(-scores_))))
+            negatives_facet_list = query_snippets[query]['negatives']
+            data = self.batch_generator(query,snippets,negatives_facet_list)
+            input_ids, attention_mask = data
+            scores_ = self.ranking_model.forward(input_ids, attention_mask).data.cpu().numpy().flatten()
+
+            negatives_scores = np.concatenate((negatives_scores,1/(1 + np.exp(-scores_))))
+
+        bins = np.linspace(-10, 10, 100)
+        plt.hist(positives_scores, bins = 100, alpha=0.5, label='groundtruth')
+        plt.hist(negatives_scores, bins = 100, alpha=0.5, label='negatives')
+        plt.legend(loc='upper right')
+        plt.show()
+        plt.savefig("graph.png")
+
+
+    def merge(self,list1, list2,type_facet=True,checker=0.9):
+        tuples = []
+        for i in range(0,len(list1)):
+            if(list2[i]>=checker):
+                tuples.append((list1[i],list2[i],type_facet))
+
+        return tuples
+
+    def precision(self,facet_extractor,input_file):
+        average_precision = 0
+        query_snippets = self.generate_the_training_data_2(input_file)
+        for query in query_snippets:
+            snippets = query_snippets[query]['snippets']
+            groundtruth_facet_list = query_snippets[query]['groundtruth']
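+            # forward() returns raw logits; 1/(1 + exp(-x)) is the sigmoid that
+            # maps them into (0, 1) so they can be thresholded as probabilities.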
+            data = self.batch_generator(query,snippets,groundtruth_facet_list)
+            input_ids, attention_mask = data
+            scores_ = self.ranking_model.forward(input_ids, attention_mask).data.cpu().numpy().flatten()
+            scores_ = 1/(1 + np.exp(-scores_))
+            positive_tuples = self.merge(groundtruth_facet_list,scores_,True)
+            negatives_facet_list = query_snippets[query]['negatives']
+            data = self.batch_generator(query,snippets,negatives_facet_list)
+            input_ids, attention_mask = data
+            scores_ = self.ranking_model.forward(input_ids, attention_mask).data.cpu().numpy().flatten()
+            scores_ = 1/(1 + np.exp(-scores_))
+            negative_tuples = self.merge(negatives_facet_list,scores_,False)
+            facets_tuple = positive_tuples+negative_tuples
+            sorted_tuples = sorted(facets_tuple, key=lambda x: x[1], reverse=True)
+            # logging.info(sorted_tuples)
+            total=min(5,len(sorted_tuples))
+            relevant = 0
+            for i in range(min(5,len(sorted_tuples))):
+                if(sorted_tuples[i][2]==True):
+                    relevant+=1
+            # logging.info(relevant/total)
+            average_precision+=(relevant/total)
+        return average_precision/len(query_snippets)
+
+    def ranking_cluster_threshold(self,query,snippets,facet_list):
+        batch = []
+        data = self.batch_generator(query,snippets,facet_list)
+        input_ids, attention_mask = data
+        scores_ = self.ranking_model.forward(input_ids, attention_mask).data.cpu().numpy()
+        scores_ = 1/(1 + np.exp(-scores_))
+        relevant_facets = []
+        for i, facet in enumerate(facet_list):
+            if(scores_[i]>=0.9):
+                relevant_facets.append(facet)
+        return relevant_facets
+
+
+    def ranking_cluster_partition(self,query,snippets,facet_list):
+        batch = []
+        data = self.batch_generator(query,snippets,facet_list)
+        input_ids, attention_mask = data
+        scores_ = self.ranking_model.forward(input_ids, attention_mask).data.cpu().numpy()
+
+        kmeans = KMeans(n_clusters=2)
+        kmeans.fit(scores_)
+        # this is finding the dominant cluster
+        cluster_scores=[[],[]]
+        for i, score in enumerate(scores_):
+            cluster_scores[kmeans.labels_[i]].append(score)
+        relevant_cluster = 0 if (sum(cluster_scores[0])/len(cluster_scores[0])) > (sum(cluster_scores[1])/len(cluster_scores[1])) else 1
+
+        relevant_facets = []
+        for i, facet in enumerate(facet_list):
+            if(kmeans.labels_[i]==relevant_cluster):
+                relevant_facets.append(facet)
+        logging.info(relevant_facets)
+        logging.info(facet_list)
+        return relevant_facets
+
+
+
+    def run_clustering(self,facet_extractor,input_file,output_file):
+        query_snippets = self.generate_the_training_data(input_file)
+        query_clusters = {}
+        j =0
+        for query in query_snippets:
+            facet_list = self.extract_facets(facet_extractor,query,query_snippets[query])
+            relevant_facets = self.ranking_cluster_threshold(query,query_snippets[query],facet_list)
+            cluster_labels= self.cluster_facets(self.model.encode(relevant_facets))
+            clusters = [[] for _ in range(len(set(cluster_labels)))]
+            for i,facet in enumerate(relevant_facets):
+                clusters[cluster_labels[i]].append(facet)
+            query_clusters[query] = clusters
+            j+=1
+        self.write(query_clusters,output_file)
+
+    def cluster_facets_query(self,query,snippets,facets):
+        cluster_labels = self.cluster_facets(self.model.encode(facets))
+        clusters = [[] for _ in range(len(set(cluster_labels)))]
+        for i,facet in enumerate(facets):
+            clusters[cluster_labels[i]].append(facet)
+        return clusters
+
+
+
+
+    def write(self, query_clusters,output_file):
+        with open(output_file, "w") as outfile:
+            for query in query_clusters:
+                dictionary = {
+                    "query": query,
+                    "facet_clusters": query_clusters[query],
+                }
+                json_string = json.dumps(dictionary)
+                outfile.write(json_string+"\n")
+
+
+
+# logging.basicConfig(filename = "logss2.log", level =logging.INFO, filemode = "w")
+# cluster_object = Clustering(model_path)
+# cluster_object.run_clustering(facet_extractor, 'dev.jsonl', 'clusters.txt')
+# cluster_object.ranking_cluster_histogram()
+# cluster_object.get_average_scores_positives('dev.jsonl')
+
+
+model = SentenceTransformer(model_path)
\ No newline at end of file
diff --git a/Clustering/subClustering.py b/Clustering/subClustering.py new file mode 100644 index 0000000..e048e2a --- /dev/null +++ b/Clustering/subClustering.py @@ -0,0 +1,108 @@
+import logging
+import math
+import pickle
+import json
+import torch
+import numpy as np
+import random
+import collections
+from typing import List, Union
+from sentence_transformers import SentenceTransformer, InputExample
+from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
+from sentence_transformers.losses import MultipleNegativesRankingLoss
+from sentence_transformers.models import Pooling, Transformer
+from sentence_transformers.util import dot_score
+from torch.utils.data import DataLoader
+from sklearn.cluster import DBSCAN, KMeans
+import wandb
+import os
+os.environ["NCCL_DEBUG"] = "INFO"
+from collections import defaultdict
+from torch.cuda.amp import GradScaler, autocast
+from torch.utils.data import DataLoader
+from torch.utils.tensorboard import SummaryWriter
+from tqdm import tqdm
+from transformers import AdamW, get_linear_schedule_with_warmup
+from matplotlib import pyplot as plt
+from scipy.special import softmax
+
+
+model_path = 'weights_microsoft/mpnet-base'
+line_break = '*'*50
+
+
+
+class Clustering:
+
+    def __init__(self, model_path='umass/mpnet-base-mimics-query-facet-encoder'):
+        self.model = SentenceTransformer(model_path)
+
+
+
+    def cluster_facets(self,facets_embeddings):
+        clustering = DBSCAN(eps=0.785,min_samples=1).fit(facets_embeddings)
+        return clustering.labels_
+
+
+
+    def ranking_cluster_threshold(self,query,snippets,facet_list):
+        batch = []
+        data = self.batch_generator(query,snippets,facet_list)
+        input_ids, attention_mask = data
+        scores_ = self.ranking_model.forward(input_ids, attention_mask).data.cpu().numpy()
+        scores_ = 1/(1 + np.exp(-scores_))
+        relevant_facets = []
+        for i, facet in enumerate(facet_list):
+            if(scores_[i]>=0.9):
+                relevant_facets.append(facet)
+        return relevant_facets
+
+
+    # def run_clustering(self,input_file,output_file):
+    #     query_snippets = self.generate_the_training_data(input_file)
+    #     query_clusters = {}
+    #     j =0
+    #     for query in query_snippets:
+    #         facet_list = self.extract_facets(facet_extractor,query,query_snippets[query])
+    #         relevant_facets = self.ranking_cluster_threshold(query,query_snippets[query],facet_list)
+    #         cluster_labels= self.cluster_facets(self.model.encode(relevant_facets))
+    #         clusters = [[] for _ in range(len(set(cluster_labels)))]
+    #         for i,facet in enumerate(relevant_facets):
+    #             clusters[cluster_labels[i]].append(facet)
+    #         query_clusters[query] = clusters
+    #         j+=1
+    #     self.write(query_clusters,output_file)
+
+    def cluster_facets_query(self,query,snippets,facets):
+        adder = query + " "
+        facets = [adder + s for s in facets]
+        cluster_labels = self.cluster_facets(self.model.encode(facets))
+        clusters = [[] for _ in range(len(set(cluster_labels)))]
+        for i,facet in enumerate(facets):
+            clusters[cluster_labels[i]].append(facet)
+        return clusters
+
+
+
+
+    def write(self, query_clusters,output_file):
+        with open(output_file, "w") as outfile:
+            for query in query_clusters:
+                dictionary = {
+                    "query": query,
+                    "facet_clusters": query_clusters[query],
+                }
+                json_string = json.dumps(dictionary)
+                outfile.write(json_string+"\n")
+
+
+
+
+# cluster_object.ranking_cluster_histogram()
+# cluster_object.get_average_scores_positives('dev.jsonl')
+
diff --git a/Scoring/facet_ranker.py b/Scoring/facet_ranker.py new file mode 100644 index 0000000..4337773 --- /dev/null +++ b/Scoring/facet_ranker.py @@ -0,0 +1,94 @@
+import os
+import torch
+import logging
+from torch import nn
+from torch.nn import DataParallel
+import transformers
+from torch.utils.data import DataLoader
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+from Scoring.utils_ranking_model import collate_batch, RankingDataset
+from huggingface_hub import hf_hub_download
+
+
+class FacetRanker(nn.Module):
+    def __init__(self,
+                 model_path="umass/roberta-base-mimics-facet-reranker",
+                 model_type="roberta-base",
+                 use_gpu=True,
+                 parallel=False,
+                 debug=False, max_seq_length= 512):
+        super(FacetRanker, self).__init__()
+
+        self.model_type = model_type
+
+        configuration = AutoConfig.from_pretrained(self.model_type)
+        if model_path is None:
+            # no checkpoint given: start from the pretrained encoder weights
+            self.bert = AutoModel.from_pretrained(self.model_type)
+        else:
+            # a checkpoint is given: build from config only; its weights are loaded below
+            self.bert = AutoModel.from_config(configuration)
+
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_type)
+        self.hidden_dim = configuration.hidden_size
+        self.max_seq_length = max_seq_length
+        self.score = nn.Linear(self.hidden_dim, 1)
+
+        if parallel:
+            self.bert = DataParallel(self.bert)
+        if model_path is not None:
+            model_path = hf_hub_download(repo_id=model_path, filename="model.state_dict")
+            sdict = torch.load(model_path, map_location=lambda storage, loc: storage)
+            self.load_state_dict(sdict, strict=False)
+        self.device = torch.device("cuda") if use_gpu else torch.device("cpu")
+        self.to(self.device)
+        self.debug = debug
+
+
+    def pairwise_loss(self, batch):
+        pos_logits = self.forward(batch[0], batch[1])
+        neg_logits = self.forward(batch[2], batch[3])
+        loss = torch.mean(torch.log(1 + torch.exp(-torch.sub(pos_logits, neg_logits))), dim=0)
+        return loss
+
+    def pointwise_loss(self, batch):
+        loss_fn = nn.BCELoss()
+        scores = torch.sigmoid(self.forward(batch[0], batch[1]))
+        loss = loss_fn(scores, batch[2].view(-1, 1))
+        return loss
+
+    def forward(self, input_ids, attention_mask):
+        input_ids = input_ids.to(self.device)
+        attention_mask = attention_mask.to(self.device)
+        for x in range(len(input_ids)):
+            if(len(input_ids[x])!=512 or len(attention_mask[x])!=512):
+                logging.info("the length is not equal to 512 of the input ids or the attention mask in the forward function {} {}".format(len(input_ids),len(attention_mask)))
+        cls = self.bert(input_ids=input_ids, attention_mask=attention_mask)[1]
+        scores = self.score(cls)
+        return scores
+
+    def score_facets(self, query_facet_snippets, batch_size=8):
+        dataset = RankingDataset(data=query_facet_snippets,
+                                 max_seq_length=self.max_seq_length,
+                                 model_type=self.model_type,
+                                 tokenizer=self.tokenizer)
+        dataloader = DataLoader(dataset=dataset,
+                                batch_size=batch_size,
+                                shuffle=False,
+                                collate_fn=collate_batch)
+        scores = []
+        for input_ids, attention_mask in dataloader:
+            scores_ = self.forward(input_ids, attention_mask).data.cpu().numpy()
+            #scores_ = [s[0] for s in scores_]
+            scores.extend(scores_)
+        return scores
+
+    def load_model(self, sdict):
+        self.load_state_dict(sdict)
+        self.to(self.device)
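+    # save_model persists the full state_dict (encoder weights plus the linear
+    # scoring head); __init__ downloads and reloads the same "model.state_dict"
+    # file, so the two must stay symmetric.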
+    def save_model(self, output_path):
+        model_name = 'model.state_dict'
+        opath = os.path.join(output_path, model_name)
+        torch.save(self.state_dict(), opath)
diff --git a/Scoring/run_train.py b/Scoring/run_train.py new file mode 100644 index 0000000..9eeba29 --- /dev/null +++ b/Scoring/run_train.py @@ -0,0 +1,283 @@
+import logging
+import math
+import random
+import wandb
+import torch
+import os
+import numpy as np
+os.environ["NCCL_DEBUG"] = "INFO"
+
+from collections import defaultdict
+from torch.cuda.amp import GradScaler, autocast
+
+from torch.utils.data import DataLoader
+from torch.utils.tensorboard import SummaryWriter
+from tqdm import tqdm
+from transformers import AdamW, get_linear_schedule_with_warmup
+
+from Prevfacet_ranker import FacetRanker
+from utils_ranking_model import PairwiseRankingDataset, RankingDataset, collate_batch, PairwiseRankingDatasetRandom, PointwiseRankingDataset
+
+
+def evaluate(model, dataloader):
+
+    model.eval()
+    scores = defaultdict(lambda: [])
+    for input_ids, attention_mask, qids, labels in tqdm(dataloader, position=1):
+        scores_ = model.forward(input_ids, attention_mask).data.cpu().numpy()
+        for i, score in enumerate(scores_):
+            r = random.random()
+            scores[qids[i]].append((score, labels[i], r))
+
+    # for i in range(len(qids)):
+    #     r = random.random()
+    #     scores[qids[i]].append((labels[i], r))
+
+    avg_ndcg = 0
+    avg_len = 0
+    mrr = 0
+    for _, scores_ in scores.items():
+        sorted_scores = sorted(scores_, key=lambda x: x[0], reverse=True)
+        relevant = 0
+        dcg = 0
+        mr = 0
+        avg_len += len(sorted_scores)
+        for i, (score,label,rscore) in enumerate(sorted_scores):
+            if label == 1:
+                relevant += 1
+                dcg += 1 / math.log(2 + i)
+                if mr == 0:
+                    mr = 1 / (i+1)
+        idcg = 0
+        for i in range(relevant):
+            idcg += 1 / math.log(2 + i)
+
+        ndcg = dcg / idcg if idcg > 0 else 0
+        avg_ndcg += ndcg
+        mrr += mr
+    total_queries = len(scores.keys())
+
+    avg_ndcg /= total_queries
+    mrr /= total_queries
+    avg_len /= total_queries
+
+    logging.info("The ndcg and mrr are {},{}".format(avg_ndcg,mrr))
+
+    model.train()
+    return avg_ndcg, mrr
+
+
+def train(output_path="weights_ranker",
+          model_type="distilroberta-base",
+          use_snippets=True,
+          train_batch_size=16,
+          eval_batch_size=24,
+          lr=1e-5,
+          accumulation_steps=4,
+          warmup_steps=1000,
+          max_seq_length=512,
+          epochs=3,
+          eval_steps=1000,
+          log_steps=20,
+          train_path="train.jsonl",
+          dev_path="dev.jsonl",
+          test_path="test.jsonl",
+          use_gpu=True,
+          parallel=True,
+          fp16=False,
+          wandb_log=True):
+
+    if wandb_log:
+        wandb.init(project="facet-extraction")
+        wandb.config = {
+            "learning_rate": lr,
+            "epochs": epochs,
+            "batch_size": train_batch_size
+        }
+
+    model = FacetRanker(model_type=model_type,
+                        use_gpu=True,
+                        parallel=parallel,
+                        max_seq_length=max_seq_length)
+
+
+
+    # train_dataset = PairwiseRankingDataset(data_path=train_path,
+    #                                        max_seq_length=max_seq_length,
+    #                                        model_type=model_type,
+    #                                        tokenizer=model.tokenizer,
+    #                                        num_negatives=4,
+    #                                        queries_per_batch=2)
+    # train_dataloader = train_dataset.batch_generator()
+    train_dataset = PointwiseRankingDataset(data_path=train_path,
+                                            max_seq_length=max_seq_length,
+                                            model_type=model_type,
+                                            tokenizer=model.tokenizer,
+                                            batch_size=train_batch_size,
+                                            random_split=0.2)
+
+    train_dataloader = train_dataset.batch_generator()
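+    # PointwiseRankingDataset yields (input_ids, attention_mask, label) batches
+    # for the BCE pointwise loss; the commented-out PairwiseRankingDataset above
+    # is the margin-based alternative trained on positive/negative pairs.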
+    dev_dataset = RankingDataset(data_path=dev_path,
+                                 use_snippets=use_snippets,
+                                 max_seq_length=max_seq_length,
+                                 model_type=model_type,
+                                 tokenizer=model.tokenizer)
+
+    dev_dataloader = DataLoader(dataset=dev_dataset,
+                                batch_size=eval_batch_size,
+                                shuffle=False,
+                                collate_fn=collate_batch)
+
+    test_dataset = RankingDataset(data_path=test_path,
+                                  use_snippets=use_snippets,
+                                  max_seq_length=max_seq_length,
+                                  model_type=model_type,
+                                  tokenizer=model.tokenizer)
+    test_dataloader = DataLoader(dataset=test_dataset,
+                                 batch_size=eval_batch_size,
+                                 shuffle=False,
+                                 collate_fn=collate_batch)
+
+
+    total_examples = train_dataset.total_examples
+    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-6, correct_bias=False)
+    total_steps = math.ceil(total_examples / (train_batch_size * accumulation_steps)) * epochs
+    scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)
+    writer = SummaryWriter()
+    best_ndcg = 0
+    steps = 0
+    accumulated_steps = 0
+    running_loss = 0.0
+    batch_loss_value = 0.0
+    scaler = GradScaler()
+    for epoch in range(epochs):
+        iterator = tqdm(train_dataloader, position=0)
+        for batch in iterator:
+            if fp16:
+                with autocast():
+                    loss = model.pointwise_loss(batch)
+                scaler.scale(loss).backward()
+            else:
+                loss = model.pointwise_loss(batch)
+                loss.backward()
+
+
+
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+            # perform an optimizer update every accumulation_steps batches (and on the final batch)
+            if ((steps + 1) % accumulation_steps == 0) or (steps + 1 == total_steps):
+
+                batch_loss_value = loss.item()
+                running_loss += batch_loss_value
+                if fp16:
+                    scaler.step(optimizer)
+                    scaler.update()
+                else:
+                    optimizer.step()
+
+                scheduler.step()
+                optimizer.zero_grad()
+                accumulated_steps += 1
+                if accumulated_steps % eval_steps == 0:
+                    with torch.no_grad():
+                        dev_ndcg, dev_mrr = evaluate(model, dev_dataloader)
+                        test_ndcg, test_mrr = evaluate(model, test_dataloader)
+                    writer.add_scalar("dev_ndcg", dev_ndcg, accumulated_steps)
+                    writer.add_scalar("dev_mrr", dev_mrr, accumulated_steps)
+                    writer.add_scalar("test_ndcg", test_ndcg, accumulated_steps)
+                    writer.add_scalar("test_mrr", test_mrr, accumulated_steps)
+                    if wandb_log:
+                        wandb.log({"dev_ndcg": dev_ndcg})
+                        wandb.log({"dev_mrr": dev_mrr})
+                        wandb.log({"test_ndcg": test_ndcg})
+                        wandb.log({"test_mrr": test_mrr})
+                    if dev_ndcg > best_ndcg:
+                        best_ndcg = dev_ndcg
+                        model.save_model(output_path=output_path)
+                # log the running loss every log_steps updates
+                if accumulated_steps % log_steps == 0:
+                    writer.add_scalar("loss", running_loss / log_steps, accumulated_steps)
+                    if wandb_log:
+                        wandb.log({"loss": running_loss / log_steps})
+                    running_loss = 0.0
+
+            iterator.set_description("loss: {}, acc_steps: {}/{}".format(batch_loss_value,
+                                                                         accumulated_steps,
+                                                                         total_steps))
+
+            steps += 1
+
+
+def eval(model_type="distilroberta-base",
+         eval_batch_size=16,
+         max_seq_length=512,
+         dev_path="dev.jsonl",
+         test_path="test_scoring.jsonl",
+         use_gpu=True,
+         parallel=True):
+
+    model = FacetRanker(model_path="models/weights_ranker_5",
+                        model_type=model_type,
+                        use_gpu=use_gpu,
+                        parallel=parallel,
+                        max_seq_length=max_seq_length)
+
+
+    # dev_dataset = RankingDataset(data_path=dev_path,
+    #                              max_seq_length=max_seq_length,
+    #                              model_type=model_type,
+    #                              tokenizer=model.tokenizer)
+    # # wrapping the dataset using the dataloader
+    # dev_dataloader = DataLoader(dataset=dev_dataset,
+    #                             batch_size=eval_batch_size,
+    #                             shuffle=False,
+    #                             collate_fn=collate_batch)
+
+
+    test_dataset = RankingDataset(data_path=test_path,
+                                  max_seq_length=max_seq_length,
+                                  model_type=model_type,
+                                  tokenizer=model.tokenizer)
+
+    test_dataloader = DataLoader(dataset=test_dataset,
+                                 batch_size=eval_batch_size,
+                                 shuffle=False,
+                                 collate_fn=collate_batch)
+    # dev_ndcg, dev_mrr = evaluate(model, dev_dataloader)
+    test_ndcg, test_mrr = evaluate(model, test_dataloader)
+    return test_ndcg, test_mrr
+
+
+
+if __name__ == "__main__":
+    wandb.init()
+    logging.basicConfig(filename = "logss.log", level =logging.INFO, filemode = "w")
+    logging.info("The run name on wandb is {}".format(wandb.run.name))
+
+    # train(output_path="weights_ranker_5",
+    #       model_type="distilroberta-base",
+    #       use_snippets=True,
+    #       train_batch_size=32,
+    #       eval_batch_size=32,
+    #       accumulation_steps=1,
+    #       lr=1e-5,
+    #       warmup_steps=100,
+    #       max_seq_length=512,
+    #       epochs=1000,
+    #       eval_steps=1000,
+    #       log_steps=10,
+    #       wandb_log=True,
+    #       train_path="train.jsonl",
+    #       dev_path="dev.jsonl",
+    #       test_path="test.jsonl",
+    #       use_gpu=True,
+    #       parallel=True,
+    #       fp16=False)
+
+    eval()
diff --git a/Scoring/utils_ranking_model.py b/Scoring/utils_ranking_model.py new file mode 100644 index 0000000..62da045 --- /dev/null +++ b/Scoring/utils_ranking_model.py @@ -0,0 +1,640 @@
+import json
+import logging
+import random
+import re
+from collections import defaultdict
+from sys import exit
+
+import torch
+
+from typing import List, Optional
+from torch.utils.data import Dataset, DataLoader
+from transformers import PreTrainedTokenizer, AutoTokenizer
+from syntok.tokenizer import Tokenizer
+
+# this function returns the input ids and the attention mask
+def convert_example_to_features(
+    text: str,
+    max_seq_length: int,
+    tokenizer: PreTrainedTokenizer,
+    cls_token_at_end=False,
+    cls_token="[CLS]",
+    use_segment_ids=False,
+    cls_token_segment_id=0,
+    sep_token="[SEP]",
+    sep_token_extra=True,
+    pad_on_left=False,
+    pad_token=0,
+    pad_token_segment_id=11,
+    mask_padding_with_zero=True,
+):
+    # tokenize the text
+    tokens_ = tokenizer.tokenize(text)
+
+    # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
+    # getting the number of extra special tokens (2 here)
+    special_tokens_count = tokenizer.num_special_tokens_to_add()
+
+    # truncate to the max sequence length
+    if len(tokens_) > max_seq_length - special_tokens_count:
+        tokens_ = tokens_[: (max_seq_length - special_tokens_count)]
+
+    # replace the || with the separator token and build the array of segment ids
+    tokens = []
+    segment_ids = []
+    segment_id = 0
+    for word in tokens_:
+        segment_ids.append(segment_id)
+        if word == "Ġ||":
+            tokens.append(sep_token)
+            segment_id += 1
+        else:
+            tokens.append(word)
+    # add the trailing separator
+    tokens += [sep_token]
+    segment_ids.append(segment_id)
+
+    # callers in this repo pass sep_token_extra=False, so this is normally skipped
+    if sep_token_extra:
+        # roberta uses an extra separator b/w pairs of sentences
+        tokens += [sep_token]
+        segment_ids.append(segment_id)
+
+
+    # adding the cls token
+
+    if cls_token_at_end:
+        tokens += [cls_token]
+        segment_ids += [cls_token_segment_id]
+    else:
+        tokens = [cls_token] + tokens
+        segment_ids = [cls_token_segment_id] + segment_ids
+
+    input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+
+    # The mask has 1 for real tokens and 0 for padding tokens. Only real
+    # tokens are attended to.
+    attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+
+    # Zero-pad up to the sequence length.
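+    # Illustrative shapes (hypothetical token ids), max_seq_length=8, 4 real tokens:
+    #   input_ids      -> [0, 713, 16, 2, 1, 1, 1, 1]
+    #   attention_mask -> [1, 1, 1, 1, 0, 0, 0, 0]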
+ padding_length = max_seq_length - len(input_ids) + + if pad_on_left: + input_ids = ([pad_token] * padding_length) + input_ids + attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask + segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids + else: + input_ids += [pad_token] * padding_length + attention_mask += [0 if mask_padding_with_zero else 1] * padding_length + segment_ids += [pad_token_segment_id] * padding_length + if(len(input_ids)!=max_seq_length or len(attention_mask)!=max_seq_length): + logging.info("The length of inputs is wrong from the convert function{} {}".format(len(input_ids),len(attention_mask))) + assert len(input_ids) == max_seq_length + assert len(attention_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + if "token_type_ids" not in tokenizer.model_input_names: + segment_ids = None + + if use_segment_ids: + return input_ids, attention_mask, segment_ids + else: + return input_ids, attention_mask + + +class PairwiseRankingDataset: + def __init__( + self, + data_path: str, + tokenizer: PreTrainedTokenizer, + model_type: str, + max_seq_length: Optional[int] = None, + num_negatives=8, + queries_per_batch=8, + epochs=3 + ): + self.tokenizer = tokenizer + self.max_seq_length = max_seq_length + self.model_type = model_type + self.num_negatives = num_negatives + self.queries_per_batch = queries_per_batch + self.epochs = epochs + self.epoch = 0 + self.seg_tokenizer = Tokenizer() + # logging.info("Loading examples from {}...".format(data_path)) + + self.qid_index = 0 + self.qids = [] + self.qids_pos_index = [] + + self.queries = [] + self.snippets = [] + self.positives = [] + self.negatives = [] + self.total_examples = 0 + with open(data_path, encoding="utf-8") as f: + for query_id, line in enumerate(f): + d = json.loads(line.rstrip("\n")) + self.qids.append(query_id) + self.queries.append(d["query"]) + self.snippets.append(" || ".join(d["snippets"])) + self.positives.append(d["groundtruth"]) + self.negatives.append(d["negatives"]) + self.total_examples += len(d["groundtruth"])*min(num_negatives, len(d["negatives"])) + + self.shuffle_dataset() + + def shuffle_dataset(self): + self.epoch += 1 + self.qid_index = 0 + logging.info("Shuffling dataset...") + random.shuffle(self.qids) + self.qids_pos_index = [-1] * len(self.qids) + for i in range(len(self.qids)): + random.shuffle(self.positives[i]) + random.shuffle(self.negatives[i]) + + + def generate_negative_facets(self,snippets): + negatives = [] + if snippets=="": + return negatives + snippets_ = snippets.split(" || ") + for itofNeg in range(5): + n_length = random.randint(1, 5) + j = 0 + while j < 100: + j += 1 + snippet = random.choice(snippets_) + tokens = list(self.seg_tokenizer.tokenize(snippet)) + index = random.randint(0, max(0,len(tokens) - n_length)) + negative_tokens = tokens[index:index + n_length] + negative = " ".join([t.value for t in negative_tokens]).lower() + if re.match('^[a-z0-9\- ]+$', negative): + negatives.append(negative) + break + if j == 100: + return negatives + return negatives + + def get_next_ranklist(self): + self.qids_pos_index[self.qid_index] += 1 + while self.qids_pos_index[self.qid_index] == len(self.positives[self.qids[self.qid_index]]): + self.qid_index += 1 + if self.qid_index == len(self.qids): + self.shuffle_dataset() + # negatives = self.negatives[self.qids[self.qid_index]] + + negatives = self.generate_negative_facets(self.snippets[self.qids[self.qid_index]]) + + + return 
self.qids[self.qid_index], self.positives[self.qids[self.qid_index]][self.qids_pos_index[self.qid_index]], random.sample(negatives, min(self.num_negatives, len(negatives)))
+
+    def batch_generator(self):
+        while self.epoch <= self.epochs:
+            batch = []
+            for _ in range(self.queries_per_batch):
+                query_id, positive, negatives = self.get_next_ranklist()
+                # if self.epoch > self.epochs:
+                #     return
+                query = self.queries[query_id]
+                snippets = self.snippets[query_id]
+
+                positive_text = "{} || {} || {}".format(query, positive, snippets)
+                pos_features = convert_example_to_features(
+                    positive_text,
+                    self.max_seq_length,
+                    self.tokenizer,
+                    cls_token_at_end=bool(self.model_type in ["xlnet"]),
+                    cls_token=self.tokenizer.cls_token,
+                    cls_token_segment_id=2 if self.model_type in ["xlnet"] else 0,
+                    sep_token=self.tokenizer.sep_token,
+                    sep_token_extra=False,
+                    pad_on_left=bool(self.tokenizer.padding_side == "left"),
+                    pad_token=self.tokenizer.pad_token_id,
+                    pad_token_segment_id=self.tokenizer.pad_token_type_id)
+
+                for negative in negatives:
+                    negative_text = "{} || {} || {}".format(query, negative, snippets)
+                    neg_features = convert_example_to_features(
+                        negative_text,
+                        self.max_seq_length,
+                        self.tokenizer,
+                        cls_token_at_end=bool(self.model_type in ["xlnet"]),
+                        cls_token=self.tokenizer.cls_token,
+                        cls_token_segment_id=2 if self.model_type in ["xlnet"] else 0,
+                        sep_token=self.tokenizer.sep_token,
+                        sep_token_extra=False,
+                        pad_on_left=bool(self.tokenizer.padding_side == "left"),
+                        pad_token=self.tokenizer.pad_token_id,
+                        pad_token_segment_id=self.tokenizer.pad_token_type_id)
+                    features = pos_features + neg_features
+                    batch.append(features)
+            yield collate_batch(batch)
+
+
+class PointwiseRankingDataset:
+    def __init__(
+        self,
+        data_path: str,
+        tokenizer: PreTrainedTokenizer,
+        model_type: str,
+        max_seq_length: Optional[int] = None,
+        batch_size=8,
+        random_split=0.5
+    ):
+        self.tokenizer = tokenizer
+        self.seg_tokenizer = Tokenizer()
+        self.max_seq_length = max_seq_length
+        self.model_type = model_type
+        self.batch_size = batch_size
+        self.random_split = random_split
+
+        logging.info("the random split is {}".format(random_split))
+        self.qids = []
+        self.pos_index = defaultdict(lambda: 0)
+        self.neg_index = defaultdict(lambda: 0)
+
+        self.queries = []
+        self.snippets = []
+        self.positives = []
+        self.negatives = []
+        self.total_examples = 0
+        # parse the jsonl training data
+        with open(data_path, encoding="utf-8") as f:
+            for query_id, line in enumerate(f):
+                d = json.loads(line.rstrip("\n"))
+                self.qids.append(query_id)
+                self.queries.append(d["query"])
+                self.snippets.append(" || ".join(d["snippets"]))
+                self.positives.append(d["groundtruth"])
+                self.negatives.append(d["negatives"])
+                self.total_examples += len(d["groundtruth"]) + len(d["negatives"])
+
+
+    def batch_generator(self):
+        while True:
+
+            batch = []
+            for i in range(self.batch_size):
+                query_id = random.randint(0, len(self.queries) - 1)
+                query = self.queries[query_id]
+                snippets = ""
+                r = random.random()
+                # include the snippets with probability random_split
+                if(r < self.random_split):
+                    snippets = self.snippets[query_id]
+                label = 0.0
+                not_found = False
+                # if len(self.negatives[query_id]) == 0:
+                #     not_found = True
+                # else:
+                #     facet = self.negatives[query_id][self.neg_index[query_id]]
+                #     self.neg_index[query_id] = (self.neg_index[query_id] + 1) % len(self.negatives[query_id])
+                # else:
+                #     neg_qid = random.randint(0, len(self.queries) - 1)
+                #     while len(self.negatives[neg_qid]) < 2:
+                #         neg_qid = random.randint(0, len(self.queries) - 1)
+                #     neg_id = random.randint(0, len(self.negatives[neg_qid]) - 1)
+                #     facet = self.negatives[neg_qid][neg_id]
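+                # Even batch positions synthesize a negative by sampling a short
+                # random span from the snippets (label stays 0.0); odd positions,
+                # or a failed negative search, fall through to a positive facet
+                # with label 1.0, giving roughly balanced pointwise batches.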
+
+
+class PointwiseRankingDataset:
+    def __init__(
+        self,
+        data_path: str,
+        tokenizer: PreTrainedTokenizer,
+        model_type: str,
+        max_seq_length: Optional[int] = None,
+        batch_size=8,
+        random_split=0.5
+    ):
+        self.tokenizer = tokenizer
+        self.seg_tokenizer = Tokenizer()
+        self.max_seq_length = max_seq_length
+        self.model_type = model_type
+        self.batch_size = batch_size
+        self.random_split = random_split
+
+        logging.info("the random split is {}".format(random_split))
+        self.qids = []
+        self.pos_index = defaultdict(lambda: 0)
+        self.neg_index = defaultdict(lambda: 0)
+
+        self.queries = []
+        self.snippets = []
+        self.positives = []
+        self.negatives = []
+        self.total_examples = 0
+        # parse the JSONL data file: one query per line, with its snippets,
+        # ground-truth facets and annotated negatives
+        with open(data_path, encoding="utf-8") as f:
+            for query_id, line in enumerate(f):
+                d = json.loads(line.rstrip("\n"))
+                self.qids.append(query_id)
+                self.queries.append(d["query"])
+                self.snippets.append(" || ".join(d["snippets"]))
+                self.positives.append(d["groundtruth"])
+                self.negatives.append(d["negatives"])
+                self.total_examples += len(d["groundtruth"]) + len(d["negatives"])
+
+    def batch_generator(self):
+        while True:
+            batch = []
+            for i in range(self.batch_size):
+                query_id = random.randint(0, len(self.queries) - 1)
+                query = self.queries[query_id]
+                not_found = False
+                label = 0.0
+                snippets = ""
+                r = random.random()
+                # reconstructed condition (the original line was corrupted):
+                # include snippets with probability (1 - random_split), so that
+                # snippet-based negatives can be sampled below
+                if r >= self.random_split:
+                    snippets = self.snippets[query_id]
+                if i % 2 == 0:
+                    # even positions: try to build a negative facet from a short
+                    # random span of one of the snippets
+                    if snippets == "":
+                        not_found = True
+                    else:
+                        n_length = random.randint(1, 5)
+                        snippets_ = snippets.split(" || ")
+                        for _ in range(100):
+                            snippet = random.choice(snippets_)
+                            tokens = list(self.seg_tokenizer.tokenize(snippet))
+                            index = random.randint(0, max(0, len(tokens) - n_length))
+                            negative_tokens = tokens[index:index + n_length]
+                            negative = " ".join([t.value for t in negative_tokens]).lower()
+                            if re.match(r'^[a-z0-9\- ]+$', negative):
+                                facet = negative
+                                break
+                        else:
+                            not_found = True
+                if not_found or i % 2 == 1:
+                    # odd positions, or failed negative sampling: use a
+                    # ground-truth facet, cycling through them per query
+                    label = 1.0
+                    facet = self.positives[query_id][self.pos_index[query_id]]
+                    self.pos_index[query_id] = (self.pos_index[query_id] + 1) % len(self.positives[query_id])
+
+                text = "{} || {} || {}".format(query, facet, snippets)
+
+                features = convert_example_to_features(
+                    text,
+                    self.max_seq_length,
+                    self.tokenizer,
+                    cls_token_at_end=bool(self.model_type in ["xlnet"]),
+                    cls_token=self.tokenizer.cls_token,
+                    cls_token_segment_id=2 if self.model_type in ["xlnet"] else 0,
+                    sep_token=self.tokenizer.sep_token,
+                    sep_token_extra=False,
+                    pad_on_left=bool(self.tokenizer.padding_side == "left"),
+                    pad_token=self.tokenizer.pad_token_id,
+                    pad_token_segment_id=self.tokenizer.pad_token_type_id)
+                features = features + (label,)
+                batch.append(features)
+            yield collate_batch(batch, all_features=True)
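A minimal usage sketch for the pointwise generator, assuming a test.jsonl whose lines carry the `query`, `snippets`, `groundtruth` and `negatives` keys parsed in `__init__` (this mirrors the `run()` helper at the bottom of this file):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    dataset = PointwiseRankingDataset(data_path="test.jsonl",
                                      tokenizer=tokenizer,
                                      model_type="distilroberta-base",
                                      max_seq_length=512,
                                      batch_size=4)

    # each batch is [input_ids, attention_mask, labels]; even positions are
    # (usually) sampled negatives with label 0.0, odd positions are
    # ground-truth facets with label 1.0
    input_ids, attention_mask, labels = next(dataset.batch_generator())
    print(input_ids.shape, labels)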
+
+
+class PairwiseRankingDatasetRandom(Dataset):
+    def __init__(
+        self,
+        data_path: str,
+        tokenizer: PreTrainedTokenizer,
+        model_type: str,
+        max_seq_length: Optional[int] = None,
+        random_split=0.5,
+        use_snippets=True
+    ):
+        self.tokenizer = tokenizer
+        self.seg_tokenizer = Tokenizer()
+        self.max_seq_length = max_seq_length
+        self.model_type = model_type
+        self.random_split = random_split
+        self.use_snippets = use_snippets
+
+        self.queries = []
+        self.snippets = []
+        self.positives = []
+        self.negatives = []
+        self.examples = []
+        with open(data_path, encoding="utf-8") as f:
+            for query_id, line in enumerate(f):
+                d = json.loads(line.rstrip("\n"))
+                self.queries.append(d["query"])
+                self.snippets.append(" || ".join(d["snippets"]))
+                self.positives.append(d["groundtruth"])
+                self.negatives.append(d["negatives"])
+                # one example per (positive, negative) pair of this query
+                for i in range(len(self.positives[-1])):
+                    for j in range(len(self.negatives[-1])):
+                        self.examples.append((query_id, i, j))
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, i):
+        example = self.examples[i]
+        query_id = example[0]
+        pos_id = example[1]
+        neg_id = example[2]
+
+        query = self.queries[query_id]
+        snippets = self.snippets[query_id]
+        positive = self.positives[query_id][pos_id]
+
+        r = random.random()
+        if r < self.random_split:
+            # with probability random_split, replace the annotated negative
+            # with a short random span sampled from one snippet
+            snippets_ = snippets.split(" || ")
+            snippet = random.choice(snippets_)
+            tokens = list(self.seg_tokenizer.tokenize(snippet))
+            for _ in range(100):
+                length = random.randint(1, 5)
+                index = random.randint(0, max(0, len(tokens) - length))
+                negative_tokens = tokens[index:index + length]
+                negative = " ".join([t.value for t in negative_tokens]).lower()
+                if re.match(r'^[a-z0-9\- ]+$', negative):
+                    break
+            else:
+                negative = self.negatives[query_id][neg_id]
+        else:
+            negative = self.negatives[query_id][neg_id]
+
+        if self.use_snippets:
+            positive_text = "{} || {} || {}".format(query, positive, snippets)
+        else:
+            positive_text = "{} || {}".format(query, positive)
+
+        pos_features = convert_example_to_features(
+            positive_text,
+            self.max_seq_length,
+            self.tokenizer,
+            cls_token_at_end=bool(self.model_type in ["xlnet"]),
+            cls_token=self.tokenizer.cls_token,
+            cls_token_segment_id=2 if self.model_type in ["xlnet"] else 0,
+            sep_token=self.tokenizer.sep_token,
+            sep_token_extra=False,
+            pad_on_left=bool(self.tokenizer.padding_side == "left"),
+            pad_token=self.tokenizer.pad_token_id,
+            pad_token_segment_id=self.tokenizer.pad_token_type_id)
+
+        if self.use_snippets:
+            negative_text = "{} || {} || {}".format(query, negative, snippets)
+        else:
+            negative_text = "{} || {}".format(query, negative)
+
+        neg_features = convert_example_to_features(
+            negative_text,
+            self.max_seq_length,
+            self.tokenizer,
+            cls_token_at_end=bool(self.model_type in ["xlnet"]),
+            cls_token=self.tokenizer.cls_token,
+            cls_token_segment_id=2 if self.model_type in ["xlnet"] else 0,
+            sep_token=self.tokenizer.sep_token,
+            sep_token_extra=False,
+            pad_on_left=bool(self.tokenizer.padding_side == "left"),
+            pad_token=self.tokenizer.pad_token_id,
+            pad_token_segment_id=self.tokenizer.pad_token_type_id)
+
+        features = pos_features + neg_features
+        return features
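Unlike the pointwise generator, `PairwiseRankingDatasetRandom` is a regular `torch.utils.data.Dataset`, so it composes with a standard `DataLoader` plus the `collate_batch` helper defined further down (the same pattern as the commented-out alternative in `run()`):

    from torch.utils.data import DataLoader
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    pair_dataset = PairwiseRankingDatasetRandom(data_path="test.jsonl",
                                                tokenizer=tokenizer,
                                                model_type="distilroberta-base",
                                                max_seq_length=512)
    dataloader = DataLoader(dataset=pair_dataset,
                            batch_size=8,
                            shuffle=True,
                            collate_fn=collate_batch)
    # four fields per batch: positive ids/mask followed by negative ids/mask
    pos_ids, pos_mask, neg_ids, neg_mask = next(iter(dataloader))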
+
+
+class RankingDataset(Dataset):
+    def __init__(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        model_type: str,
+        data_path: Optional[str] = None,
+        data: Optional[List] = None,
+        max_seq_length: Optional[int] = None,
+        use_snippets=True
+    ):
+        self.tokenizer = tokenizer
+        self.max_seq_length = max_seq_length
+        self.model_type = model_type
+        self.use_snippets = use_snippets
+        self.seg_tokenizer = Tokenizer()
+
+        if data_path is not None:
+            self.from_file = True
+            self.queries = []
+            self.facets = []
+            self.snippets = []
+            self.examples = []
+            with open(data_path, encoding="utf-8") as f:
+                for query_id, line in enumerate(f):
+                    d = json.loads(line.rstrip("\n"))
+                    self.queries.append(d["query"])
+                    self.snippets.append(" || ".join(d["snippets"]))
+                    # negatives = self.generate_negative_facets(self.snippets[-1])
+                    query_facets = d["groundtruth"] + d["negatives"]
+                    pos_labels = len(d["groundtruth"])
+                    self.facets.append(query_facets)
+                    for facet_id, facet in enumerate(query_facets):
+                        label = 1 if facet_id < pos_labels else 0
+                        self.examples.append((query_id, facet_id, len(self.snippets) - 1, label))
+        else:
+            self.from_file = False
+            self.examples = data
+
+    def generate_negative_facets(self, snippets):
+        negatives = []
+        if snippets == "":
+            return negatives
+        snippets_ = snippets.split(" || ")
+        for _ in range(10):
+            n_length = random.randint(1, 5)
+            for _ in range(100):
+                snippet = random.choice(snippets_)
+                tokens = list(self.seg_tokenizer.tokenize(snippet))
+                index = random.randint(0, max(0, len(tokens) - n_length))
+                negative_tokens = tokens[index:index + n_length]
+                negative = " ".join([t.value for t in negative_tokens]).lower()
+                if re.match(r'^[a-z0-9\- ]+$', negative):
+                    negatives.append(negative)
+                    break
+            else:
+                return negatives
+        return negatives
+
+    def __len__(self):
+        return len(self.examples)
+
+    # returns the ith evaluation example: (input ids, mask), plus qid and label
+    # when the dataset was loaded from file
+    def __getitem__(self, i):
+        example = self.examples[i]
+
+        if not self.from_file:
+            if self.use_snippets:
+                text = "{} || {} || {}".format(example[0], example[1], example[2])
+            else:
+                text = "{} || {}".format(example[0], example[1])
+        else:
+            if self.use_snippets:
+                text = "{} || {} || {}".format(self.queries[example[0]],
+                                               self.facets[example[0]][example[1]],
+                                               self.snippets[example[2]])
+            else:
+                text = "{} || {}".format(self.queries[example[0]],
+                                         self.facets[example[0]][example[1]])
+
+        features = convert_example_to_features(
+            text,
+            self.max_seq_length,
+            self.tokenizer,
+            cls_token_at_end=bool(self.model_type in ["xlnet"]),
+            cls_token=self.tokenizer.cls_token,
+            cls_token_segment_id=2 if self.model_type in ["xlnet"] else 0,
+            sep_token=self.tokenizer.sep_token,
+            sep_token_extra=False,
+            pad_on_left=bool(self.tokenizer.padding_side == "left"),
+            pad_token=self.tokenizer.pad_token_id,
+            pad_token_segment_id=self.tokenizer.pad_token_type_id)
+
+        if self.from_file:
+            return features + (example[0], example[3],)  # input ids, mask, qid, label
+        else:
+            return features  # input ids, mask
+
+
+def collate_batch(batch, all_features=False):
+    # called by the generators and DataLoaders above; gathers the per-example
+    # feature tuples into per-field lists and converts them to tensors
+    num_features = len(batch[0])
+    coll_batch = [[] for _ in range(num_features)]
+
+    for sample in batch:
+        for i, x in enumerate(sample):
+            coll_batch[i].append(x)
+
+    for i in range(num_features):
+        if all_features or isinstance(coll_batch[i][0], list):
+            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            coll_batch[i] = torch.tensor(coll_batch[i]).to(device)
+
+    return coll_batch
+
+
+def run():
+    logging.getLogger().setLevel(logging.INFO)
+    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
+    dataset = PointwiseRankingDataset(data_path="test.jsonl",
+                                      max_seq_length=512,
+                                      model_type="distilroberta-base",
+                                      tokenizer=tokenizer,
+                                      batch_size=4)
+    dataloader = dataset.batch_generator()
+    """
+    dataloader = DataLoader(dataset=dataset,
+                            batch_size=8,
+                            shuffle=True,
+                            collate_fn=collate_batch)
+    """
+
+
+# earlier negative-sampling strategy, kept for reference: draw negatives from
+# the annotated negatives of the same query, or from a random other query
+# if i % 2 == 0:
+#     r = random.random()
+#     if r >= self.random_split:
+#         if len(self.negatives[query_id]) == 0:
+#             not_found = True
+#         else:
+#             facet = self.negatives[query_id][self.neg_index[query_id]]
+#             self.neg_index[query_id] = (self.neg_index[query_id] + 1) % len(self.negatives[query_id])
+#     else:
+#         r = random.random()
+#         if r < 0.5:
+#             neg_qid = random.randint(0, len(self.queries) - 1)
+#             while len(self.negatives[neg_qid]) < 2:
+#                 neg_qid = random.randint(0, len(self.queries) - 1)
+#             neg_id = random.randint(0, len(self.negatives[neg_qid]) - 1)
+#             facet = self.negatives[neg_qid][neg_id]
+#         else:
+#             snippets_ = snippets.split(" || ")
+#             j = 0
+#             while j < 100:
+#                 j += 1
+#                 snippet = random.choice(snippets_)
+#                 tokens = list(self.seg_tokenizer.tokenize(snippet))
+#                 length = random.randint(1, 5)
+#                 index = random.randint(0, len(tokens) - 1)
+#                 negative_tokens = tokens[index:index + length]
+#                 negative = " ".join([t.value for t in negative_tokens]).lower()
+#                 if re.match('^[a-z0-9\- ]+$', negative):
+#                     facet = negative
+#                     break
+#             if j == 100:
+#                 if len(self.negatives[query_id]) == 0:
+#                     not_found = True
+#                 else:
+#                     facet = self.negatives[query_id][self.neg_index[query_id]]
+#                     self.neg_index[query_id] = (self.neg_index[query_id] + 1) % len(self.negatives[query_id])
\ No newline at end of file
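To make the collation concrete, a toy example with invented values (three features per example: ids, mask, label):

    import torch

    toy_batch = [
        ([101, 2009, 102], [1, 1, 1], 1.0),  # (input_ids, attention_mask, label)
        ([101, 2171, 102], [1, 1, 1], 0.0),
    ]
    input_ids, attention_mask, labels = collate_batch(toy_batch, all_features=True)
    print(input_ids.shape)  # torch.Size([2, 3])
    print(labels)           # tensor([1., 0.])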
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..1256e32
--- /dev/null
+++ b/app.py
@@ -0,0 +1,25 @@
+from flask import Flask, jsonify
+from sentence_transformers import SentenceTransformer, InputExample
+from huggingface_hub import utils, whoami, HfFolder, create_repo, HfApi
+from huggingface_hub.utils import validate_repo_id
+from faspect import Faspect
+from trial import trial
+
+app = Flask(__name__)
+
+
+@app.route('/')
+def hello():
+    # ans = trial()
+    # print(ans)
+    return "ok"  # placeholder response; the real pipeline runs in /predict
+
+
+@app.route('/predict', methods=['POST'])
+def predict():
+    ans = trial()
+    print(ans)
+    return jsonify(ans)
\ No newline at end of file
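Note that app.py defines the routes but never starts a server, while request.py below posts to http://localhost:8000. A minimal, hypothetical launcher consistent with that port (whether the author used `flask run`, gunicorn, or something else is not visible in this diff):

    # hypothetical addition to app.py; port chosen to match request.py
    if __name__ == '__main__':
        app.run(host='0.0.0.0', port=8000)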
diff --git a/faspect.py b/faspect.py
index 9bbbe94..5a03ec6 100644
--- a/faspect.py
+++ b/faspect.py
@@ -4,12 +4,61 @@
 from itertools import cycle, islice
 from flask import request, Flask
 from flask_cors import CORS
-
 from models.ranking import FacetDiversifier
 from models.extractive.unsupervised.unsupervised import UnsupervisedFacetExtractor
 from models.abstractive.seq2seq import SupervisedFacetExtractorSeq2seq
 from models.extractive.tagging.tagging import SupervisedFacetExtractorTagging
 from models.classification.facet_classification import FacetClassifier
+from Scoring.facet_ranker import FacetRanker
+from Scoring.utils_ranking_model import convert_example_to_features, collate_batch
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+import numpy as np
+from Clustering.subClustering import Clustering
+from Clarifying_questions.Main_trainer_load_Faspect import Clarifying_question
+
+
+def batch_generator(query, snippets, facet_list, model_type="distilroberta-base"):
+    # NOTE: reloads the tokenizer on every call; fine for occasional requests,
+    # but worth caching if this becomes a hot path
+    tokenizer = AutoTokenizer.from_pretrained(model_type)
+    batch = []
+    for facet in facet_list:
+        text = "{} || {} || {}".format(query, facet, snippets)
+        features = convert_example_to_features(
+            text,
+            400,
+            tokenizer,
+            cls_token_at_end=bool(model_type in ["xlnet"]),
+            cls_token=tokenizer.cls_token,
+            cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
+            sep_token=tokenizer.sep_token,
+            sep_token_extra=False,
+            pad_on_left=bool(tokenizer.padding_side == "left"),
+            pad_token=tokenizer.pad_token_id,
+            pad_token_segment_id=tokenizer.pad_token_type_id)
+        batch.append(features)
+    return collate_batch(batch, all_features=True)
+
+
+def threshold(threshold_model, query, facet_list, snippets):
+    # score every candidate facet with the reranker and keep the confident ones
+    input_ids, attention_mask = batch_generator(query, snippets, facet_list)
+    scores_ = threshold_model.forward(input_ids, attention_mask).data.cpu().numpy()
+    scores_ = 1 / (1 + np.exp(-scores_))  # sigmoid: logits -> probabilities
+    relevant_facets = []
+    for i, facet in enumerate(facet_list):
+        if scores_[i] >= 0.9:
+            relevant_facets.append(facet)
+    return relevant_facets
+
+
+def generate_prediction(seed_line, tokenizer, model_):
+    # put the model in eval mode; assumes a CUDA device, matching use_gpu=True below
+    model_.to("cuda")
+    model_.eval()
+    prompt_line_tokens = tokenizer(seed_line, max_length=192, return_tensors="pt",
+                                   padding=True, truncation=True)
+    line = model_.generate_text(prompt_line_tokens, eval_beams=8)
+    return line
 
 
 def roundrobin(*iterables):
@@ -50,6 +99,12 @@ def __init__(self):
         self.unsupervised_extractor = UnsupervisedFacetExtractor()
         self.ranker = FacetDiversifier(model_name="algoprog/mimics-query-facet-encoder-mpnet-base")
+        self.threshold_model = FacetRanker(model_path="umass/roberta-base-mimics-facet-reranker",
+                                           model_type="distilroberta-base",
+                                           use_gpu=True,
+                                           parallel=True,
+                                           max_seq_length=512)
+        self.clustering_model = Clustering('umass/mpnet-base-mimics-query-facet-encoder')
+        self.clarifying_question_model = Clarifying_question("umass/bart-base-mimics-question-generation")
         logging.info("Finished loading.")
@@ -57,7 +112,7 @@ def __init__(self):
         CORS(self.app)
 
     def extract_facets(self, query, docs,
-                       aggregation="round-robin",
+                       aggregation="threshold",
                        mmr_lambda=0.5,
                        classification_threshold=0.05,
                        classification_topk=0):
@@ -91,12 +146,27 @@ def extract_facets(self, query, docs,
             facets = self.ranker.maximal_marginal_relevance(query, facets, lamda=mmr_lambda)
         elif aggregation == "rank":
             facets = self.ranker.maximal_marginal_relevance(query, facets, lamda=1.0)
+        elif aggregation == "threshold":
+            # keep only facets the reranker scores as relevant; docs are joined
+            # into a single snippet string, matching the dataset serialization
+            facets = threshold(self.threshold_model, query, facets, " || ".join(docs))
 
         facets = [f.lower() for f in facets]
         facets = remove_duplicates(facets)
         return facets
 
+    def cluster_facets(self, query, snippets, facets):
+        clusters = self.clustering_model.cluster_facets_query(query, snippets, facets)
+        return clusters
+
+    def generate_clarifying_questions(self, query, snippets, facets):
+        facets = ", ".join(facets)
+        snippets = " | ".join(snippets)  # joined but currently unused by the prompt
+        seed_line = query + " | " + facets
+        question_pred = self.clarifying_question_model.generate_prediction(seed_line=seed_line)
+        return question_pred
+
     def build_endpoints(self):
         @self.app.route("/extract", methods=["GET", "POST"])
         def search_endpoint():
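The 0.9 cutoff in threshold() is applied to sigmoid-squashed reranker logits, so a facet survives only if its raw score is at least ln(0.9/0.1) ≈ 2.197. A quick numeric check with made-up logits:

    import numpy as np

    logits = np.array([3.1, 2.0, -0.5])   # raw FacetRanker scores for three facets
    probs = 1 / (1 + np.exp(-logits))     # ≈ [0.957, 0.881, 0.378]
    keep = probs >= 0.9                   # [True, False, False]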
diff --git a/request.py b/request.py
new file mode 100644
index 0000000..95f8890
--- /dev/null
+++ b/request.py
@@ -0,0 +1,5 @@
+import requests
+
+resp = requests.post("http://localhost:8000/predict")
+print(resp.text)
+print(resp.json())
\ No newline at end of file
diff --git a/run_Faspect.py b/run_Faspect.py
new file mode 100644
index 0000000..8152ea8
--- /dev/null
+++ b/run_Faspect.py
@@ -0,0 +1,65 @@
+from sentence_transformers import SentenceTransformer, InputExample
+from huggingface_hub import utils, whoami, HfFolder, create_repo, HfApi
+from huggingface_hub.utils import validate_repo_id
+from faspect import Faspect
+
+# one-off snippets for pushing trained weights to the Hub, kept for reference:
+# api = HfApi()
+# api.upload_folder(
+#     repo_id="umass/roberta-base-mimics-facet-reranker",
+#     folder_path='weights_ranker_5/model.state_dict',
+#     repo_type="model",
+# )
+# print("done")
+
+# repo_name = "clustering-Model-3"
+# model = SentenceTransformer('models/Clustering/weights_microsoft/mpnet-base')
+# url = model.save_to_hub(repo_name=repo_name, private=None)
+# print(url)
+
+# endpoint-style wrapper (see app.py), kept for reference:
+# def trial():
+#     query = "cars"
+#     documents = [...]  # the same four snippets as the live example below
+#     facets = facet_extractor.extract_facets(query,
+#                                             documents,
+#                                             aggregation="threshold",
+#                                             mmr_lambda=0.5,
+#                                             classification_threshold=0.05,
+#                                             classification_topk=0)
+#     clusters = facet_extractor.cluster_facets(query, documents, facets)
+#     thisDict = {}
+#     for each in clusters:
+#         thisDict[" ".join(each)] = facet_extractor.generate_clarifying_questions(query, documents, each)
+#     return thisDict
+
+facet_extractor = Faspect()
+
+query = "cars"
+documents = ["Shop new & used cars, research & compare models, find local dealers/sellers,calculate payments, value your car, sell/trade in your car & more at Cars.com.",
+             "Search over 48789 used Cars in Amherst, MA. TrueCar has over 861570 listings nationwide, updated daily. Come find a great deal on used Cars in Amherst today!",
+             "Cars is a 2006 American computer-animated sports comedy film produced by Pixar Animation Studios and released by Walt Disney Pictures.",
+             "Search for used cars at carmax.com. Use our car search or research makes and models with customer reviews, expert reviews, and more."]
+
+facets = facet_extractor.extract_facets(query,
+                                        documents,
+                                        aggregation="threshold",  # alternatives: mmr, round-robin, rank
+                                        mmr_lambda=0.5,
+                                        classification_threshold=0.05,
+                                        classification_topk=0)
+clusters = facet_extractor.cluster_facets(query, documents, facets)
+
+print("facets:", facets)
+print(clusters)
+
+# one clarifying question per facet cluster
+for each in clusters:
+    print(facet_extractor.generate_clarifying_questions(query, documents, each))
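app.py imports `trial` from a `trial` module that is not part of this diff. A hypothetical implementation consistent with the commented-out trial() above, returning a JSON-serializable mapping from each facet cluster to its generated clarifying question:

    # trial.py -- hypothetical; the real module is not shown in this diff
    from faspect import Faspect

    facet_extractor = Faspect()

    def trial(query="cars", documents=None):
        if documents is None:
            documents = ["Shop new & used cars at Cars.com.",
                         "Cars is a 2006 Pixar film."]
        facets = facet_extractor.extract_facets(query, documents, aggregation="threshold")
        clusters = facet_extractor.cluster_facets(query, documents, facets)
        # one clarifying question per facet cluster
        return {" ".join(cluster): facet_extractor.generate_clarifying_questions(query, documents, cluster)
                for cluster in clusters}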