Would it really be possible to have a single model that could do custom NER and QA on the same data? Can T5 save the day? | by Martin Keywood | Dec, 2020

[ad_1]


Martin Keywood
Photo by Martin Boose from FreeImages
Selegiline B-Chemical
induced O
postural B-Disease
hypotension I-Disease
in O
Parkinson B-Disease
' I-Disease
s I-Disease
disease I-Disease
: O
a O
longitudinal O
study O
on O
the O
effects O
of O
drug O
withdrawal O
. O
import pandas as pd
import json
from sklearn.model_selection import train_test_split
def _iter_ner_examples(lines):
    """Yield one ``(input_text, target_text)`` pair per sentence in *lines*.

    Each non-blank line is ``token<TAB>label``; a blank line ends a sentence.
    Consecutive tokens whose label is not ``"O"`` form one entity span, which
    is written to the target followed by ``";"`` when the label contains
    ``"Disease"`` and ``"*"`` otherwise.
    """
    words = []       # tokens of the sentence being read
    spans = []       # finished "entity<marker>" strings for this sentence
    span_words = []  # tokens of the entity that is currently open
    marker = ""

    # The trailing "" acts as a final sentence boundary, so a file that does
    # not end with a blank line still yields its last sentence.
    for raw in (*lines, ""):
        line = raw.strip()
        if line:
            word, label = line.split('\t')
            words.append(word)
            # An "O" tag or a new "B-" tag closes whatever entity is open, so
            # two adjacent entities are not merged into a single span.
            if span_words and (label == "O" or label.startswith("B-")):
                spans.append(" ".join(span_words) + marker)
                span_words = []
            if label != "O":
                span_words.append(word)
                marker = ";" if "Disease" in label else "*"
            continue

        # Sentence boundary: close an entity that ran up to the last token so
        # it cannot leak into the next sentence's target.
        if span_words:
            spans.append(" ".join(span_words) + marker)
            span_words = []
        if words:
            # Because each token is a separate word then just tidy up apostrophes
            yield (
                " ".join(words).replace(" ' s ", "'s "),
                " ".join(spans).replace(" ' s ", "'s "),
            )
        words = []
        spans = []


def parseNER(category):
    """Convert ``<category>.tsv`` into a DataFrame of T5 "ner" examples.

    Args:
        category: Path or name without extension; ``category + '.tsv'`` is
            opened and read as tab-separated ``token``/``label`` lines with
            blank lines between sentences.

    Returns:
        A ``pandas.DataFrame`` with the columns ``prefix`` (always ``"ner"``),
        ``input_text`` (the sentence tokens joined by spaces) and
        ``target_text`` (the entity spans, each followed by ``";"`` for a
        disease label or ``"*"`` for any other non-``"O"`` label).

    Raises:
        ValueError: If a non-blank line does not hold exactly two
            tab-separated fields.
    """
    with open(category + '.tsv', 'r', encoding='utf-8') as f:
        data = f.readlines()

    input_text_array = []
    target_text_array = []
    for input_text, target_text in _iter_ner_examples(data):
        input_text_array.append(input_text)
        target_text_array.append(target_text)

    tmp_df = pd.DataFrame({
        'prefix': ["ner"] * len(input_text_array),
        'input_text': input_text_array,
        'target_text': target_text_array
    })
    return tmp_df
# Parse train.tsv and eval.tsv (in that order) into NER example frames.
ner_train_df, ner_eval_df = (parseNER(split) for split in ('train', 'eval'))

# Keep a random 10000 of the ~85000 SQuAD questions, then hold out 20% of
# that sample for evaluation.
qa_train_df, qa_eval_df = train_test_split(
    pd.read_csv('squad.csv').sample(10000),
    test_size=0.2,
)

# Stack the NER and QA rows of each split and cast every column to str.
train_df, eval_df = (
    pd.concat(frames).astype(str)
    for frames in ([ner_train_df, qa_train_df], [ner_eval_df, qa_eval_df])
)
from simpletransformers.t5 import T5Model

# Load the prepared training / evaluation sets. The files are .tsv, so the
# separator must be a real tab character; a bare "t" would split every row on
# the letter t instead.
train_df = pd.read_csv("train_clean_ner.tsv", sep="\t").astype(str)
eval_df = pd.read_csv("eval_clean_ner.tsv", sep="\t").astype(str)

# Training configuration handed to simpletransformers' T5Model.
model_args = {
    "max_seq_length": 196,
    "train_batch_size": 8,
    "eval_batch_size": 8,
    "num_train_epochs": 1,
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 15000,
    "evaluate_during_training_verbose": True,

    "learning_rate": 1e-4,

    "evaluate_generated_text": True,

    "use_multiprocessing": False,
    "fp16": True,

    # NOTE(review): presumably these three switch off intermediate
    # checkpoints - confirm against the simpletransformers docs.
    "save_steps": -1,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": False,

    "reprocess_input_data": True,
    "overwrite_output_dir": True,

    "wandb_project": None,
}

# Start from the pretrained "t5-base" checkpoint and train on the combined
# frame, evaluating on eval_df.
model = T5Model("t5", "t5-base", args=model_args)
model.train_model(train_df, eval_data=eval_df)
def predict(questions):
    """Run the module-level ``model`` on *questions* and print the results.

    Args:
        questions: List of prompt strings. The text before the first ``":"``
            selects the task: ``"ner"`` prompts contribute to the entity
            lists, ``"question"`` prompts have their question and first
            answer printed. Prompts with any other prefix are ignored.

    Returns:
        None. Answers are printed as they are processed, followed by the
        de-duplicated disease and medication lists collected from all
        ``"ner"`` prompts.
    """
    # Silence the model only for the duration of the call, and restore the
    # flag even if prediction raises.
    model.args.silent = True
    try:
        # Use the argument, not the global ``to_predict`` list.
        predictions = model.predict(questions)
    finally:
        model.args.silent = False

    all_diags = []
    all_meds = []
    for prompt, pred in zip(questions, predictions):
        task, _, remainder = prompt.partition(":")
        # A prediction may be one generated string or a list of candidate
        # strings; treat both as a list so a string is never iterated
        # character by character.
        candidates = [pred] if isinstance(pred, str) else pred

        if task == "ner":
            for candidate in candidates:
                pending = []  # tokens of the entity being assembled
                # split() drops empty / whitespace-only tokens, which would
                # otherwise break the end-of-entity check below.
                for token in candidate.split():
                    if token == "nan":
                        continue
                    pending.append(token)
                    marker = token[-1]
                    if marker not in (";", "*"):
                        continue
                    # ";" closes a disease, "*" closes a medication.
                    entity = " ".join(pending)[:-1].strip()
                    pending = []
                    found = all_diags if marker == ";" else all_meds
                    if entity and entity not in found:
                        found.append(entity)
        elif task == "question":
            # The question is the text between "question:" and "context:".
            question = remainder.split(":")[0]
            if question.endswith("context"):
                question = question[:-len("context")]
            print(question.strip())
            print(candidates[0])
            print()
    print('NER: Diseases:',all_diags)
    print('NER: Medication:',all_meds)
# Smoke-test prompts: one NER request followed by three questions that share
# the same context.
to_predict = [
    "ner: Heart disease, a new house, a dose of penicillin, and bowel cancer. The diagnosis of COPD, a flashy new car and a skin rash.",
    "question: what is name? context: hello, I live in Nottingham and my name is Martin. I am 48 years old",
    "question: where do I live? context: hello, I live in Nottingham and my name is Martin. I am 48 years old",
    "question: how old am I? context: hello, I live in Nottingham and my name is Martin. I am 48 years old",
]
# The call was fused onto the closing bracket; it is a separate statement.
predict(to_predict)
# Operative-note excerpt reused as both the NER input and the QA context.
ehr_text = """
PROCEDURE PERFORMED: Excisional breast biopsy with needle localization.
ANESTHESIA: General.
PROCEDURE: After informed consent was obtained, the patient was brought to the radiology suite where needle localization was performed with mammographic guidance. I reviewed the localizing films with the radiologist, and the patient was then brought to the operative suite and placed supine on the operating table. General endotracheal anesthesia was induced without incident. The patient was prepped and draped in the usual sterile manner.
"""
# One NER prompt plus two questions over the same note.
to_predict = [
    f"ner: {ehr_text}",
    f"question: what procedure was performed? context: {ehr_text}",
    f"question: who reviewed the localizing film? context:{ehr_text}",
]
predict(to_predict)
# Question-only prompts over the current ehr_text, including rephrasings of
# the "who reviewed" question.
to_predict = [
    f"question: what procedure was performed? context: {ehr_text}",
    f"question: who reviewed the localizing film? context:{ehr_text}",
    f"question: who reviewed the film? context:{ehr_text}",
    f"question: what anesthetic was given? context:{ehr_text}",
]
predict(to_predict)
from simpletransformers.t5 import T5Model

# Inference configuration: sampling is enabled and three sequences are
# requested per prompt.
model_args = {
    "overwrite_output_dir": True,
    "max_seq_length": 196,
    "eval_batch_size": 32,
    "num_train_epochs": 1,
    "use_multiprocessing": False,
    "num_beams": None,
    "do_sample": True,
    "max_length": 50,
    "top_k": 50,
    "top_p": 0.95,
    "num_return_sequences": 3,
    "evaluate_generated_text": True,
}
# Load the newly trained model with higher number of SQuAD samples and a lower Learning Rate
model = T5Model("t5", "ner_t5_50k_1_epoch_1e-4", args=model_args)
# Same operative-note excerpt, now run through the reloaded model.
ehr_text = """
PROCEDURE PERFORMED: Excisional breast biopsy with needle localization.
ANESTHESIA: General.
PROCEDURE: After informed consent was obtained, the patient was brought to the radiology suite where needle localization was performed with mammographic guidance. I reviewed the localizing films with the radiologist, and the patient was then brought to the operative suite and placed supine on the operating table. General endotracheal anesthesia was induced without incident. The patient was prepped and draped in the usual sterile manner.
"""
# One NER prompt followed by four questions over the note.
to_predict = [
    f"ner: {ehr_text}",
    f"question: what procedure was performed? context: {ehr_text}",
    f"question: who reviewed the localizing film? context:{ehr_text}",
    f"question: who reviewed the film? context:{ehr_text}",
    f"question: what anesthetic was given? context:{ehr_text}",
]
predict(to_predict)
# Echocardiogram report used as NER input and QA context. The text starts
# with a single space and a newline and has no trailing newline.
ehr_text = "\n".join([
    " ",
    "2-D M-MODE:",
    "1. Left atrial enlargement with left atrial diameter of 4.7 cm.",
    "2. Normal size right and left ventricle.",
    "3. Normal LV systolic function with left ventricular ejection fraction of 51%.",
    "4. Normal LV diastolic function.",
    "5. No pericardial effusion.",
    "6. Normal morphology of aortic valve, mitral valve, tricuspid valve, and pulmonary valve.",
    "7. PA systolic pressure is 36 mmHg.",
    "DOPPLER:",
    "1. Mild mitral and tricuspid regurgitation.",
    "2. Trace aortic and pulmonary regurgitation.",
])
# One NER prompt and two measurement questions over the report.
to_predict = [
    f"ner: {ehr_text}",
    f"question: what was the Systolic Blood Pressure? context: {ehr_text}",
    f"question: what was the diameter of the left atrial? context: {ehr_text}",
]
predict(to_predict)
# Two rephrasings of the blood-pressure question over the current ehr_text.
# The strings use plain ASCII double quotes; typographic quotes are not valid
# string delimiters in Python.
to_predict = [
    "question: what was the Blood Pressure? context: "+ehr_text,
    "question: what was the BP? context: "+ehr_text
]
predict(to_predict)
# Two questions asked over the same short free-text context.
to_predict = [
    question + " context: Heart disease, a new house and bowel cancer. The diagnosis of COPD, a flashy new car and a dose of penicillin to fix a skin rash."
    for question in (
        "question: what is the medication used for?",
        "question: how am I addressing the irritation?",
    )
]
predict(to_predict)

Read More …

[ad_2]


Write a comment