From 2cb78bd990b69760d0183d429dfc4b066866f9d2 Mon Sep 17 00:00:00 2001 From: David Tresner-Kirsch Date: Mon, 27 Aug 2018 12:47:02 -0400 Subject: [PATCH 1/8] add status rationale detection --- .../lib/disease_status_extractor.rb | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/info-extraction/lib/disease_status_extractor.rb b/info-extraction/lib/disease_status_extractor.rb index 0290848..011a69d 100644 --- a/info-extraction/lib/disease_status_extractor.rb +++ b/info-extraction/lib/disease_status_extractor.rb @@ -22,7 +22,7 @@ def analyze_text (text) :start => match.begin(0), :end => match.end(0) ) end - + annotated.signal.to_enum(:scan, /(not )?(stable|progressing|getting worse|worsening|getting better|improving)/).map{ Regexp.last_match }.each do |match| annotated.tags << Standoff::Tag.new(:content => match[0], :name => "status_value", @@ -30,11 +30,25 @@ def analyze_text (text) :end => match.end(0) ) end + annotated.signal.to_enum(:scan, /(CT scan|imaging|lab)( results)?/).map{ Regexp.last_match }.each do |match| + annotated.tags << Standoff::Tag.new(:content => match[0], + :name => "status_rationale", + :start => match.begin(0), + :end => match.end(0) ) + end + + + disease_status_assertions = [] + annotated.tags.select{|tag| tag.name == "disease_status_key"}.each do |key_tag| next_tag = annotated.next_tag key_tag - if next_tag.name == "status_value" - disease_status_assertions << next_tag.content + if next_tag && (next_tag.name == "status_value") + disease_status_assertions << { + :disease => nil, + :rationale => annotated.tags.select{|tag| tag.name == "status_rationale"}.map{|tag| tag.content}.uniq, + :status => next_tag.content + } end end From 9ea48d9aa08bbf92cc995886cd36fc947d337bb6 Mon Sep 17 00:00:00 2001 From: David Tresner-Kirsch Date: Mon, 27 Aug 2018 13:02:26 -0400 Subject: [PATCH 2/8] rely on whitelist of DB concept categories; no longer need blacklist of specific concept names --- info-extraction/lib/watson4fluxnotes.rb | 26 +++---------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/info-extraction/lib/watson4fluxnotes.rb b/info-extraction/lib/watson4fluxnotes.rb index e0d6b53..b38af3b 100644 --- a/info-extraction/lib/watson4fluxnotes.rb +++ b/info-extraction/lib/watson4fluxnotes.rb @@ -32,7 +32,6 @@ def filter_to_desired_categories (results_hash) # we only care about entities of type "HealthCondition" (though that may be expanded later) results_hash["entities"].select!{|e| e["type"] == "HealthCondition"} # and concepts of type "Disease" (though that may be expanded later) - #TODO: when we have real DBpedia type checking available, has_relevant_dbpedia_concept_type should be deleted and replaced results_hash["concepts"].select!{|c| has_relevant_dbpedia_concept_type c} results_hash end @@ -45,35 +44,16 @@ def has_relevant_dbpedia_concept_type ( concept ) "http://umbel.org/umbel/rc/AilmentCondition" ] - types = DBPedia.loadDBPediaDataType(concept['dbpedia_resource']) if types == nil return false end - # require 'byebug' - # byebug - - if (types.map{|t| t['value']} & types_we_care_about).length == 0 - return false - end - # this is a manually curated, example-specific list based on checking DBpedia pages. will be replaced with automated DBpedia queries. - concepts_known_not_to_be_disease = [ - "Chemotherapy", - "Pharmacology", - "2006 albums", - "2008 singles", - "HIV", - "Pain", # probably we want to catch this one. following the rules for now though. - "Pharmaceutical drug", - "Prescription drug" - ] - - if concepts_known_not_to_be_disease.include? concept["text"] - return false + if (types.map{|t| t['value']} & types_we_care_about).length > 0 + return true else - return true + return false end end From 8e1babc4f07d1951ccfe3892cb1f35fdc7ca4b56 Mon Sep 17 00:00:00 2001 From: David Tresner-Kirsch Date: Mon, 27 Aug 2018 13:08:00 -0400 Subject: [PATCH 3/8] fixing two things: 1) the previous version was only returning the first concept from an array of concepts detected in each chunk of text, rather than possible multiple findings per chunk; 2) disease status extractor now returns a key-value hash for each detected concept, so we should interpolate it as a json literal rather than a string --- info-extraction/server.rb | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/info-extraction/server.rb b/info-extraction/server.rb index f838fa1..f0a5453 100644 --- a/info-extraction/server.rb +++ b/info-extraction/server.rb @@ -33,8 +33,10 @@ end content_type :json flux_notes_messages = [] - diseaseResults.each do |res| - flux_notes_messages << "flux_command('insert-structured-phrase', {phrase:'disease status', fields: [{name:'status', value: '#{res[0]}'}]})" + diseaseResults.each do |res| + res.each do |concept| + flux_notes_messages << "flux_command('insert-structured-phrase', {phrase:'disease status', fields: [{name:'status', value: #{concept.to_json}}]})" + end end toxicityResults.each do |tox| tox['concepts'].each do |concept| @@ -46,4 +48,4 @@ toxicity: toxicityResults, fluxCommands: flux_notes_messages.uniq }.to_json -end \ No newline at end of file +end From a6670a9ece2d6a6343b9a612de2a385bc512f7a3 Mon Sep 17 00:00:00 2001 From: David Tresner-Kirsch Date: Mon, 27 Aug 2018 13:18:46 -0400 Subject: [PATCH 4/8] expand rationale keywords --- info-extraction/lib/disease_status_extractor.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/info-extraction/lib/disease_status_extractor.rb b/info-extraction/lib/disease_status_extractor.rb index 011a69d..3f5a12e 100644 --- a/info-extraction/lib/disease_status_extractor.rb +++ b/info-extraction/lib/disease_status_extractor.rb @@ -30,7 +30,7 @@ def analyze_text (text) :end => match.end(0) ) end - annotated.signal.to_enum(:scan, /(CT scan|imaging|lab)( results)?/).map{ Regexp.last_match }.each do |match| + annotated.signal.to_enum(:scan, /((CT scan|imaging|lab|pathology)( results)?)|symptoms?|exam|markers/).map{ Regexp.last_match }.each do |match| annotated.tags << Standoff::Tag.new(:content => match[0], :name => "status_rationale", :start => match.begin(0), From 6ed7437e25f9c58c6be6ae7799f9e957c40a403f Mon Sep 17 00:00:00 2001 From: David Tresner-Kirsch Date: Tue, 4 Sep 2018 17:00:52 -0400 Subject: [PATCH 5/8] detect more rationale expressions; normalize both rationale assertions and status assertions; new data schema for output --- info-extraction/lib/chunker.rb | 10 +-- .../lib/disease_status_extractor.rb | 80 +++++++++++++++++-- 2 files changed, 77 insertions(+), 13 deletions(-) diff --git a/info-extraction/lib/chunker.rb b/info-extraction/lib/chunker.rb index c7e0419..6e06fe7 100644 --- a/info-extraction/lib/chunker.rb +++ b/info-extraction/lib/chunker.rb @@ -19,14 +19,8 @@ def run(document) /side effects?/i ] when :disease_status - [ - /status/i, - /progressing/i, - /stable/i, - /getting worse/i, - /worsening/i, - /improving/i, - /getting better/i + [ # TODO: this is dopey. duplication of search expressions between here and the status_value patterns in DiseaseStatusExtractor should be abstracted and merged. + /((not? )|(complete ))?(stable|progressing|responding|response( to treatment)?|resection|inevaluable|changed?|getting worse|worsening|getting better|improving)/i ] else raise "Unknown chunking target: #{target}" diff --git a/info-extraction/lib/disease_status_extractor.rb b/info-extraction/lib/disease_status_extractor.rb index 3f5a12e..b7f55d0 100644 --- a/info-extraction/lib/disease_status_extractor.rb +++ b/info-extraction/lib/disease_status_extractor.rb @@ -11,31 +11,97 @@ =end +=begin +Example output: +[ + { + "disease": null, + "status": { + "mention text": "not changing", + "normalized": "stable" + }, + "rationale": [ + { + "mention text": "CT scans", + "normalized": "Imaging" + } + ] + }, + { + "disease": null, + "status": { + "mention text": "progressing", + "normalized": "progressing" + }, + "rationale": [ + { + "mention text": "CT scans", + "normalized": "Imaging" + }, + { + "mention text": "physical exam", + "normalized": "Physical Exam" + } + ] + } +] +=end + class DiseaseStatusExtractor def analyze_text (text) annotated = Standoff::AnnotatedString.new( :signal => text, :tags => []) # this would be just key.match document, but we want MatchData for possible multiple keyword matches, not just one - annotated.signal.to_enum(:scan, /status|disease|cancer/).map{ Regexp.last_match }.each do |match| + annotated.signal.to_enum(:scan, /status|disease|cancer/i).map{ Regexp.last_match }.each do |match| annotated.tags << Standoff::Tag.new(:content => match[0], :name => "disease_status_key", :start => match.begin(0), :end => match.end(0) ) end - annotated.signal.to_enum(:scan, /(not )?(stable|progressing|getting worse|worsening|getting better|improving)/).map{ Regexp.last_match }.each do |match| + # TODO: this is dopey. duplication of search expressions between here and the disease_status patterns in Chunker should be abstracted and merged. + annotated.signal.to_enum(:scan, /((not? )|(complete ))?(stable|progressing|responding|response( to treatment)?|resection|inevaluable|changed?|getting worse|worsening|getting better|improving)/i).map{ Regexp.last_match }.each do |match| + mention_text = match[0] + mapped_for_normalization = case mention_text # be very careful with this. it evaluates greedily, and as such the order of expressions here matters a lot. + when /^not? /i + "stable" + when /getting worse|worsening|progress/i + "progressing" + when /complete resection/i, /complete response/i + $& + when /getting better|improving|response to treatment/i + "responding" + when /inevaluable/i, /stable/i, /progressing/i + $& + else + nil + end + normalized = mapped_for_normalization.split(/ |\_/).map(&:capitalize).join(" ") #cap each word annotated.tags << Standoff::Tag.new(:content => match[0], :name => "status_value", + :attributes => {:normalized => normalized}, :start => match.begin(0), :end => match.end(0) ) end - annotated.signal.to_enum(:scan, /((CT scan|imaging|lab|pathology)( results)?)|symptoms?|exam|markers/).map{ Regexp.last_match }.each do |match| + annotated.signal.to_enum(:scan, /((ca?t scan|mri|x-ray|x ray|imaging)( results)?)/i).map{ Regexp.last_match }.each do |match| annotated.tags << Standoff::Tag.new(:content => match[0], :name => "status_rationale", + :attributes => {:normalized => "Imaging"}, + :start => match.begin(0), + :end => match.end(0) ) + end + annotated.signal.to_enum(:scan, /(pathology|symptoms|(physical )?exam|markers)/i).map{ Regexp.last_match }.each do |match| + mention_text = m[0] + mention_text = "physical exam" if mention_text == "exam" + normalized = mention_text.split(/ |\_/).map(&:capitalize).join(" ") #cap each word + annotated.tags << Standoff::Tag.new(:content => mention_text, + :name => "status_rationale", + :attributes => {:normalized => normalized}, :start => match.begin(0), :end => match.end(0) ) end + @@ -46,8 +112,12 @@ def analyze_text (text) if next_tag && (next_tag.name == "status_value") disease_status_assertions << { :disease => nil, - :rationale => annotated.tags.select{|tag| tag.name == "status_rationale"}.map{|tag| tag.content}.uniq, - :status => next_tag.content + :rationale => annotated.tags.select{|tag| tag.name == "status_rationale"}.map do|tag| + {:mention_text => tag.content, :normalized => tag.attributes[:normalized]} + end.uniq, + :status => { :mention_text => next_tag.content, + :normalized => next_tag.attributes[:normalized]} + } end end From 889aece2981b0be55c1b7ca6ab9d559c83978c5d Mon Sep 17 00:00:00 2001 From: David Tresner-Kirsch Date: Tue, 11 Sep 2018 10:17:52 -0400 Subject: [PATCH 6/8] bug fix --- info-extraction/lib/disease_status_extractor.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/info-extraction/lib/disease_status_extractor.rb b/info-extraction/lib/disease_status_extractor.rb index b7f55d0..7a4969b 100644 --- a/info-extraction/lib/disease_status_extractor.rb +++ b/info-extraction/lib/disease_status_extractor.rb @@ -76,7 +76,7 @@ def analyze_text (text) else nil end - normalized = mapped_for_normalization.split(/ |\_/).map(&:capitalize).join(" ") #cap each word + normalized = mapped_for_normalization ? mapped_for_normalization.split(/ |\_/).map(&:capitalize).join(" ") : nil#cap each word annotated.tags << Standoff::Tag.new(:content => match[0], :name => "status_value", :attributes => {:normalized => normalized}, From 57c5d492f2ce8aa9e9b88f6c1d4a2c044f772170 Mon Sep 17 00:00:00 2001 From: David Tresner-Kirsch Date: Thu, 13 Sep 2018 13:01:15 -0400 Subject: [PATCH 7/8] update documentation --- info-extraction/lib/disease_status_extractor.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/info-extraction/lib/disease_status_extractor.rb b/info-extraction/lib/disease_status_extractor.rb index 7a4969b..260214d 100644 --- a/info-extraction/lib/disease_status_extractor.rb +++ b/info-extraction/lib/disease_status_extractor.rb @@ -3,7 +3,7 @@ =begin This is a really naive information extractor to detect assertions about disease statuses. -All it does is tag mentions of a list of disease-status-related keywords (disease, cancer, status), tag mentions of disease-status-related value words (e.g. stable, improving). For each keyword, it checks the next tag to the right, and if that tag is a status value, it returns the value. +All it does is tag mentions of a list of disease-status-related keywords (disease, cancer, status), tag mentions of disease-status-related value words (e.g. stable, improving), and tag mentions of rationale concepts (e.g. imaging, scans, symptoms). For each keyword, it checks the next tag to the right, and if that tag is a status value, it returns the value. It also returns and normalizes all rationale mentions found within the chunk along with that status assertion. So, it will assert "stable" for the phrase "your cancer is stable." From 7390d5c2061e8d93ec3a58f41aeb5c17cdc23cee Mon Sep 17 00:00:00 2001 From: Andrew Schreiber Date: Mon, 17 Sep 2018 13:52:02 -0400 Subject: [PATCH 8/8] Clean up FN integration --- info-extraction/Gemfile | 2 +- info-extraction/Gemfile.lock | 4 +++- info-extraction/config.ru | 3 +++ info-extraction/lib/fluxnotes_integration.rb | 11 +++++++++++ info-extraction/server.rb | 8 +++++--- 5 files changed, 23 insertions(+), 5 deletions(-) create mode 100644 info-extraction/config.ru create mode 100644 info-extraction/lib/fluxnotes_integration.rb diff --git a/info-extraction/Gemfile b/info-extraction/Gemfile index 3bc00e2..4f0a537 100644 --- a/info-extraction/Gemfile +++ b/info-extraction/Gemfile @@ -5,4 +5,4 @@ gem 'json' gem 'trollop' gem 'standoff' -# gem 'byebug' \ No newline at end of file +gem 'byebug' \ No newline at end of file diff --git a/info-extraction/Gemfile.lock b/info-extraction/Gemfile.lock index d8d7629..2e599ba 100644 --- a/info-extraction/Gemfile.lock +++ b/info-extraction/Gemfile.lock @@ -1,6 +1,7 @@ GEM remote: https://rubygems.org/ specs: + byebug (10.0.2) json (2.1.0) mustermann (1.0.2) rack (2.0.5) @@ -19,10 +20,11 @@ PLATFORMS ruby DEPENDENCIES + byebug json sinatra standoff trollop BUNDLED WITH - 1.16.1 + 1.16.4 diff --git a/info-extraction/config.ru b/info-extraction/config.ru new file mode 100644 index 0000000..1fac37d --- /dev/null +++ b/info-extraction/config.ru @@ -0,0 +1,3 @@ +$:.unshift(File.dirname(__FILE__)) +require 'server' +run Sinatra::Application \ No newline at end of file diff --git a/info-extraction/lib/fluxnotes_integration.rb b/info-extraction/lib/fluxnotes_integration.rb new file mode 100644 index 0000000..ef2c55b --- /dev/null +++ b/info-extraction/lib/fluxnotes_integration.rb @@ -0,0 +1,11 @@ +require 'json' +require 'byebug' +class FluxNotes + def self.build_structured_phrase(phrase, fields) + raise ArgumentError.new("Phrase needs to be of type 'String'") if(!phrase.instance_of? String) + raise ArgumentError.new("fields needs to be of type 'Array'") if(!fields.instance_of? Array) + fields_data = fields.map{ |field| "{name: '#{field[:name]}', value: '#{field[:value]}'}"} + data = "{phrase: #{phrase}, fields: [#{fields_data.join(", ")}]}" + return "flux_command('insert-structured-phrase', #{data})" + end +end \ No newline at end of file diff --git a/info-extraction/server.rb b/info-extraction/server.rb index f0a5453..cafae05 100644 --- a/info-extraction/server.rb +++ b/info-extraction/server.rb @@ -1,12 +1,13 @@ require 'sinatra' require 'json' -# require 'byebug' +require 'byebug' require_relative "lib/watson4fluxnotes.rb" require_relative "lib/meddra4fluxnotes.rb" require_relative "lib/chunker.rb" require_relative "lib/findings_collector.rb" require_relative "lib/disease_status_extractor.rb" +require_relative "lib/fluxnotes_integration.rb" get '/' do @@ -35,12 +36,13 @@ flux_notes_messages = [] diseaseResults.each do |res| res.each do |concept| - flux_notes_messages << "flux_command('insert-structured-phrase', {phrase:'disease status', fields: [{name:'status', value: #{concept.to_json}}]})" + flux_notes_messages << FluxNotes.build_structured_phrase('disease status', [{name: 'status', value: concept[:status][:normalized]}]) end end toxicityResults.each do |tox| tox['concepts'].each do |concept| - flux_notes_messages << "flux_command('insert-structured-phrase', {phrase:'toxicity', fields: [{name:'adverseEvent', value: '#{concept['text']}'}]})" + flux_notes_messages << FluxNotes.build_structured_phrase('toxicity', [{name: 'adverseEvent', value: concept['text']}]) + # "flux_command('insert-structured-phrase', {phrase:'toxicity', fields: [{name:'adverseEvent', value: '#{concept['text']}'}]})" end end return {