diff --git a/info-extraction/Gemfile b/info-extraction/Gemfile index 3bc00e2..4f0a537 100644 --- a/info-extraction/Gemfile +++ b/info-extraction/Gemfile @@ -5,4 +5,4 @@ gem 'json' gem 'trollop' gem 'standoff' -# gem 'byebug' \ No newline at end of file +gem 'byebug' \ No newline at end of file diff --git a/info-extraction/Gemfile.lock b/info-extraction/Gemfile.lock index d8d7629..2e599ba 100644 --- a/info-extraction/Gemfile.lock +++ b/info-extraction/Gemfile.lock @@ -1,6 +1,7 @@ GEM remote: https://rubygems.org/ specs: + byebug (10.0.2) json (2.1.0) mustermann (1.0.2) rack (2.0.5) @@ -19,10 +20,11 @@ PLATFORMS ruby DEPENDENCIES + byebug json sinatra standoff trollop BUNDLED WITH - 1.16.1 + 1.16.4 diff --git a/info-extraction/config.ru b/info-extraction/config.ru new file mode 100644 index 0000000..1fac37d --- /dev/null +++ b/info-extraction/config.ru @@ -0,0 +1,3 @@ +$:.unshift(File.dirname(__FILE__)) +require 'server' +run Sinatra::Application \ No newline at end of file diff --git a/info-extraction/lib/chunker.rb b/info-extraction/lib/chunker.rb index c7e0419..6e06fe7 100644 --- a/info-extraction/lib/chunker.rb +++ b/info-extraction/lib/chunker.rb @@ -19,14 +19,8 @@ def run(document) /side effects?/i ] when :disease_status - [ - /status/i, - /progressing/i, - /stable/i, - /getting worse/i, - /worsening/i, - /improving/i, - /getting better/i + [ # TODO: this is dopey. duplication of search expressions between here and the status_value patterns in DiseaseStatusExtractor should be abstracted and merged. + /((not? )|(complete ))?(stable|progressing|responding|response( to treatment)?|resection|inevaluable|changed?|getting worse|worsening|getting better|improving)/i ] else raise "Unknown chunking target: #{target}" diff --git a/info-extraction/lib/disease_status_extractor.rb b/info-extraction/lib/disease_status_extractor.rb index 0290848..260214d 100644 --- a/info-extraction/lib/disease_status_extractor.rb +++ b/info-extraction/lib/disease_status_extractor.rb @@ -3,7 +3,7 @@ =begin This is a really naive information extractor to detect assertions about disease statuses. -All it does is tag mentions of a list of disease-status-related keywords (disease, cancer, status), tag mentions of disease-status-related value words (e.g. stable, improving). For each keyword, it checks the next tag to the right, and if that tag is a status value, it returns the value. +All it does is tag mentions of a list of disease-status-related keywords (disease, cancer, status), tag mentions of disease-status-related value words (e.g. stable, improving), and tag mentions of rationale concepts (e.g. imaging, scans, symptoms). For each keyword, it checks the next tag to the right, and if that tag is a status value, it returns the value. It also returns and normalizes all rationale mentions found within the chunk along with that status assertion. So, it will assert "stable" for the phrase "your cancer is stable." @@ -11,30 +11,114 @@ =end +=begin +Example output: +[ + { + "disease": null, + "status": { + "mention text": "not changing", + "normalized": "stable" + }, + "rationale": [ + { + "mention text": "CT scans", + "normalized": "Imaging" + } + ] + }, + { + "disease": null, + "status": { + "mention text": "progressing", + "normalized": "progressing" + }, + "rationale": [ + { + "mention text": "CT scans", + "normalized": "Imaging" + }, + { + "mention text": "physical exam", + "normalized": "Physical Exam" + } + ] + } +] +=end + class DiseaseStatusExtractor def analyze_text (text) annotated = Standoff::AnnotatedString.new( :signal => text, :tags => []) # this would be just key.match document, but we want MatchData for possible multiple keyword matches, not just one - annotated.signal.to_enum(:scan, /status|disease|cancer/).map{ Regexp.last_match }.each do |match| + annotated.signal.to_enum(:scan, /status|disease|cancer/i).map{ Regexp.last_match }.each do |match| annotated.tags << Standoff::Tag.new(:content => match[0], :name => "disease_status_key", :start => match.begin(0), :end => match.end(0) ) end - - annotated.signal.to_enum(:scan, /(not )?(stable|progressing|getting worse|worsening|getting better|improving)/).map{ Regexp.last_match }.each do |match| + + # TODO: this is dopey. duplication of search expressions between here and the disease_status patterns in Chunker should be abstracted and merged. + annotated.signal.to_enum(:scan, /((not? )|(complete ))?(stable|progressing|responding|response( to treatment)?|resection|inevaluable|changed?|getting worse|worsening|getting better|improving)/i).map{ Regexp.last_match }.each do |match| + mention_text = match[0] + mapped_for_normalization = case mention_text # be very careful with this. it evaluates greedily, and as such the order of expressions here matters a lot. + when /^not? /i + "stable" + when /getting worse|worsening|progress/i + "progressing" + when /complete resection/i, /complete response/i + $& + when /getting better|improving|response to treatment/i + "responding" + when /inevaluable/i, /stable/i, /progressing/i + $& + else + nil + end + normalized = mapped_for_normalization ? mapped_for_normalization.split(/ |\_/).map(&:capitalize).join(" ") : nil#cap each word annotated.tags << Standoff::Tag.new(:content => match[0], :name => "status_value", + :attributes => {:normalized => normalized}, :start => match.begin(0), :end => match.end(0) ) end + annotated.signal.to_enum(:scan, /((ca?t scan|mri|x-ray|x ray|imaging)( results)?)/i).map{ Regexp.last_match }.each do |match| + annotated.tags << Standoff::Tag.new(:content => match[0], + :name => "status_rationale", + :attributes => {:normalized => "Imaging"}, + :start => match.begin(0), + :end => match.end(0) ) + end + annotated.signal.to_enum(:scan, /(pathology|symptoms|(physical )?exam|markers)/i).map{ Regexp.last_match }.each do |match| + mention_text = m[0] + mention_text = "physical exam" if mention_text == "exam" + normalized = mention_text.split(/ |\_/).map(&:capitalize).join(" ") #cap each word + annotated.tags << Standoff::Tag.new(:content => mention_text, + :name => "status_rationale", + :attributes => {:normalized => normalized}, + :start => match.begin(0), + :end => match.end(0) ) + end + + + + disease_status_assertions = [] + annotated.tags.select{|tag| tag.name == "disease_status_key"}.each do |key_tag| next_tag = annotated.next_tag key_tag - if next_tag.name == "status_value" - disease_status_assertions << next_tag.content + if next_tag && (next_tag.name == "status_value") + disease_status_assertions << { + :disease => nil, + :rationale => annotated.tags.select{|tag| tag.name == "status_rationale"}.map do|tag| + {:mention_text => tag.content, :normalized => tag.attributes[:normalized]} + end.uniq, + :status => { :mention_text => next_tag.content, + :normalized => next_tag.attributes[:normalized]} + + } end end diff --git a/info-extraction/lib/fluxnotes_integration.rb b/info-extraction/lib/fluxnotes_integration.rb new file mode 100644 index 0000000..ef2c55b --- /dev/null +++ b/info-extraction/lib/fluxnotes_integration.rb @@ -0,0 +1,11 @@ +require 'json' +require 'byebug' +class FluxNotes + def self.build_structured_phrase(phrase, fields) + raise ArgumentError.new("Phrase needs to be of type 'String'") if(!phrase.instance_of? String) + raise ArgumentError.new("fields needs to be of type 'Array'") if(!fields.instance_of? Array) + fields_data = fields.map{ |field| "{name: '#{field[:name]}', value: '#{field[:value]}'}"} + data = "{phrase: #{phrase}, fields: [#{fields_data.join(", ")}]}" + return "flux_command('insert-structured-phrase', #{data})" + end +end \ No newline at end of file diff --git a/info-extraction/lib/watson4fluxnotes.rb b/info-extraction/lib/watson4fluxnotes.rb index e0d6b53..b38af3b 100644 --- a/info-extraction/lib/watson4fluxnotes.rb +++ b/info-extraction/lib/watson4fluxnotes.rb @@ -32,7 +32,6 @@ def filter_to_desired_categories (results_hash) # we only care about entities of type "HealthCondition" (though that may be expanded later) results_hash["entities"].select!{|e| e["type"] == "HealthCondition"} # and concepts of type "Disease" (though that may be expanded later) - #TODO: when we have real DBpedia type checking available, has_relevant_dbpedia_concept_type should be deleted and replaced results_hash["concepts"].select!{|c| has_relevant_dbpedia_concept_type c} results_hash end @@ -45,35 +44,16 @@ def has_relevant_dbpedia_concept_type ( concept ) "http://umbel.org/umbel/rc/AilmentCondition" ] - types = DBPedia.loadDBPediaDataType(concept['dbpedia_resource']) if types == nil return false end - # require 'byebug' - # byebug - - if (types.map{|t| t['value']} & types_we_care_about).length == 0 - return false - end - # this is a manually curated, example-specific list based on checking DBpedia pages. will be replaced with automated DBpedia queries. - concepts_known_not_to_be_disease = [ - "Chemotherapy", - "Pharmacology", - "2006 albums", - "2008 singles", - "HIV", - "Pain", # probably we want to catch this one. following the rules for now though. - "Pharmaceutical drug", - "Prescription drug" - ] - - if concepts_known_not_to_be_disease.include? concept["text"] - return false + if (types.map{|t| t['value']} & types_we_care_about).length > 0 + return true else - return true + return false end end diff --git a/info-extraction/server.rb b/info-extraction/server.rb index f838fa1..cafae05 100644 --- a/info-extraction/server.rb +++ b/info-extraction/server.rb @@ -1,12 +1,13 @@ require 'sinatra' require 'json' -# require 'byebug' +require 'byebug' require_relative "lib/watson4fluxnotes.rb" require_relative "lib/meddra4fluxnotes.rb" require_relative "lib/chunker.rb" require_relative "lib/findings_collector.rb" require_relative "lib/disease_status_extractor.rb" +require_relative "lib/fluxnotes_integration.rb" get '/' do @@ -33,12 +34,15 @@ end content_type :json flux_notes_messages = [] - diseaseResults.each do |res| - flux_notes_messages << "flux_command('insert-structured-phrase', {phrase:'disease status', fields: [{name:'status', value: '#{res[0]}'}]})" + diseaseResults.each do |res| + res.each do |concept| + flux_notes_messages << FluxNotes.build_structured_phrase('disease status', [{name: 'status', value: concept[:status][:normalized]}]) + end end toxicityResults.each do |tox| tox['concepts'].each do |concept| - flux_notes_messages << "flux_command('insert-structured-phrase', {phrase:'toxicity', fields: [{name:'adverseEvent', value: '#{concept['text']}'}]})" + flux_notes_messages << FluxNotes.build_structured_phrase('toxicity', [{name: 'adverseEvent', value: concept['text']}]) + # "flux_command('insert-structured-phrase', {phrase:'toxicity', fields: [{name:'adverseEvent', value: '#{concept['text']}'}]})" end end return { @@ -46,4 +50,4 @@ toxicity: toxicityResults, fluxCommands: flux_notes_messages.uniq }.to_json -end \ No newline at end of file +end