FluxNotes · dtkirsch · Aug 27, 2018 · Aug 27, 2018 · Aug 27, 2018 · Aug 27, 2018
diff --git a/info-extraction/Gemfile b/info-extraction/Gemfile
@@ -5,4 +5,4 @@ gem 'json'
 gem 'trollop'
 gem 'standoff'
 
-# gem 'byebug'
+gem 'byebug'
diff --git a/info-extraction/Gemfile.lock b/info-extraction/Gemfile.lock
@@ -1,6 +1,7 @@
 GEM
   remote: https://rubygems.org/
   specs:
+    byebug (10.0.2)
     json (2.1.0)
     mustermann (1.0.2)
     rack (2.0.5)
@@ -19,10 +20,11 @@ PLATFORMS
   ruby
 
 DEPENDENCIES
+  byebug
   json
   sinatra
   standoff
   trollop
 
 BUNDLED WITH
-   1.16.1
+   1.16.4
diff --git a/info-extraction/config.ru b/info-extraction/config.ru
@@ -0,0 +1,3 @@
+$LOAD_PATH.unshift(File.dirname(__FILE__)) # put this file's directory first on the load path
+require 'server' # bare-name require, resolved through the load path entry added above
+run Sinatra::Application
diff --git a/info-extraction/lib/chunker.rb b/info-extraction/lib/chunker.rb
@@ -19,14 +19,8 @@ def run(document)
                             /side effects?/i
                           ]
                         when :disease_status
-                          [
-                            /status/i,
-                            /progressing/i,
-                            /stable/i,
-                            /getting worse/i,
-                            /worsening/i,
-                            /improving/i,
-                            /getting better/i
+                          [ # TODO: this is dopey. duplication of search expressions between here and the status_value patterns in DiseaseStatusExtractor should be abstracted and merged.
+                            /((not? )|(complete ))?(stable|progressing|responding|response( to treatment)?|resection|inevaluable|changed?|getting worse|worsening|getting better|improving)/i
                           ]
                         else
                           raise "Unknown chunking target: #{target}"

diff --git a/info-extraction/lib/disease_status_extractor.rb b/info-extraction/lib/disease_status_extractor.rb
@@ -3,38 +3,122 @@
 =begin
 This is a really naive information extractor to detect assertions about disease statuses.
 
-All it does is tag mentions of a list of disease-status-related keywords (disease, cancer, status), tag mentions of disease-status-related value words (e.g. stable, improving). For each keyword, it checks the next tag to the right, and if that tag is a status value, it returns the value.
+All it does is tag mentions of a list of disease-status-related keywords (disease, cancer, status), tag mentions of disease-status-related value words (e.g. stable, improving), and tag mentions of rationale concepts (e.g. imaging, scans, symptoms). For each keyword, it checks the next tag to the right, and if that tag is a status value, it returns the value. It also returns and normalizes all rationale mentions found within the chunk along with that status assertion.
 
 So, it will assert "stable" for the phrase "your cancer is stable."
 
 It will also do the right thing in slightly more complicated cases, for example, still correctly asserting "stable" and also correctly NOT assert "progressing" for the phrase "your disease is um stable it is not progressing."
 
 =end
 
+=begin
+Example output:
+[
+    {
+	"disease": null,
+	"status": {
+	    "mention_text": "not changing",
+	    "normalized": "Stable"
+	},
+	"rationale": [
+	    {
+		"mention_text": "CT scans",
+		"normalized": "Imaging"
+	    }
+	]
+    },
+    {
+	"disease": null,
+	"status": {
+	    "mention_text": "progressing",
+	    "normalized": "Progressing"
+	},
+	"rationale": [
+	    {
+		"mention_text": "CT scans",
+		"normalized": "Imaging"
+	    },
+	    {
+		"mention_text": "physical exam",
+		"normalized": "Physical Exam"
+	    }
+	]
+    }
+]
+=end
+
 class DiseaseStatusExtractor
   def analyze_text (text)
     annotated = Standoff::AnnotatedString.new( :signal => text, :tags => [])
 
     # this would be just key.match document, but we want MatchData for possible multiple keyword matches, not just one
-    annotated.signal.to_enum(:scan, /status|disease|cancer/).map{ Regexp.last_match }.each do |match|
+    annotated.signal.to_enum(:scan, /status|disease|cancer/i).map{ Regexp.last_match }.each do |match|
       annotated.tags << Standoff::Tag.new(:content => match[0],
                                           :name => "disease_status_key",
                                           :start => match.begin(0),
                                           :end => match.end(0) )
     end
-
-    annotated.signal.to_enum(:scan, /(not )?(stable|progressing|getting worse|worsening|getting better|improving)/).map{ Regexp.last_match }.each do |match|
+
+    # TODO: this is dopey. duplication of search expressions between here and the disease_status patterns in Chunker should be abstracted and merged.
+    annotated.signal.to_enum(:scan, /((not? )|(complete ))?(stable|progressing|responding|response( to treatment)?|resection|inevaluable|changed?|getting worse|worsening|getting better|improving)/i).map{ Regexp.last_match }.each do |match|
+      mention_text = match[0]
+      mapped_for_normalization = case mention_text # be very careful with this. the first matching `when` wins, so the order of expressions here matters a lot.
+                                 when /^not? /i # any negated mention ("not progressing", "no response") is reported as stable
+                                   "stable"
+                                 when /getting worse|worsening|progress/i
+                                   "progressing"
+                                 when /complete resection/i, /complete response/i
+                                   $& # the matched phrase itself is the normalized concept
+                                 when /getting better|improving|responding|response to treatment/i # "responding" is tagged by the scan above, so it needs a normalized form too
+                                   "responding"
+                                 when /inevaluable/i, /stable/i # "progressing" never reaches this branch; /progress/ above already claims it
+                                   $&
+                                 else
+                                   nil # bare "response", "resection" and "change(d)" get a tag but no normalized value
+                                 end
+      normalized = mapped_for_normalization ? mapped_for_normalization.split(/ |\_/).map(&:capitalize).join(" ") : nil#cap each word
       annotated.tags << Standoff::Tag.new(:content => match[0],
                                           :name => "status_value",
+                                          :attributes => {:normalized => normalized},
                                           :start => match.begin(0),
                                           :end => match.end(0) )
     end
 
+    annotated.signal.to_enum(:scan, /((ca?t scan|mri|x-ray|x ray|imaging)( results)?)/i).map{ Regexp.last_match }.each do |match|
+      annotated.tags << Standoff::Tag.new(:content => match[0],
+                                          :name => "status_rationale",
+                                          :attributes => {:normalized => "Imaging"},
+                                          :start => match.begin(0),
+                                          :end => match.end(0) )
+    end
+    annotated.signal.to_enum(:scan, /(pathology|symptoms|(physical )?exam|markers)/i).map{ Regexp.last_match }.each do |match|
+      mention_text = match[0] # text of this rationale mention, read from the block's MatchData
+      mention_text = "physical exam" if mention_text.casecmp("exam").zero? # the scan is case-insensitive, so "Exam" must expand as well
+      normalized = mention_text.split(/ |\_/).map(&:capitalize).join(" ") #cap each word
+      annotated.tags << Standoff::Tag.new(:content => mention_text, # content may be the expanded text; start/end still span the original mention
+                                          :name => "status_rationale",
+                                          :attributes => {:normalized => normalized},
+                                          :start => match.begin(0),
+                                          :end => match.end(0) )
+    end
+
+
+
+
     disease_status_assertions = []
+
     annotated.tags.select{|tag| tag.name == "disease_status_key"}.each do |key_tag|
       next_tag = annotated.next_tag key_tag
-      if next_tag.name == "status_value"
-        disease_status_assertions << next_tag.content
+      if next_tag && (next_tag.name == "status_value")
+        disease_status_assertions << {
+          :disease => nil,
+          :rationale => annotated.tags.select{|tag| tag.name == "status_rationale"}.map do|tag|
+            {:mention_text => tag.content, :normalized => tag.attributes[:normalized]}
+          end.uniq,
+          :status => { :mention_text => next_tag.content,
+                       :normalized => next_tag.attributes[:normalized]}
+
+        }
       end
     end
 

diff --git a/info-extraction/lib/fluxnotes_integration.rb b/info-extraction/lib/fluxnotes_integration.rb
@@ -0,0 +1,11 @@
+require 'json'
+require 'byebug'
+class FluxNotes
+    # Returns the flux_command string inserting `phrase` (emitted single-quoted, like the field values) with `fields`, an Array of hashes read via [:name] and [:value]; raises ArgumentError for any other argument types.
+    def self.build_structured_phrase(phrase, fields)
+        raise ArgumentError.new("Phrase needs to be of type 'String'") unless phrase.instance_of? String
+        raise ArgumentError.new("fields needs to be of type 'Array'") unless fields.instance_of? Array
+        fields_data = fields.map { |field| "{name: '#{field[:name]}', value: '#{field[:value]}'}" }
+        "flux_command('insert-structured-phrase', {phrase: '#{phrase}', fields: [#{fields_data.join(', ')}]})"
+    end
+end
diff --git a/info-extraction/lib/watson4fluxnotes.rb b/info-extraction/lib/watson4fluxnotes.rb
@@ -32,7 +32,6 @@ def filter_to_desired_categories (results_hash)
     # we only care about entities of type "HealthCondition" (though that may be expanded later)
     results_hash["entities"].select!{|e| e["type"] == "HealthCondition"}
     # and concepts of type "Disease" (though that may be expanded later)
-    #TODO: when we have real DBpedia type checking available, has_relevant_dbpedia_concept_type should be deleted and replaced
     results_hash["concepts"].select!{|c| has_relevant_dbpedia_concept_type c}
     results_hash
   end
@@ -45,35 +44,16 @@ def has_relevant_dbpedia_concept_type ( concept )
       "http://umbel.org/umbel/rc/AilmentCondition"
     ]
 
-
     types = DBPedia.loadDBPediaDataType(concept['dbpedia_resource'])
 
     if types == nil 
       return false
     end
-    # require 'byebug'
-    #     byebug
-
-    if (types.map{|t| t['value']} & types_we_care_about).length == 0
-       return false
-    end
 
-    # this is a manually curated, example-specific list based on checking DBpedia pages. will be replaced with automated DBpedia queries.
-    concepts_known_not_to_be_disease = [
-      "Chemotherapy",
-      "Pharmacology",
-      "2006 albums",
-      "2008 singles",
-      "HIV",
-      "Pain", # probably we want to catch this one. following the rules for now though.
-      "Pharmaceutical drug",
-      "Prescription drug"
-    ]
-
-    if concepts_known_not_to_be_disease.include? concept["text"]
-      return false
+    if (types.map{|t| t['value']} & types_we_care_about).length > 0
+       return true
     else
-      return true
+      return false
     end
   end
 

diff --git a/info-extraction/server.rb b/info-extraction/server.rb
@@ -1,12 +1,13 @@
 require 'sinatra'
 require 'json'
-# require 'byebug'
+require 'byebug'
 
 require_relative "lib/watson4fluxnotes.rb"
 require_relative "lib/meddra4fluxnotes.rb"
 require_relative "lib/chunker.rb"
 require_relative "lib/findings_collector.rb"
 require_relative "lib/disease_status_extractor.rb"
+require_relative "lib/fluxnotes_integration.rb"
 
 
 get '/' do 
@@ -33,17 +34,20 @@
     end
     content_type :json
     flux_notes_messages = []
-    diseaseResults.each do |res| 
-        flux_notes_messages << "flux_command('insert-structured-phrase', {phrase:'disease status', fields: [{name:'status', value: '#{res[0]}'}]})"
+    diseaseResults.each do |res|
+        res.each do |concept|
+          flux_notes_messages << FluxNotes.build_structured_phrase('disease status', [{name: 'status', value: concept[:status][:normalized]}])
+        end
     end
     toxicityResults.each do |tox| 
         tox['concepts'].each do |concept| 
-            flux_notes_messages << "flux_command('insert-structured-phrase', {phrase:'toxicity', fields: [{name:'adverseEvent', value: '#{concept['text']}'}]})"
+            flux_notes_messages << FluxNotes.build_structured_phrase('toxicity', [{name: 'adverseEvent', value: concept['text']}])
+            # "flux_command('insert-structured-phrase', {phrase:'toxicity', fields: [{name:'adverseEvent', value: '#{concept['text']}'}]})"
         end
     end
     return {
         diseaseStatus: diseaseResults,
         toxicity: toxicityResults,
         fluxCommands: flux_notes_messages.uniq
     }.to_json
-end
+end