diff --git a/android/gradle/libs.versions.toml b/android/gradle/libs.versions.toml index 97e53abece..6e1518bb33 100644 --- a/android/gradle/libs.versions.toml +++ b/android/gradle/libs.versions.toml @@ -32,6 +32,7 @@ fhir-sdk-engine = "1.1.0-preview2-SNAPSHOT" fhir-sdk-knowledge = "0.1.0-alpha03-preview5-rc2-SNAPSHOT" fhir-sdk-workflow = "0.1.0-alpha04-preview10-rc1-SNAPSHOT" fragment-ktx = "1.8.3" +generativeai = "0.9.0" glide = "4.16.0" googleCloudSpeech = "2.5.2" gradle = "8.3.2" @@ -129,6 +130,7 @@ fhir-sdk-common = { group = "org.smartregister", name = "common", version.ref = foundation = { group = "androidx.compose.foundation", name = "foundation", version.ref = "compose-ui" } fragment-ktx = { group = "androidx.fragment", name = "fragment-ktx", version.ref = "fragment-ktx" } fragment-testing = { group = "androidx.fragment", name = "fragment-testing", version.ref = "fragment-ktx" } +generativeai = { module = "com.google.ai.client.generativeai:generativeai", version.ref = "generativeai" } glide = { group = "com.github.bumptech.glide", name = "glide", version.ref = "glide" } gms-play-services-location = { group = "com.google.android.gms", name = "play-services-location", version.ref = "playServicesLocation" } google-cloud-speech = { module = "com.google.cloud:google-cloud-speech", version.ref = "googleCloudSpeech" } diff --git a/android/quest/build.gradle.kts b/android/quest/build.gradle.kts index 1cfa8a8413..6d842e494a 100644 --- a/android/quest/build.gradle.kts +++ b/android/quest/build.gradle.kts @@ -488,7 +488,10 @@ dependencies { implementation(libs.androidx.fragment.compose) implementation(libs.bundles.cameraX) implementation(libs.log4j) + + // AI dependencies implementation(libs.google.cloud.speech) + implementation(libs.generativeai) // Annotation processors kapt(libs.hilt.compiler) diff --git a/android/quest/src/main/java/org/smartregister/fhircore/quest/ui/speechtoform/GeminiClient.kt b/android/quest/src/main/java/org/smartregister/fhircore/quest/ui/speechtoform/GeminiClient.kt index 87dcd22aaf..fe2ab90905 100644 --- a/android/quest/src/main/java/org/smartregister/fhircore/quest/ui/speechtoform/GeminiClient.kt +++ b/android/quest/src/main/java/org/smartregister/fhircore/quest/ui/speechtoform/GeminiClient.kt @@ -1,28 +1,49 @@ +/* + * Copyright 2021-2024 Ona Systems, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.smartregister.fhircore.quest.ui.speechtoform -import okhttp3.* -import okhttp3.MediaType.Companion.toMediaType -import org.json.JSONObject -import java.io.IOException +import com.google.ai.client.generativeai.GenerativeModel +import com.google.ai.client.generativeai.type.BlockThreshold +import com.google.ai.client.generativeai.type.HarmCategory +import com.google.ai.client.generativeai.type.RequestOptions +import com.google.ai.client.generativeai.type.SafetySetting +import com.google.ai.client.generativeai.type.generationConfig class GeminiClient(private val apiKey: String) { - private val client = OkHttpClient() - private val baseUrl = "https://generativeai.googleapis.com/v1/models/gemini-1.5-flash:generateContent" - - fun generateContent(prompt: String): String { - val requestBody = JSONObject().put("prompt", prompt).toString() - - val request = Request.Builder() - .url(baseUrl) - .addHeader("Authorization", "Bearer $apiKey") - .post(RequestBody.create("application/json".toMediaType(), requestBody)) - .build() - - client.newCall(request).execute().use { response -> - if (!response.isSuccessful) throw IOException("Unexpected code $response") - - val jsonResponse = JSONObject(response.body?.string() ?: "") - return jsonResponse.getString("text") - } - } + // model usage + // https://developer.android.com/ai/google-ai-client-sdk + private val model = + GenerativeModel( + modelName = "gemini-1.5-flash-001", + // todo actually add the API key + apiKey = "BuildConfig.apikey", + generationConfig = + generationConfig { + temperature = 0.15f + topK = 32 + topP = 1f + maxOutputTokens = 4096 + }, + safetySettings = + listOf( + SafetySetting(HarmCategory.HARASSMENT, BlockThreshold.MEDIUM_AND_ABOVE), + SafetySetting(HarmCategory.HATE_SPEECH, BlockThreshold.MEDIUM_AND_ABOVE), + SafetySetting(HarmCategory.SEXUALLY_EXPLICIT, BlockThreshold.MEDIUM_AND_ABOVE), + SafetySetting(HarmCategory.DANGEROUS_CONTENT, BlockThreshold.MEDIUM_AND_ABOVE), + ), + ) } diff --git a/android/quest/src/main/java/org/smartregister/fhircore/quest/ui/speechtoform/SpeechToForm.kt b/android/quest/src/main/java/org/smartregister/fhircore/quest/ui/speechtoform/SpeechToForm.kt index a75d22a06d..6029277b47 100644 --- a/android/quest/src/main/java/org/smartregister/fhircore/quest/ui/speechtoform/SpeechToForm.kt +++ b/android/quest/src/main/java/org/smartregister/fhircore/quest/ui/speechtoform/SpeechToForm.kt @@ -1,43 +1,63 @@ +/* + * Copyright 2021-2024 Ona Systems, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.smartregister.fhircore.quest.ui.speechtoform -import org.hl7.fhir.r4.model.Questionnaire -import org.hl7.fhir.r4.model.QuestionnaireResponse import java.io.File import java.util.logging.Logger +import org.hl7.fhir.r4.model.Questionnaire +import org.hl7.fhir.r4.model.QuestionnaireResponse class SpeechToForm( - private val speechToText: SpeechToText, - private val textToForm: TextToForm + private val speechToText: SpeechToText, + private val textToForm: TextToForm, ) { - private val logger = Logger.getLogger(SpeechToForm::class.java.name) - - /** - * Reads an audio file, transcribes it, and generates a FHIR QuestionnaireResponse. - * - * @param audioFile The input audio file to process. - * @param questionnaire The FHIR Questionnaire used to generate the response. - * @return The generated QuestionnaireResponse, or null if the process fails. - */ - fun processAudioToQuestionnaireResponse(audioFile: File, questionnaire: Questionnaire): QuestionnaireResponse? { - logger.info("Starting audio transcription process...") - - // Step 1: Transcribe audio to text - val tempTextFile = speechToText.transcribeAudioToText(audioFile) - if (tempTextFile == null) { - logger.severe("Failed to transcribe audio.") - return null - } - logger.info("Transcription successful. File path: ${tempTextFile.absolutePath}") - - // Step 2: Generate QuestionnaireResponse from the transcript - val questionnaireResponse = textToForm.generateQuestionnaireResponse(tempTextFile, questionnaire) - if (questionnaireResponse == null) { - logger.severe("Failed to generate QuestionnaireResponse.") - return null - } - - logger.info("QuestionnaireResponse generated successfully.") - return questionnaireResponse + private val logger = Logger.getLogger(SpeechToForm::class.java.name) + + /** + * Reads an audio file, transcribes it, and generates a FHIR QuestionnaireResponse. + * + * @param audioFile The input audio file to process. + * @param questionnaire The FHIR Questionnaire used to generate the response. + * @return The generated QuestionnaireResponse, or null if the process fails. + */ + fun processAudioToQuestionnaireResponse( + audioFile: File, + questionnaire: Questionnaire, + ): QuestionnaireResponse? { + logger.info("Starting audio transcription process...") + + // Step 1: Transcribe audio to text + val tempTextFile = speechToText.transcribeAudioToText(audioFile) + if (tempTextFile == null) { + logger.severe("Failed to transcribe audio.") + return null } + logger.info("Transcription successful. File path: ${tempTextFile.absolutePath}") + + // Step 2: Generate QuestionnaireResponse from the transcript + val questionnaireResponse = + textToForm.generateQuestionnaireResponse(tempTextFile, questionnaire) + if (questionnaireResponse == null) { + logger.severe("Failed to generate QuestionnaireResponse.") + return null + } + + logger.info("QuestionnaireResponse generated successfully.") + return questionnaireResponse + } } diff --git a/android/quest/src/main/java/org/smartregister/fhircore/quest/ui/speechtoform/SpeechToText.kt b/android/quest/src/main/java/org/smartregister/fhircore/quest/ui/speechtoform/SpeechToText.kt index 1e6af65ca3..6a3d3760bb 100644 --- a/android/quest/src/main/java/org/smartregister/fhircore/quest/ui/speechtoform/SpeechToText.kt +++ b/android/quest/src/main/java/org/smartregister/fhircore/quest/ui/speechtoform/SpeechToText.kt @@ -1,57 +1,75 @@ +/* + * Copyright 2021-2024 Ona Systems, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.smartregister.fhircore.quest.ui.speechtoform import com.google.cloud.speech.v1.RecognitionAudio import com.google.cloud.speech.v1.RecognitionConfig +import com.google.cloud.speech.v1.RecognitionConfig.AudioEncoding import com.google.cloud.speech.v1.SpeechClient import com.google.cloud.speech.v1.SpeechRecognitionResult -import com.google.cloud.speech.v1.RecognitionConfig.AudioEncoding import java.io.File import java.util.logging.Logger class SpeechToText { - private val logger = Logger.getLogger(SpeechToText::class.java.name) - - /** - * Transcribes an audio file to text using Google Cloud Speech-to-Text API and writes it to a - * temporary file. - * - * @param audioFile The audio file to be transcribed. - * @return The temporary file containing the transcribed text. - */ - fun transcribeAudioToText(audioFile: File): File? { - var tempFile: File? = null - - SpeechClient.create().use { speechClient -> - val audioBytes = audioFile.readBytes() - - // Build the recognition audio - val recognitionAudio = RecognitionAudio.newBuilder() - .setContent(com.google.protobuf.ByteString.copyFrom(audioBytes)) - .build() - - // Configure recognition settings - val config = RecognitionConfig.newBuilder() - .setEncoding(AudioEncoding.LINEAR16) - .setSampleRateHertz(16000) - .setLanguageCode("en-US") - .build() - - // Perform transcription - val response = speechClient.recognize(config, recognitionAudio) - val transcription = response.resultsList.joinToString(" ") { - result: SpeechRecognitionResult -> - result.alternativesList[0].transcript - } - - logger.info("Transcription: $transcription") - - // Write transcription to a temporary file - tempFile = File.createTempFile("transcription", ".txt") - tempFile?.writeText(transcription) - - logger.info("Transcription written to temporary file. ") + private val logger = Logger.getLogger(SpeechToText::class.java.name) + + /** + * Transcribes an audio file to text using Google Cloud Speech-to-Text API and writes it to a + * temporary file. + * + * @param audioFile The audio file to be transcribed. + * @return The temporary file containing the transcribed text. + */ + fun transcribeAudioToText(audioFile: File): File? { + var tempFile: File? = null + + SpeechClient.create().use { speechClient -> + val audioBytes = audioFile.readBytes() + + // Build the recognition audio + val recognitionAudio = + RecognitionAudio.newBuilder() + .setContent(com.google.protobuf.ByteString.copyFrom(audioBytes)) + .build() + + // Configure recognition settings + val config = + RecognitionConfig.newBuilder() + .setEncoding(AudioEncoding.LINEAR16) + .setSampleRateHertz(16000) + .setLanguageCode("en-US") + .build() + + // Perform transcription + val response = speechClient.recognize(config, recognitionAudio) + val transcription = + response.resultsList.joinToString(" ") { result: SpeechRecognitionResult -> + result.alternativesList[0].transcript } - return tempFile + + logger.info("Transcription: $transcription") + + // Write transcription to a temporary file + tempFile = File.createTempFile("transcription", ".txt") + tempFile?.writeText(transcription) + + logger.info("Transcription written to temporary file. ") } + return tempFile + } } diff --git a/android/quest/src/main/java/org/smartregister/fhircore/quest/ui/speechtoform/TextToForm.kt b/android/quest/src/main/java/org/smartregister/fhircore/quest/ui/speechtoform/TextToForm.kt index cd50f4923c..e200722f02 100644 --- a/android/quest/src/main/java/org/smartregister/fhircore/quest/ui/speechtoform/TextToForm.kt +++ b/android/quest/src/main/java/org/smartregister/fhircore/quest/ui/speechtoform/TextToForm.kt @@ -1,52 +1,69 @@ +/* + * Copyright 2021-2024 Ona Systems, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.smartregister.fhircore.quest.ui.speechtoform import ca.uhn.fhir.interceptor.model.RequestPartitionId.fromJson -import org.hl7.fhir.r4.model.Questionnaire -import org.hl7.fhir.r4.model.QuestionnaireResponse import java.io.File import java.util.logging.Logger +import org.hl7.fhir.r4.model.Questionnaire +import org.hl7.fhir.r4.model.QuestionnaireResponse import org.json.JSONObject class TextToForm(private val geminiClient: GeminiClient) { - private val logger = Logger.getLogger(TextToForm::class.java.name) + private val logger = Logger.getLogger(TextToForm::class.java.name) - /** - * Generates an HL7 FHIR QuestionnaireResponse from a transcript using the provided Questionnaire. - * - * @param transcriptFile The temporary file containing the transcript text. - * @param questionnaire The FHIR Questionnaire to base the response on. - * @return The generated and validated QuestionnaireResponse or null if generation fails. - */ - fun generateQuestionnaireResponse(transcriptFile: File, questionnaire: Questionnaire): QuestionnaireResponse? { - val transcript = transcriptFile.readText() - val prompt = buildPrompt(transcript, questionnaire) + /** + * Generates an HL7 FHIR QuestionnaireResponse from a transcript using the provided Questionnaire. + * + * @param transcriptFile The temporary file containing the transcript text. + * @param questionnaire The FHIR Questionnaire to base the response on. + * @return The generated and validated QuestionnaireResponse or null if generation fails. + */ + fun generateQuestionnaireResponse( + transcriptFile: File, + questionnaire: Questionnaire, + ): QuestionnaireResponse? { + val transcript = transcriptFile.readText() + val prompt = buildPrompt(transcript, questionnaire) - logger.info("Sending request to Gemini...") - val generatedText = geminiClient.generateContent(prompt) + logger.info("Sending request to Gemini...") + val generatedText = geminiClient.generateContent(prompt) - val questionnaireResponseJson = extractJsonBlock(generatedText) ?: return null + val questionnaireResponseJson = extractJsonBlock(generatedText) ?: return null - return try { - val questionnaireResponse = parseQuestionnaireResponse(questionnaireResponseJson) - if (validateQuestionnaireResponse(questionnaireResponse)) { - logger.info("QuestionnaireResponse validated successfully.") - questionnaireResponse - } else { - logger.warning("QuestionnaireResponse validation failed.") - null - } - } catch (e: Exception) { - logger.severe("Error generating QuestionnaireResponse: ${e.message}") - null - } + return try { + val questionnaireResponse = parseQuestionnaireResponse(questionnaireResponseJson) + if (validateQuestionnaireResponse(questionnaireResponse)) { + logger.info("QuestionnaireResponse validated successfully.") + questionnaireResponse + } else { + logger.warning("QuestionnaireResponse validation failed.") + null + } + } catch (e: Exception) { + logger.severe("Error generating QuestionnaireResponse: ${e.message}") + null } + } - /** - * Builds the prompt for the Gemini model. - */ - private fun buildPrompt(transcript: String, questionnaire: Questionnaire): String { - return """ + /** Builds the prompt for the Gemini model. */ + private fun buildPrompt(transcript: String, questionnaire: Questionnaire): String { + return """ Using the following transcript of a conversation between a nurse and a patient: $transcript @@ -54,34 +71,27 @@ class TextToForm(private val geminiClient: GeminiClient) { Generate an HL7 FHIR QuestionnaireResponse as if they had entered that information into the following FHIR Questionnaire: $questionnaire - """.trimIndent() - } + """ + .trimIndent() + } - /** - * Extracts the JSON block from the generated text. - */ - private fun extractJsonBlock(responseText: String): String? { - val start = responseText.indexOf("```json") - if (start == -1) return null - val end = responseText.indexOf("```", start + 7) - return if (end == -1) null else responseText.substring(start + 7, end).trim() - } + /** Extracts the JSON block from the generated text. */ + private fun extractJsonBlock(responseText: String): String? { + val start = responseText.indexOf("```json") + if (start == -1) return null + val end = responseText.indexOf("```", start + 7) + return if (end == -1) null else responseText.substring(start + 7, end).trim() + } - /** - * Parses the JSON string into a QuestionnaireResponse object. - */ - private fun parseQuestionnaireResponse(json: String): QuestionnaireResponse { - return QuestionnaireResponse().apply { - fromJson(JSONObject(json).toString()) - } - } + /** Parses the JSON string into a QuestionnaireResponse object. */ + private fun parseQuestionnaireResponse(json: String): QuestionnaireResponse { + return QuestionnaireResponse().apply { fromJson(JSONObject(json).toString()) } + } - /** - * Validates the QuestionnaireResponse structure. - */ - private fun validateQuestionnaireResponse(qr: QuestionnaireResponse): Boolean { - //todo use SDC validation + /** Validates the QuestionnaireResponse structure. */ + private fun validateQuestionnaireResponse(qr: QuestionnaireResponse): Boolean { + // todo use SDC validation - return true - } + return true + } }