Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support zip for output of transcription service #253

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 5 additions & 11 deletions backend/app/extraction/ExternalTranscriptionExtractor.scala
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ case class SignedUrl(url: String, key: String)
// Companion object for SignedUrl: exposes an implicit JSON format derived with Json.format,
// so it is found through SignedUrl's implicit scope.
object SignedUrl {
implicit val formats = Json.format[SignedUrl]
}
// Signed upload URLs (OutputBucketUrls) and plain storage keys (OutputBucketKeys) for
// transcription output.
// NOTE(review): each class is declared twice here because this text is a rendered diff without
// +/- markers. Presumably the three-field (text/srt/json) pair is the removed side and the
// single-field `zip` pair is the added side — confirm against the PR before relying on either.
case class OutputBucketUrls(text: SignedUrl, srt: SignedUrl, json: SignedUrl)
case class OutputBucketKeys(text: String, srt: String, json: String)
case class OutputBucketUrls(zip: SignedUrl)
case class OutputBucketKeys(zip: String)
case class TranscriptionJob(id: String, originalFilename: String, inputSignedUrl: String, sentTimestamp: String,
userEmail: String, transcriptDestinationService: String, outputBucketUrls: OutputBucketUrls,
languageCode: String, translate: Boolean, translationOutputBucketUrls: OutputBucketUrls,
Expand Down Expand Up @@ -118,19 +118,13 @@ class ExternalTranscriptionExtractor(index: Index, transcribeConfig: TranscribeC
override def priority = 2

private def getOutputBucketUrls(blobUri: String): Either[Failure, OutputBucketUrls] = {
val srtKey = s"srt/$blobUri.srt"
val jsonKey = s"json/$blobUri.json"
val textKey = s"text/$blobUri.txt"
val zipKey = s"zip/$blobUri.zip"

val bucketUrls = for {
srt <- outputStorage.getUploadSignedUrl(srtKey)
json <- outputStorage.getUploadSignedUrl(jsonKey)
text <- outputStorage.getUploadSignedUrl(textKey)
zip <- outputStorage.getUploadSignedUrl(zipKey)
} yield {
OutputBucketUrls(
SignedUrl(text, textKey),
SignedUrl(srt, srtKey),
SignedUrl(json, jsonKey)
SignedUrl(zip, zipKey),
)
}

Expand Down
53 changes: 38 additions & 15 deletions backend/app/extraction/ExternalTranscriptionWorker.scala
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@ import services.{ObjectStorage, TranscribeConfig}
import utils.Logging
import utils.attempt.{DocumentUpdateFailure, ExternalTranscriptionOutputFailure, Failure, JsonParseFailure}

import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.nio.charset.StandardCharsets
import java.util.zip.ZipInputStream
import scala.concurrent.ExecutionContext
import scala.jdk.CollectionConverters.CollectionHasAsScala
import scala.util.Try
import scala.util.{Success, Try, Failure => ScalaFailure}

case class TranscriptionMessageAttribute(receiveCount: Int, messageGroupId: String)
case class TranscriptionTexts(transcript: String, translation: Option[String])
Expand Down Expand Up @@ -83,20 +85,41 @@ class ExternalTranscriptionWorker(manifest: WorkerManifest, amazonSQSClient: Ama
}

// Resolves the transcript text and, when translation keys are present, the translation text
// for a successful transcription output, returning them as TranscriptionTexts.
// NOTE(review): this span is a rendered diff without +/- markers, so two bodies appear back to
// back (note `val transcript` is defined twice). Presumably the `.text`-key variant is the
// removed side and the `.zip`-key variant is the added side — confirm against the PR.
private def getTranscriptionTexts(transcriptionOutput: TranscriptionOutputSuccess): Either[Failure, TranscriptionTexts] = {
// Variant 1: read the stored text object directly and decode all of its bytes as UTF-8.
val transcript = blobStorage.get(transcriptionOutput.outputBucketKeys.text)

transcript.flatMap { transcriptStream =>
val transcriptText = new String(transcriptStream.readAllBytes(), StandardCharsets.UTF_8)

transcriptionOutput.translationOutputBucketKeys match {
case Some(keys) =>
val translation = blobStorage.get(keys.text)
translation.map { translationStream =>
val text = new String(translationStream.readAllBytes(), StandardCharsets.UTF_8)
TranscriptionTexts(transcriptText, Some(text))
}
case None => Right(TranscriptionTexts(transcriptText, None))
}
// Variant 2: fetch the output zip and extract the "transcript.txt" entry from it.
val transcript = blobStorage.get(transcriptionOutput.outputBucketKeys.zip).flatMap { transcriptStream =>
extractFileFromZip(transcriptStream, "transcript.txt")
}


// The translation zip is searched for the same entry name, "transcript.txt".
val translation = transcriptionOutput.translationOutputBucketKeys match {
case Some(keys) =>
blobStorage.get(keys.zip).flatMap { translationStream =>
extractFileFromZip(translationStream, "transcript.txt")
}.map(Some(_)) // Wrap in Some to indicate translation exists
case None => Right(None) // No translation zip, return None
}

// Both lookups above have already run; the first Left (transcript, then translation) wins.
for {
transcriptText <- transcript
translationText <- translation
} yield TranscriptionTexts(transcriptText, translationText)
}

/**
 * Reads a single named file out of a ZIP archive and returns its contents as text.
 *
 * Entries are scanned in archive order; the first non-directory entry whose name equals
 * `fileName` is decoded as UTF-8. Lines are joined with "\n", so line terminators are
 * normalised and a trailing newline is not preserved.
 *
 * The ZIP stream, and therefore the wrapped `zipStream`, is always closed before this method
 * returns, whether extraction succeeds, the entry is missing, or reading fails.
 *
 * @param zipStream raw bytes of the ZIP archive; consumed and closed by this call
 * @param fileName  exact entry name to look for (compared with `==`)
 * @return Right(contents) when the entry is found, otherwise a Left carrying an
 *         ExternalTranscriptionOutputFailure describing a missing entry or a read error
 */
private def extractFileFromZip(zipStream: InputStream, fileName: String): Either[Failure, String] = {
  val zipInputStream = new ZipInputStream(zipStream, StandardCharsets.UTF_8)

  val extraction = Try {
    try {
      // getNextEntry returns null once the archive is exhausted.
      Iterator.continually(zipInputStream.getNextEntry)
        .takeWhile(_ != null)
        .find(entry => !entry.isDirectory && entry.getName == fileName)
        .map { _ =>
          // The ZIP stream is now positioned on the matching entry; reads stop at the entry's end.
          val reader = new BufferedReader(new InputStreamReader(zipInputStream, StandardCharsets.UTF_8))
          reader.lines().toArray.mkString("\n")
        }
    } finally {
      // Closing the ZIP stream also closes the underlying blob stream, so it is not leaked.
      zipInputStream.close()
    }
  }

  extraction match {
    case Success(Some(content)) => Right(content)
    case Success(None) => Left(ExternalTranscriptionOutputFailure(s"File '$fileName' not found in the ZIP archive"))
    case ScalaFailure(exception) =>
      // getMessage may be null for some exceptions; fall back to the exception's own description.
      val reason = Option(exception.getMessage).getOrElse(exception.toString)
      Left(ExternalTranscriptionOutputFailure(s"Failed to extract '$fileName': $reason"))
  }
}

Expand Down