Feb. 20th, 2013
/**
 * Extracts text from a PDF whose content is stored as images, by converting it
 * to a bitmap and running OCR on the result.
 *
 * Steps, each run as an external command through `exec`:
 *  1. `convert -density 600 <pdf> -monochrome <name>.png`
 *  2. `convert <png> <name>.tif`
 *  3. `tesseract <name>.tif <output base>`, after which `<name>.txt` is read back.
 *
 * If `<name>.png` does not exist after step 1, `<name>-0.png` is used instead
 * (presumably the first page of a multi-page document -- confirm against the
 * behavior of `convert`).
 *
 * @param pdf the PDF file to process
 * @return the full contents of the text file produced by the OCR step
 * @throws IllegalStateException if a command reports an error or an expected
 *                               intermediate file is missing
 */
def extractTextFromImagesHiddenInPdf(pdf: File): String = {
  import OS._

  // Base name of the PDF without its extension; a name with no '.' is used whole
  // (substring(0, -1) would otherwise throw).
  val fName = {
    val n = pdf.getName
    val dot = n lastIndexOf '.'
    if (dot < 0) n else n.substring(0, dot)
  }

  // NOTE(review): tempFile is expected to come from OS._ -- its exact naming rules are not visible here.
  def withExtension(x: String) = tempFile(fName, x)
  val png = withExtension("png")
  val tif = withExtension("tif")
  val noname = withExtension("") // output base name handed to tesseract
  val txt = withExtension("txt")

  // Remove leftovers so the `exists` checks below only see files produced by this run.
  png.delete(); tif.delete(); txt.delete()

  // Formats only when arguments are supplied: messages built from command lines or
  // stderr dumps may contain '%', which String.format would choke on.
  def oops(message: String, args: AnyRef*): Nothing =
    throw new IllegalStateException(if (args.isEmpty) message else message.format(args: _*))

  val fail = (errors: Seq[ErrorDetails[String]]) => {
    val command = errors.headOption.map(_.bad).getOrElse("<unknown command>")
    oops("Error executing " + command + "\n" + (errors mkString "\n"))
  }

  exec("convert", "-density", "600", pdf, "-monochrome", png) onError fail
  val firstPage = if (png.exists) png else tempFile(fName + "-0", "png")
  if (!firstPage.exists) oops("Failed to create %s or %s", png, firstPage)

  exec("convert", firstPage, tif) onError fail
  if (!tif.exists) oops("Failed to create %s from %s", tif, firstPage)

  exec("tesseract", tif, noname) onError fail
  if (!txt.exists) oops("Failed to create %s from %s", txt, tif)

  // Close the Source so the file handle is not leaked.
  val source = Source.fromFile(txt)
  try source.mkString finally source.close()
}
where
/**
 * Runs an external command and waits for it to finish.
 *
 * Every argument is converted with `toString`; the first one is the program to run.
 * Standard output is discarded and standard error is collected. Both are drained
 * while the process runs, so a child that writes a lot of output cannot stall on a
 * full pipe buffer (which could happen when waiting for exit before reading).
 *
 * @param cmd program name followed by its arguments
 * @return `Good(command line)` when the exit code is 0, otherwise `Bad` holding a
 *         single `ErrorDetails` with the collected standard error and the command line
 */
def exec(cmd: Any*): Result[String] = {
  import scala.sys.process.{Process, ProcessLogger}

  val args = cmd.map(_.toString)
  val command = args mkString " "

  val errorDump = new StringBuilder
  // scala.sys.process reads stdout and stderr on its own threads and joins them
  // before `!` returns, so errorDump is complete once the exit code is available.
  val logger = ProcessLogger(_ => (), line => errorDump.append(line).append('\n'))
  val code = Process(args).!(logger)

  if (code == 0) Good(command)
  else Bad(List(ErrorDetails(errorDump.toString, command)))
}
where
/**
 * Outcome of an operation: either [[Good]], carrying a value, or [[Bad]],
 * carrying the list of errors that occurred.
 */
sealed trait Result[T] {
  /** Errors attached to this result; always empty for [[Good]]. */
  val listErrors: Seq[ErrorDetails[T]]

  /** True for [[Good]], false for [[Bad]]. */
  def isGood: Boolean = this match {
    case Good(_) => true
    case Bad(_)  => false
  }

  def isBad: Boolean = !isGood

  /** Invokes `op` with the errors of a [[Bad]] result; does nothing for [[Good]]. */
  def onError(op: Seq[ErrorDetails[T]] => Unit): Unit = this match {
    case Bad(errors) => op(errors)
    case Good(_)     => ()
  }
}

/** Successful result wrapping `value`. */
final case class Good[T](value: T) extends Result[T] {
  override val listErrors: Seq[ErrorDetails[T]] = Nil
}

/** Failed result wrapping the errors that caused the failure. */
final case class Bad[T](listErrors: Seq[ErrorDetails[T]]) extends Result[T]

/** A single error: a human-readable description plus the offending item. */
case class ErrorDetails[T](description: String, bad: T) {
  override def toString: String = s"Error: $description in $bad"
}
dumping stuff from stream to file fast
Feb. 20th, 2013 01:51 pm
// Copies everything from the resource stream into myFile using NIO channels.
val out = new FileOutputStream(myFile).getChannel
val in: InputStream = sampleResourcePdfBSBCTX.openStream
val ch = Channels.newChannel(in)
try {
  // Upper bound for one transfer; the actual amount moved is whatever transferFrom returns.
  val chunkSize = 1L << 20
  // transferFrom does not move the target channel's position, so advance it by the
  // number of bytes actually transferred. A transfer of 0 bytes means the source is
  // exhausted, which ends the loop (InputStream.available is not an end-of-stream signal).
  var copied = out.transferFrom(ch, out.position, chunkSize)
  while (copied > 0) {
    out.position(out.position + copied)
    copied = out.transferFrom(ch, out.position, chunkSize)
  }
} finally {
  // Closing the channel also closes the wrapped input stream.
  try ch.close() finally out.close()
}

// NOTE(review): the lines below are the tail of a test whose beginning is outside this excerpt; kept unchanged.
val text = PDF.extractTextFromImagesHiddenInPdf(pdf)
text contains "Claim No.: 30l8507l5lSOX" mustBe true
} }