Feb. 19th, 2013
дас ист фантастиш
Feb. 19th, 2013 10:34 am
       try
        {
            if (header.startsWith( PDF_HEADER )) 
            {
                float pdfVersion = Float. parseFloat(
                        header.substring( PDF_HEADER.length(), Math.min( header.length(), PDF_HEADER .length()+3) ) );
                document.setVersion( pdfVersion );
            }
            else 
            {
                float pdfVersion = Float. parseFloat(
                        header.substring( FDF_HEADER.length(), Math.min( header.length(), FDF_HEADER.length()+3) ) );
                document.setVersion( pdfVersion );
            }
        }
        catch ( NumberFormatException e )
        {
            throw new IOException( "Error getting pdf version:" + e );
        } 
(from pdfbox library)
а вот и мой кот
Feb. 19th, 2013 11:57 pm
  /** Extracts text from a PDF by rasterising it and running OCR on the first page.
    *
    * Pipeline: `convert` renders the PDF to a monochrome PNG at 600 dpi, `convert`
    * turns the first page into a TIFF, and `tesseract` writes the recognised text,
    * which is then read back and returned.
    *
    * Intermediate files are obtained from `OS.tempFile`; this method does not delete them.
    *
    * @param pdf the PDF file to process
    * @return the contents of the text file produced by `tesseract`
    * @throws IllegalStateException if an expected intermediate file is missing after its step
    */
  def extractTextFromImagesHiddenInPdf(pdf: File): String = {
    import OS._
    val fName = {
      val n = pdf.getName
      val dot = n lastIndexOf '.'
      // A name without an extension is used as-is instead of failing in substring(0, -1).
      if (dot < 0) n else n.substring(0, dot)
    }
    def withExtension(x: String) = tempFile(fName, x)
    val png = withExtension("png")
    val tif = withExtension("tif")
    // Output base handed to tesseract; presumably tesseract appends ".txt" itself and the
    // result lands on `txt` below -- depends on how OS.tempFile treats an empty extension.
    val noname = withExtension("")
    val txt = withExtension("txt")
    // Runs the command and blocks until it exits, so the existence checks below are meaningful.
    // Arguments are passed as a list, so paths containing spaces are not split apart.
    def exec(cmd: String*): Int = scala.sys.process.Process(cmd).!
    def oops(txt: String, args: AnyRef*) = throw new IllegalStateException(txt.format(args:_*))
    exec("convert", "-density", "600", pdf.toString, "-monochrome", png.toString)
    // If the plain PNG is absent, fall back to the "-0" suffixed name
    // (presumably what convert emits for the first page of a multi-page PDF).
    val firstPage = if (png.exists) png else tempFile(fName + "-0", "png")
    if (!firstPage.exists) oops("Failed to create %s or %s", png, firstPage)
    exec("convert", firstPage.toString, tif.toString)
    if (!tif.exists) oops("Failed to create %s from %s", tif, firstPage)
    exec("tesseract", tif.toString, noname.toString)
    if (!txt.exists) oops("Failed to create %s from %s", txt, tif)
    // Close the file handle once the text has been read.
    val source = Source.fromFile(txt)
    try source.mkString finally source.close()
  }
(и в продакшен)
![[personal profile]](https://www.dreamwidth.org/img/silk/identity/user.png)


