Feb. 19th, 2013
дас ист фантастиш
Feb. 19th, 2013 10:34 am
try { if (header.startsWith( PDF_HEADER )) { float pdfVersion = Float. parseFloat( header.substring( PDF_HEADER.length(), Math.min( header.length(), PDF_HEADER .length()+3) ) ); document.setVersion( pdfVersion ); } else { float pdfVersion = Float. parseFloat( header.substring( FDF_HEADER.length(), Math.min( header.length(), FDF_HEADER.length()+3) ) ); document.setVersion( pdfVersion ); } } catch ( NumberFormatException e ) { throw new IOException( "Error getting pdf version:" + e ); }
(from pdfbox library)
а вот и мой кот
Feb. 19th, 2013 11:57 pm
/**
 * Extracts text from a PDF whose content is stored as images, by shelling out to
 * external tools: `convert` rasterizes the PDF to a monochrome PNG, `convert` again
 * turns the first page into a TIFF, and `tesseract` runs OCR on that TIFF.
 *
 * Intermediate files are obtained from `tempFile` (imported from `OS`), named after
 * the PDF's base name.
 *
 * @param pdf the PDF file to process
 * @return the full contents of the text file written by `tesseract`
 * @throws IllegalStateException if any step does not leave its expected output file behind
 */
def extractTextFromImagesHiddenInPdf(pdf: File): String = {
  import OS._
  import scala.sys.process.Process

  // Base name without the extension; a name with no dot is used as-is
  // (a bare substring(0, -1) would throw StringIndexOutOfBoundsException).
  val fName = {
    val n = pdf.getName
    val dot = n lastIndexOf '.'
    if (dot < 0) n else n.substring(0, dot)
  }

  def withExtension(x: String) = tempFile(fName, x)

  val png = withExtension("png")
  val tif = withExtension("tif")
  val noname = withExtension("") // output base handed to tesseract
  val txt = withExtension("txt")

  // Runs the command and BLOCKS until it exits. Runtime.exec returns immediately,
  // so checking for output files right after it races against the external tool.
  // Arguments are passed as a list, so paths containing spaces are not split.
  // The exit code is returned but not enforced: the file-existence checks below
  // remain the success criterion, as before.
  def exec(cmd: String*): Int = Process(cmd).!

  def oops(txt: String, args: AnyRef*) =
    throw new IllegalStateException(txt.format(args: _*))

  exec("convert", "-density", "600", pdf.toString, "-monochrome", png.toString)

  // If `png` was not produced, fall back to the "-0" suffixed file
  // (presumably what convert emits for the first page of a multi-page PDF).
  val firstPage = if (png.exists) png else tempFile(fName + "-0", "png")
  if (!firstPage.exists) oops("Failed to create %s or %s", png, firstPage)

  exec("convert", firstPage.toString, tif.toString)
  if (!tif.exists) oops("Failed to create %s from %s", tif, firstPage)

  exec("tesseract", tif.toString, noname.toString)
  if (!txt.exists) oops("Failed to create %s from %s", txt, tif)

  // Close the file handle once the text has been read.
  val source = Source.fromFile(txt)
  try source.mkString finally source.close()
}
(и в продакшен)