/*
 * Convert pdf to text.
 *
 * Last modified: TS, 24/6/2009
 *
 * Strips non-printable characters.
 * Checks that filename has extension .pdf and is actually a valid file.
 * HTML entities are not encoded (as this is intended to be used with my
 * search snippet, which will encode the extract anyway).
 *
 * Requires pdftotext as found in xpdf-utils.
 * Requires the docmanager Document class
 * (assets/libs/docmanager/document.class.inc.php).
 *
 * For OnDocFormSave.
 *
 * Expects $modx and $id to be in scope when this code runs
 * (presumably supplied by the MODX plugin event -- confirm against the host).
 */

/* ----- CONFIG ----- */
// Guarded with defined() so that running this code more than once in the
// same request does not trigger "constant already defined" warnings.

// First part of shell command - before filename (includes following space).
defined('SHELL_COMMAND_FIRST') || define('SHELL_COMMAND_FIRST', 'pdftotext ');
// Second part of shell command - after filename (includes preceding space).
defined('SHELL_COMMAND_LAST') || define('SHELL_COMMAND_LAST', ' -');
// modx TV to hold text.
defined('TEXT_TV') || define('TEXT_TV', 'downloadText');
// modx TV that points to PDF file.
defined('PDF_FILE_TV') || define('PDF_FILE_TV', 'download');
/* ----- ------ ----- */

require_once($modx->config['base_path'].'assets/libs/docmanager/document.class.inc.php');

$doc_tvs = $modx->getTemplateVarOutput(array(PDF_FILE_TV), $id);

// Only build a path when the TV actually holds a value; empty() also copes
// with a missing key or a non-array result without raising a notice.
$pdf_full_file_path = !empty($doc_tvs[PDF_FILE_TV])
    ? $modx->config['base_path'].$doc_tvs[PDF_FILE_TV]
    : '';

if ($pdf_full_file_path !== ''
    && strtolower(substr($pdf_full_file_path, -4)) === '.pdf'
    && is_file($pdf_full_file_path)
) {
    // escapeshellarg() quotes the path safely for the shell; wrapping it in
    // literal double quotes would still let `"`, `$(...)` or backticks in the
    // TV value inject arbitrary commands.
    $command = SHELL_COMMAND_FIRST.escapeshellarg($pdf_full_file_path).SHELL_COMMAND_LAST;

    // shell_exec() returns null when the command fails or prints nothing;
    // cast so the TV is simply stored as an empty string in that case.
    $raw_text = (string) shell_exec($command);

    // Drop every byte outside the POSIX [:print:] class.
    // NOTE(review): this also removes newlines, so words on adjacent lines of
    // the pdftotext output get joined together -- confirm that is intended.
    $clean_text = (string) preg_replace('/[^[:print:]]/', '', $raw_text);

    $doc = new Document($id);

    // Use the MODX DB layer for escaping, consistent with the fields below;
    // mysql_real_escape_string() no longer exists as of PHP 7.
    $doc->SetTV(TEXT_TV, $modx->db->escape($clean_text));

    // FOLLOWING CODE IS VERY IMPORTANT FOR SECURITY!!!
    // The untouched text fields are re-escaped before saving (presumably
    // because Document::Save() writes the values back without escaping
    // them itself -- verify against the docmanager class).
    $unchanged_text_tv_fields = array(
        'pagetitle',
        'longtitle',
        'menutitle',
        'introtext',
        'type',
        'contentType',
        'description',
        'alias',
        'content',
        'link_attributes',
    );
    foreach ($unchanged_text_tv_fields as $field) {
        $doc->Set($field, $modx->db->escape($doc->Get($field)));
    }

    $doc->Save();
}