PHP 提取PDF文件内容

发布时间 2023-05-29 16:18:19 作者: 天心PHP

这里以提取亚马逊日期范围报告 PDF 汇总中的数据为例。

根据路径下载PDF

 /**
     * Download a file and store it under $save_dir.
     *
     * @param string $url      Location of the file to fetch.
     * @param string $save_dir Target directory; defaults to './' and is created recursively when missing.
     * @param string $filename Name of the file to write inside $save_dir.
     * @param int    $type     Truthy: fetch with cURL; falsy: fetch with file_get_contents().
     * @return array|false On success an array with the keys status, file_name, save_path and
     *                     file_size (bytes downloaded). False when $url or $filename is empty,
     *                     the directory cannot be created, the download fails, or the file
     *                     cannot be written.
     */
    public function getFile($url, $save_dir = '', $filename = '', $type = 0)
    {
        // Without a file name the target would be the directory itself.
        if (trim($url) === '' || trim($filename) === '') {
            return false;
        }
        if (trim($save_dir) === '') {
            $save_dir = './';
        }
        // Normalise to exactly one trailing slash ('/tmp' -> '/tmp/', './' stays './').
        $save_dir = rtrim($save_dir, '/') . '/';

        // Create the target directory; the second is_dir() covers a concurrent creation.
        if (!is_dir($save_dir) && !mkdir($save_dir, 0777, true) && !is_dir($save_dir)) {
            return false;
        }

        // Fetch the content with the requested transport.
        if ($type) {
            $ch = curl_init();
            if ($ch === false) {
                return false;
            }
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
            // Treat HTTP >= 400 as a failure instead of saving the error page as the file.
            curl_setopt($ch, CURLOPT_FAILONERROR, true);
            $content = curl_exec($ch);
            curl_close($ch);
        } else {
            $content = file_get_contents($url);
        }
        if ($content === false) {
            return false;
        }

        $save_path = $save_dir . $filename;
        // Append mode is kept from the original implementation ('a'); callers that need a
        // clean file must remove an existing one before calling.
        if (file_put_contents($save_path, $content, FILE_APPEND) === false) {
            return false;
        }

        return [
            'status' => 1,
            'file_name' => $filename,
            'save_path' => $save_path,
            'file_size' => strlen($content),
        ];
    }

服务器需要开启 shell_exec，并且已安装 pdftotext 命令行工具。

// Convert the sample PDF to a text file; -layout asks pdftotext to keep the page layout.
$command = "pdftotext -layout GAN-IT_242_510181.pdf  GAN-IT_242_510181.txt";
shell_exec($command);

得到按行解析的txt

接着提取 txt 文件的第 11 行到第 16 行数据

// Base name shared by the downloaded PDF and its text conversion.
$filename = $accountnamelist[$val['accountid']].'_'.$val['accountid'].'_'.$val['planid'];
$pdfPath = $url . $filename . '.pdf';
$txtPath = $url . $filename . '.txt';

// Remove leftovers first: getFile() appends to an existing file, and a stale .txt
// would make a failed conversion look successful.
foreach ([$pdfPath, $txtPath] as $stalePath) {
    if (file_exists($stalePath)) {
        @unlink($stalePath);
    }
}

// Download the PDF.
$download = $modelre->getFile($val['url'], $url, $filename . '.pdf');
if ($download === false) {
    return false;
}

// escapeshellarg() keeps spaces or shell metacharacters in the path (the account name
// is part of it) from breaking or altering the command.
shell_exec('pdftotext -layout ' . escapeshellarg($pdfPath) . ' ' . escapeshellarg($txtPath));
if (!file_exists($txtPath)) {
    return false;
}

$sum = 0;
$dlist = [];
$content = $modelre->readTXT($txtPath);
foreach ($content as $keyp => $valp) {
    $line = trim($valp);
    // Only keys 10-15 are read, i.e. lines 11-16 if readTXT() returns a zero-based
    // list of lines (TODO confirm against readTXT()).
    if ($keyp < 10 || $keyp > 15 || $line === '') {
        continue;
    }
    // Columns are separated by runs of two or more whitespace characters. Splitting on
    // the whitespace directly keeps values that contain "_" in one piece.
    $reslist = preg_split('/\s{2,}/u', $line);
    if ($reslist === false) {
        continue;
    }
    // The third column is summed into total_price; thousands separators are stripped first.
    $sub = isset($reslist[2]) ? str_replace(',', '', $reslist[2]) : '0';
    $sum += (float)$sub;
    $dlist[] = $reslist;
}

$now = date('Y-m-d H:i:s');
$data = [
    'account_id' => $val['accountid'],
    'account_name' => $accountnamelist[$val['accountid']],
    'description' => json_encode($dlist),
    'total_price' => $sum,
    'url' => '/upload/pdflabel/' . $filename . '.pdf',
    'batchnumber' => $val['batchnumber'],
    'create_time' => $now,
];

// Drop the old rows only now that the new report has been downloaded and parsed,
// so a failed run does not leave the batch without data.
$modelpdf->deleteAll('account_id=:account_id and batchnumber=:bn', [':account_id' => $val['accountid'], ':bn' => $val['batchnumber']]);
$modelpdf->batchReplaceAll("{{amazon_report_zn_pdf}}", array_keys($data), [$data]);
// Bound parameter instead of interpolating the id, matching the deleteAll() call above.
$znmodel->updateAll(['is_down' => 1, 'update_at' => $now], 'id=:id', [':id' => $val['id']]);
@unlink($txtPath);
return true;