package com.gtis.archive.service.impl;

import com.gtis.archive.service.OcrService;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
import org.apache.commons.io.IOUtils;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;
import org.springframework.util.ResourceUtils;

import java.io.*;

/**
 * Default {@link OcrService} implementation that extracts plain text from
 * files on the local file system (PDF, plain text, Word and Excel).
 *
 * <p>The read methods log I/O failures instead of propagating them; see each
 * method for the value returned in that case.
 *
 * Created by Think on 2017/8/1.
 */
@Service
public class OcrServiceImpl implements OcrService {
    private final Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * Resolves the given location to a {@link File}.
     *
     * @param location resource location, resolved through
     *                 {@link ResourceUtils#getFile(String)}
     * @return the resolved file, or {@code null} if the location could not be
     *         resolved (the failure is logged)
     */
    @Override
    public File getHtmlFile(String location) {
        try {
            return ResourceUtils.getFile(location);
        } catch (FileNotFoundException e) {
            // Trailing throwable keeps the stack trace in the log.
            logger.error("there is no html file exist at {}", location, e);
        }
        return null;
    }

    /**
     * Extracts the text of every page of a PDF file.
     *
     * @param pdfPath path of the PDF file
     * @return the concatenated text of all pages that were parsed; if an
     *         {@link IOException} occurs, whatever was extracted before the
     *         failure (possibly an empty string)
     */
    @Override
    public String readPdf(String pdfPath) {
        PdfReader reader = null;
        StringBuilder buffer = new StringBuilder();

        try {
            reader = new PdfReader(pdfPath);
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            int num = reader.getNumberOfPages();
            logger.info("pdf的页数为{}", num);
            // PDF page numbers are 1-based.
            for (int i = 1; i <= num; i++) {
                TextExtractionStrategy strategy = parser.processContent(i,
                        new SimpleTextExtractionStrategy());
                buffer.append(strategy.getResultantText());
            }
        } catch (IOException e) {
            logger.error("解析pdf错误, {}", e.getMessage(), e);
        } finally {
            // Release the underlying file handle; the reader was previously leaked.
            if (reader != null) {
                reader.close();
            }
        }
        return buffer.toString();
    }

    /**
     * Reads a text file as UTF-8.
     *
     * @param txtPath path of the text file
     * @return the file content with every line terminated by {@code "\n"};
     *         if an {@link IOException} occurs, whatever was read before the
     *         failure (possibly an empty string)
     */
    @Override
    public String readTxt(String txtPath) {
        BufferedReader br = null;
        StringBuilder sb = new StringBuilder();
        try {
            String s;
            br = new BufferedReader(new InputStreamReader(new FileInputStream(txtPath), "UTF-8"));
            while ((s = br.readLine()) != null) {
                sb.append(s).append("\n");
            }
        } catch (IOException e) {
            logger.error("读取txt文件错误, {}", e.getMessage(), e);
        } finally {
            IOUtils.closeQuietly(br);
        }
        return sb.toString();
    }

    /**
     * Extracts the text of a Word document.
     *
     * @param wordPath path of the Word file
     * @param type     file format: {@code "doc"} or {@code "docx"}
     * @return the document text, or {@code null} if {@code type} is neither
     *         {@code "doc"} nor {@code "docx"} or the file could not be read
     *         (the failure is logged)
     */
    @Override
    public String readWord(String wordPath, String type) {
        BufferedInputStream bis = null;
        String content = null;
        try {
            bis = new BufferedInputStream(new FileInputStream(wordPath));
            if ("doc".equals(type)) {
                HWPFDocument document = new HWPFDocument(bis);
                WordExtractor extractor = new WordExtractor(document);
                content = extractor.getText();
            } else if ("docx".equals(type)) {
                XWPFDocument xdoc = new XWPFDocument(bis);
                XWPFWordExtractor extractor = new XWPFWordExtractor(xdoc);
                content = extractor.getText();
            }
        } catch (FileNotFoundException e) {
            logger.error("word文件不存在, {}", e.getMessage(), e);
        } catch (IOException e) {
            logger.error("读取文件错误, {}", e.getMessage(), e);
        } finally {
            IOUtils.closeQuietly(bis);
        }
        return content;
    }

    /**
     * Extracts the cell values of an Excel workbook as text. Cells of a row
     * are separated by a single space and every row ends with {@code "\n"}.
     *
     * @param excelPath path of the Excel file
     * @param type      file format: {@code "xls"} or {@code "xlsx"}
     * @return the extracted text; if an {@link IOException} occurs, whatever
     *         was extracted before the failure (possibly an empty string)
     * @throws IllegalArgumentException if {@code type} is neither
     *                                  {@code "xls"} nor {@code "xlsx"}
     */
    @Override
    public String readExcel(String excelPath, String type) {
        if (!"xlsx".equals(type) && !"xls".equals(type)) {
            throw new IllegalArgumentException("不支持的文件类型");
        }

        BufferedInputStream bis = null;
        StringBuilder content = new StringBuilder();

        try {
            bis = new BufferedInputStream(new FileInputStream(excelPath));
            Workbook workbook = "xls".equals(type) ? new HSSFWorkbook(bis) : new XSSFWorkbook(bis);
            for (int numSheet = 0; numSheet < workbook.getNumberOfSheets(); numSheet++) {
                Sheet sheet = workbook.getSheetAt(numSheet);
                if (sheet == null) {
                    continue;
                }

                // NOTE(review): iteration starts at row index 1, so row 0 is never
                // read -- presumably it is treated as a header row; confirm.
                for (int rowNum = 1; rowNum <= sheet.getLastRowNum(); rowNum++) {
                    Row row = sheet.getRow(rowNum);
                    if (row == null) {
                        // getRow returns null for rows that are not defined in a
                        // sparse sheet; skip them instead of failing with an NPE.
                        continue;
                    }
                    int minCellIdx = row.getFirstCellNum();
                    int maxCellIdx = row.getLastCellNum();
                    for (int cellIdx = minCellIdx; cellIdx < maxCellIdx; cellIdx++) {
                        Cell cell = row.getCell(cellIdx);
                        if (cell == null) {
                            continue;
                        }
                        content.append(getStringValue(cell)).append(" ");
                    }
                    content.append("\n");
                }
            }
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
        } finally {
            IOUtils.closeQuietly(bis);
        }
        return content.toString();
    }

    /**
     * Converts a cell to its textual representation.
     *
     * @param cell the cell to convert; must not be {@code null}
     * @return {@code "TRUE"}/{@code "FALSE"} for boolean cells, the formula
     *         text for formula cells, the numeric value rendered as a double
     *         for numeric cells, the string value for string cells,
     *         {@code "error"} for error cells and an empty string otherwise
     */
    private String getStringValue(Cell cell) {
        switch (cell.getCellType()) {
            case Cell.CELL_TYPE_BOOLEAN:
                return cell.getBooleanCellValue() ? "TRUE" : "FALSE";
            case Cell.CELL_TYPE_FORMULA:
                return cell.getCellFormula();
            case Cell.CELL_TYPE_NUMERIC:
                return cell.getNumericCellValue() + "";
            case Cell.CELL_TYPE_STRING:
                return cell.getStringCellValue();
            case Cell.CELL_TYPE_ERROR:
                return "error";
            default:
                return "";
        }
    }
}
