文件解析API

文件解析API主要有两个接口:上传解析接口和导出接口

接下来通过一个简单的示例来演示如何接入文件解析API。

/**
 * Demonstrates the full round trip: upload a file for parsing, translate the returned
 * segments, export the translated document and save it to a local path.
 *
 * @param args unused
 * @throws IOException if an HTTP call, JSON conversion or the local file write fails
 * @throws URISyntaxException if the export request URL cannot be parsed
 */
public static void main(String[] args) throws IOException, URISyntaxException {
    File file = new File(""); // file to upload; set this yourself
    String userName = ""; // user name
    String clientId = ""; // API key obtained from the user center
    String from = ""; // see the language table
    String to = ""; // see the language table
    String de = ""; // NOTE(review): the meaning of "de" is not documented in this guide
    String downloadFilePath = ""; // path the exported file is written to

    // Upload the document and have it parsed.
    Map<String, Object> parseResult = parseFile(file.getAbsolutePath(), userName, clientId, from, to, de);
    if (!"0".equals(String.valueOf(parseResult.get("errCode")))) {
        throw new RuntimeException((String) parseResult.get("errMsg"));
    }
    String docId = (String) parseResult.get("docId");

    // The segments arrive as generic maps; round-trip through JSON to get typed objects.
    String jsonSegments = JacksonUtil.toJSon(parseResult.get("segments"));
    List<SimpleSegment> segments = JacksonUtil.readValue(jsonSegments, new TypeReference<List<SimpleSegment>>(){});

    // TODO: call a machine-translation engine or a translation memory here
    segments = new SegmentTranslator(segments, (sentence) -> "测试文本").trans();

    // Export the parsed document. try-with-resources closes the response stream even when
    // writing the file fails; a null resource is permitted and simply not closed.
    try (InputStream inputStream = exportFile(userName, clientId, docId, segments)) {
        if (inputStream == null) {
            throw new RuntimeException("文件下载失败!");
        }
        FileUtil.inputStreamToFile(inputStream, downloadFilePath);
    }
}
  • 验证上传接口

上传文件并进行解析,需要传入文件路径、用户名、用户 API key、源语言、目标语言等参数。

/**
 * Uploads a file to the parse endpoint and returns the JSON response as a map.
 *
 * @param filePath path of the file to upload
 * @param userName sent as form field {@code user_name}
 * @param clientId sent as form field {@code client_id}
 * @param from     sent as form field {@code from}
 * @param to       sent as form field {@code to}
 * @param de       sent as form field {@code de}
 * @return the response body deserialized into a {@code Map<String, Object>}
 * @throws IOException if the upload or the JSON parsing fails
 */
public static Map<String, Object> parseFile(String filePath, String userName, String clientId, String from,
        String to, String de) throws IOException {
    // Connect timeout handed to the upload helper (evaluates to 50000).
    final int connectTimeout = 5 * 10000;

    Map<String, String> formFields = new HashMap<>();
    formFields.put("user_name", userName);
    formFields.put("client_id", clientId);
    formFields.put("from", from);
    formFields.put("to", to);
    formFields.put("de", de);

    String responseBody = HttpClientUtil.uploadFileByPost(
            FileApiUrlConstant.PARSE_FILE, filePath, "file", formFields, connectTimeout);

    TypeReference<Map<String, Object>> responseType = new TypeReference<Map<String, Object>>(){};
    return JacksonUtil.readValue(responseBody, responseType);
}
  • 工具类

定义一个工具类HttpClientUtil

//默认socket超时时间
private static final int DEFAULT_SOCKET_TIMEOUT = 5000;
//默认连接超时时间
private static final int DEFAULT_CONNECT_TIMEOUT = 5000;
//默认请求超时时间
private static final int DEFAULT_CONNECTION_REQUEST_TIMEOUT = 5000;
private static RequestConfig defaultRequestConfig;
static {
defaultRequestConfig = RequestConfig.custom()
.setSocketTimeout(DEFAULT_SOCKET_TIMEOUT)
.setConnectTimeout(DEFAULT_CONNECT_TIMEOUT)
.setConnectionRequestTimeout(DEFAULT_CONNECTION_REQUEST_TIMEOUT)
.build();
}
/**
 * Executes a GET request and returns the response body as a string.
 *
 * @param url    target URL
 * @param params request parameters handed to {@code createRequest}
 * @return the response body
 * @throws IOException if the request fails or the body cannot be read
 * @throws URISyntaxException declared by {@code createRequest}
 */
public static String doGetStr(String url, Map<String, String> params) throws IOException, URISyntaxException {
    // The body is read completely inside the try block, so the client (and its
    // connections) can be released as soon as the string has been produced.
    try (CloseableHttpClient httpClient = HttpClientBuilder.create().build()) {
        HttpUriRequest request = createRequest(params, url, HttpGet.METHOD_NAME);

        return getResponseContent(httpClient.execute(request));
    }
}

/**
 * Uploads a file as a multipart/form-data POST and returns the response body as a string.
 *
 * @param url           target URL
 * @param filePath      path of the file to upload
 * @param fileParamKey  name of the multipart part that carries the file
 * @param params        additional text parts, sent as name/value pairs
 * @param connectTimout connect timeout for this request; the socket and connection-request
 *                      timeouts keep their class defaults
 * @return the response body
 * @throws IOException if the file cannot be opened, the request fails or the body cannot be read
 */
public static String uploadFileByPost(String url, String filePath, String fileParamKey, Map<String, String> params,
        int connectTimout) throws IOException {
    File file = new File(filePath);
    // Both resources are closed on every exit path; previously the file stream stayed open
    // whenever an exception occurred before the request body had been written.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         FileInputStream fileStream = new FileInputStream(file)) {
        MultipartEntityBuilder builder = MultipartEntityBuilder.create()
                .setCharset(StandardCharsets.UTF_8)
                .setMode(HttpMultipartMode.BROWSER_COMPATIBLE)
                .addBinaryBody(fileParamKey, fileStream,
                        ContentType.MULTIPART_FORM_DATA, file.getName());

        // addTextBody(name, text) encodes the value as ISO-8859-1, which turns characters outside
        // Latin-1 into '?'. Encode the text parts as UTF-8 to match the builder charset.
        ContentType utf8Text = ContentType.create("text/plain", StandardCharsets.UTF_8);
        params.forEach((name, value) -> builder.addTextBody(name, value, utf8Text));

        HttpPost post = new HttpPost(url);
        post.setEntity(builder.build());
        RequestConfig requestConfig = RequestConfig.custom()
                .setSocketTimeout(DEFAULT_SOCKET_TIMEOUT)
                .setConnectTimeout(connectTimout)
                .setConnectionRequestTimeout(DEFAULT_CONNECTION_REQUEST_TIMEOUT)
                .build();
        post.setConfig(requestConfig);

        return getResponseContent(httpClient.execute(post));
    }
}

/**
 * Builds a request for the given HTTP method, configured with {@code defaultRequestConfig}.
 *
 * <p>For POST the parameters are sent as a UTF-8 URL-encoded form body. For GET (and any
 * other method name) they are appended to the URL as query parameters; previously they were
 * attached as a request body, which a GET endpoint normally does not read.
 *
 * @param paramsMap request parameters
 * @param url       target URL
 * @param method    HTTP method name, e.g. {@code HttpPost.METHOD_NAME}
 * @return the configured request
 * @throws URISyntaxException if {@code url} is not a valid URI (POST branch)
 */
private static HttpUriRequest createRequest(Map<String, String> paramsMap, String url, String method) throws URISyntaxException {
    List<NameValuePair> params = paramsMap.entrySet()
            .stream()
            .map(e -> new BasicNameValuePair(e.getKey(), e.getValue()))
            .collect(Collectors.toList());
    switch (method) {
        case HttpPost.METHOD_NAME:
            HttpPost post = new HttpPost();
            post.setConfig(defaultRequestConfig);
            post.setURI(new URI(url));
            post.setEntity(new UrlEncodedFormEntity(params, StandardCharsets.UTF_8));

            return post;
        case HttpGet.METHOD_NAME:
        default:
            // RequestBuilder appends parameters to the query string for GET requests,
            // encoded with the charset set on the builder.
            return RequestBuilder.get()
                    .setConfig(defaultRequestConfig)
                    .setCharset(StandardCharsets.UTF_8)
                    .setUri(url)
                    .addParameters(params.toArray(new NameValuePair[0]))
                    .build();
    }
}

/**
 * Reads the body of a response into a string.
 *
 * @param httpResponse the response to read
 * @return the body text, with UTF-8 passed to {@code EntityUtils.toString} as the charset argument
 * @throws IOException if the body cannot be read
 * @throws RuntimeException if the response carries no entity
 */
private static String getResponseContent(HttpResponse httpResponse) throws IOException {
    HttpEntity body = httpResponse.getEntity();
    if (body == null) {
        throw new RuntimeException("请求的内容为空,请检查");
    }

    // Convert the response content to a string.
    return EntityUtils.toString(body, StandardCharsets.UTF_8);
}

/**
 * Executes a form POST and returns the response body as a string.
 *
 * @param url    target URL
 * @param params form parameters
 * @return the response body
 * @throws IOException if the request fails or the body cannot be read
 * @throws URISyntaxException if {@code url} is not a valid URI
 */
public static String doPostStr(String url, Map<String, String> params) throws IOException, URISyntaxException {
    // Use a client scoped to this call instead of going through doPost: the body is fully
    // read here, so the client can be closed instead of being left open.
    try (CloseableHttpClient client = HttpClientBuilder.create().build()) {
        HttpUriRequest request = createRequest(params, url, HttpPost.METHOD_NAME);

        return getResponseContent(client.execute(request));
    }
}

/**
 * Executes a form POST and returns the raw response body stream.
 *
 * <p>The caller is responsible for closing the returned stream.
 *
 * @param url    target URL
 * @param params form parameters
 * @return the response content stream, or {@code null} when the response has no entity
 * @throws IOException if the request fails or the stream cannot be obtained
 * @throws URISyntaxException if {@code url} is not a valid URI
 */
public static InputStream doPostInputStream(String url, Map<String, String> params) throws IOException, URISyntaxException {
    HttpEntity entity = doPost(url, params).getEntity();
    // A response without a body has a null entity; report that as null rather than
    // failing with a NullPointerException.
    return entity == null ? null : entity.getContent();
}

/**
 * Executes a form POST and returns the response without reading it.
 *
 * @param url    target URL
 * @param params form parameters
 * @return the response as returned by the client
 * @throws IOException if the request fails
 * @throws URISyntaxException if {@code url} is not a valid URI
 */
public static HttpResponse doPost(String url, Map<String, String> params) throws IOException, URISyntaxException {
    HttpUriRequest postRequest = createRequest(params, url, HttpPost.METHOD_NAME);

    // NOTE(review): the client built here is never closed; the response is handed back
    // unread, so its lifetime is left to the caller.
    // Execute the request and hand back the response.
    return HttpClientBuilder.create().build().execute(postRequest);
}

定义一个json转换的工具类JacksonUtil

/**
 * Thin static wrapper around a single Jackson {@code ObjectMapper} instance.
 */
public final class JacksonUtil {

    // One mapper instance reused by both helpers.
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Deserializes a JSON string into the type described by {@code valueTypeRef}.
     *
     * @param jsonStr      JSON text
     * @param valueTypeRef target type, including generic parameters
     * @return the deserialized value
     * @throws IOException if the mapper fails to read the text
     */
    public static <T> T readValue(String jsonStr, TypeReference<T> valueTypeRef) throws IOException {
        T value = MAPPER.readValue(jsonStr, valueTypeRef);
        return value;
    }

    /**
     * Serializes an object to its JSON string form.
     *
     * @param object value to serialize
     * @return the JSON text
     * @throws JsonProcessingException if serialization fails
     */
    public static String toJSon(Object object) throws JsonProcessingException {
        String json = MAPPER.writeValueAsString(object);
        return json;
    }
}

定义一个文件转换的工具类FileUtil

/**
 * Helpers for writing stream content to files.
 */
public class FileUtil {

    /**
     * Copies everything readable from {@code inputStream} into the file at {@code filePath}.
     *
     * @param inputStream source of the bytes; not closed by this method
     * @param filePath    path of the destination file
     * @throws IOException if the file cannot be opened or a read/write fails
     */
    public static void inputStreamToFile(InputStream inputStream, String filePath) throws IOException {
        inputStreamToFile(inputStream, new File(filePath));
    }

    /**
     * Copies everything readable from {@code inputStream} into {@code file}, creating the
     * file if it does not exist and replacing its content if it does.
     *
     * @param inputStream source of the bytes; not closed by this method
     * @param file        destination file
     * @throws IOException if the file cannot be opened or a read/write fails
     */
    public static void inputStreamToFile(InputStream inputStream, File file) throws IOException {
        // FileOutputStream creates a missing file itself, so no separate createNewFile call
        // is needed. try-with-resources guarantees the file handle is released, which the
        // previous version never did.
        try (FileOutputStream out = new FileOutputStream(file)) {
            byte[] buffer = new byte[4096];
            int bytesRead;
            while ((bytesRead = inputStream.read(buffer)) != -1) {
                out.write(buffer, 0, bytesRead);
            }
        }
    }

}
  • 验证导出接口

对解析后的文件进行导出

/**
 * Posts the translated segments to the export endpoint and returns the response stream.
 *
 * @param userName sent as form field {@code user_name}
 * @param clientId sent as form field {@code client_id}
 * @param docId    sent as form field {@code doc_id}
 * @param segments serialized to JSON and sent as form field {@code segments}
 * @return the response body stream from the export endpoint
 * @throws IOException if serialization or the request fails
 * @throws URISyntaxException if the export URL is not a valid URI
 */
public static InputStream exportFile(String userName, String clientId, String docId,
        List<SimpleSegment> segments) throws IOException, URISyntaxException {
    String segmentsJson = JacksonUtil.toJSon(segments);

    Map<String, String> formFields = new HashMap<>();
    formFields.put("user_name", userName);
    formFields.put("client_id", clientId);
    formFields.put("doc_id", docId);
    formFields.put("segments", segmentsJson);

    InputStream exported = HttpClientUtil.doPostInputStream(FileApiUrlConstant.EXPORT_FILE, formFields);
    return exported;
}
  • 常量类

定义API接口的url常量类

/**
 * URLs of the file-parsing API endpoints.
 */
public class FileApiUrlConstant {

    // Hosts the requests can be sent to; the first entry is the one used.
    private static final String[] HOSTS = {"http://api.tmxmall.com"};

    private static final String SERVER_NAME = HOSTS[0];

    /** Endpoint for uploading and parsing a file. */
    public static final String PARSE_FILE = SERVER_NAME + "/v1/http/parseFile";

    /** Endpoint for exporting a parsed file. */
    public static final String EXPORT_FILE = SERVER_NAME + "/v1/http/exportFile";
}
  • 实体类

需要用到的两个实体类Atom和SimpleSegment

public class Atom implements Serializable {
private static final long serialVersionUID = 1L;

private String atomId;
private String data;
private String textStyle;

private boolean isHidden = false;
private String tagType;
private Integer tagId;

public Atom(){

}

public String getAtomId() {
return atomId;
}

public void setAtomId(String atomId) {
this.atomId = atomId;
}

public String getData() {
return data;
}

public void setData(String data) {
this.data = data;
}

public String getTextStyle() {
return textStyle;
}

public void setTextStyle(String textStyle) {
this.textStyle = textStyle;
}

public boolean isHidden() {
return isHidden;
}

public void setHidden(boolean hidden) {
isHidden = hidden;
}

public String getTagType() {
return tagType;
}

public void setTagType(String tagType) {
this.tagType = tagType;
}

public Integer getTagId() {
return tagId;
}

public void setTagId(Integer tagId) {
this.tagId = tagId;
}

@Override
public String toString() {
return "Atom{" +
"atomId='" + atomId + '\'' +
", data='" + data + '\'' +
", textStyle='" + textStyle + '\'' +
", isHidden=" + isHidden +
", tagType='" + tagType + '\'' +
", tagId=" + tagId +
'}';
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Atom atom = (Atom) o;
return isHidden == atom.isHidden &&
Objects.equals(atomId, atom.atomId) &&
Objects.equals(data, atom.data) &&
Objects.equals(textStyle, atom.textStyle) &&
Objects.equals(tagType, atom.tagType) &&
Objects.equals(tagId, atom.tagId);
}

@Override
public int hashCode() {
return Objects.hash(atomId, data, textStyle, isHidden, tagType, tagId);
}
}

以及

/**
 * A segment of a parsed document: its ids plus the source and target atom lists.
 * Plain mutable bean with value-based {@code equals}/{@code hashCode}.
 */
public class SimpleSegment implements Serializable {
    private static final long serialVersionUID = 1L;

    private String _id;

    private String documentId;

    private String translationUnitId;

    private List<Atom> srcSegmentAtoms;

    private List<Atom> tgtSegmentAtoms;


    public String get_id() {
        return this._id;
    }

    public void set_id(String _id) {
        this._id = _id;
    }

    public String getDocumentId() {
        return this.documentId;
    }

    public void setDocumentId(String documentId) {
        this.documentId = documentId;
    }

    public String getTranslationUnitId() {
        return this.translationUnitId;
    }

    public void setTranslationUnitId(String translationUnitId) {
        this.translationUnitId = translationUnitId;
    }

    public List<Atom> getSrcSegmentAtoms() {
        return this.srcSegmentAtoms;
    }

    public void setSrcSegmentAtoms(List<Atom> srcSegmentAtoms) {
        this.srcSegmentAtoms = srcSegmentAtoms;
    }

    public List<Atom> getTgtSegmentAtoms() {
        return this.tgtSegmentAtoms;
    }

    public void setTgtSegmentAtoms(List<Atom> tgtSegmentAtoms) {
        this.tgtSegmentAtoms = tgtSegmentAtoms;
    }

    /** Renders every field in declaration order. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("SimpleSegment{");
        text.append("_id='").append(_id).append('\'');
        text.append(", documentId='").append(documentId).append('\'');
        text.append(", translationUnitId='").append(translationUnitId).append('\'');
        text.append(", srcSegmentAtoms=").append(srcSegmentAtoms);
        text.append(", tgtSegmentAtoms=").append(tgtSegmentAtoms);
        text.append('}');
        return text.toString();
    }

    /** Two segments are equal when they have the same class and all five fields are equal. */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        SimpleSegment other = (SimpleSegment) o;
        if (!Objects.equals(_id, other._id)) {
            return false;
        }
        if (!Objects.equals(documentId, other.documentId)) {
            return false;
        }
        if (!Objects.equals(translationUnitId, other.translationUnitId)) {
            return false;
        }
        if (!Objects.equals(srcSegmentAtoms, other.srcSegmentAtoms)) {
            return false;
        }
        return Objects.equals(tgtSegmentAtoms, other.tgtSegmentAtoms);
    }

    /** Hash over the same five fields that {@code equals} compares. */
    @Override
    public int hashCode() {
        return Objects.hash(_id, documentId, translationUnitId, srcSegmentAtoms, tgtSegmentAtoms);
    }
}
  • 翻译工具类
/**
 * Fills the target atoms of parsed segments by sending their regular-text atoms to a
 * {@link Translator}. How the text of a segment is grouped before being sent depends on the
 * total text length and on how many very short text atoms the segment contains.
 */
public class SegmentTranslator {       // TODO: special handling for non-translatable elements

    private static final String REGULAR_STYLE = "regular";

    // A regular-text atom of at most this many characters counts as a "short" atom.
    private static final int SHORT_ATOM_MAX_LENGTH = 3;

    // Accumulated text is only sent to the translator once it is longer than this.
    private static final int MIN_SEND_LENGTH = 5;

    private final List<SimpleSegment> toBeTranslated;

    private final Translator translator;

    /**
     * @param toBeTranslated segments whose target atoms should be filled in
     * @param translator     translation callback applied to the extracted text
     */
    public SegmentTranslator(List<SimpleSegment> toBeTranslated, Translator translator) {
        this.toBeTranslated = toBeTranslated;
        this.translator = translator;
    }

    /**
     * Translates every segment in place by setting its target atoms.
     *
     * @return the same list that was passed to the constructor (possibly null or empty)
     */
    public List<SimpleSegment> trans(){
        if(toBeTranslated == null || toBeTranslated.isEmpty()){
            return toBeTranslated;
        }

        for(SimpleSegment segment : toBeTranslated){
            List<Atom> sourceAtoms = segment.getSrcSegmentAtoms();
            if(sourceAtoms == null){
                // No source atoms: leave the segment untouched instead of failing the
                // whole batch with a NullPointerException.
                continue;
            }
            segment.setTgtSegmentAtoms(analysisAtom(sourceAtoms));
        }

        return toBeTranslated;
    }

    /**
     * Chooses a translation strategy from the total regular-text length and the number of
     * short regular-text atoms, and returns the resulting target atoms. In two cases the
     * source list itself is returned unchanged.
     */
    private  List<Atom> analysisAtom(List<Atom> list){
        int length = 0;
        int count = 0;
        for(Atom atom : list){
            if(StringUtils.equals(REGULAR_STYLE, atom.getTextStyle())){
                int dataLength = atom.getData().length();
                length += dataLength;
                if(dataLength <= SHORT_ATOM_MAX_LENGTH){
                    count++;
                }
            }
        }

        if(length > 100){
            if(count > 6){
                return extractText(list);
            }
            if(count >= 3 && count <= 5){
                return appendText(list);
            }
            if(count < 3){
                return segmentedText(list);
            }
            // NOTE(review): count == 6 matches none of the branches above and ends up with
            // the default strategy; possibly an unintended gap in the thresholds.
            return specialAppendText(list);
        }

        if(length > 20){
            if(count > 15){
                return list;
            }
            if(count < 2){
                return segmentedText(list);
            }
            return specialAppendText(list);
        }

        if(count < 2){
            return segmentedText(list);
        }
        if(count > 5){
            // Short text with too many short atoms: give up translating and fill in the source.
            return list;
        }
        return specialAppendText(list);
    }

    /**
     * Concatenates all regular-text atoms, translates the result once, and returns the
     * translation as a single atom (copied from the first regular-text atom) placed at
     * index 0, followed by copies of all non-text atoms in their original order.
     */
    private  List<Atom> extractText(List<Atom> list) {
        StringBuilder text = new StringBuilder();
        List<Atom> tgtSegmentAtoms = new ArrayList<Atom>();
        Atom firstAtom = null;
        for(Atom atom : list){
            if(StringUtils.equals(atom.getTextStyle(), REGULAR_STYLE)){
                text.append(atom.getData());
                if(firstAtom == null){
                    firstAtom = copyOf(atom, atom.getData());
                }
                continue;
            }
            tgtSegmentAtoms.add(copyOf(atom, atom.getData()));
        }
        if(firstAtom == null){
            // No regular-text atom found: a blank atom carries the (empty) text.
            firstAtom = new Atom();
        }
        String res = text.toString();
        if(hasTranslatableText(res)){
            res = translator.trans(res);
        }
        firstAtom.setData(res);
        tgtSegmentAtoms.add(0, firstAtom);
        return tgtSegmentAtoms;
    }

    /**
     * Append strategy: text is accumulated across regular-text atoms and only sent once it is
     * longer than {@code MIN_SEND_LENGTH}; any non-empty remainder is translated at the end.
     */
    private  List<Atom> appendText(List<Atom> list) {
        return mergeShortRuns(list, false);
    }

    /**
     * Sends each regular-text atom on its own, exactly as segmented in the source. Empty or
     * whitespace-only text and non-text atoms are copied unchanged.
     */
    private  List<Atom> segmentedText(List<Atom> list) {
        List<Atom> tgtSegmentAtoms = new ArrayList<Atom>();
        for(Atom atom : list){
            String data = atom.getData();
            boolean translate = StringUtils.equals(atom.getTextStyle(), REGULAR_STYLE)
                    && hasTranslatableText(data);
            tgtSegmentAtoms.add(copyOf(atom, translate ? translator.trans(data) : data));
        }
        return tgtSegmentAtoms;
    }

    /**
     * Append strategy for small text fragments: same as {@link #appendText(List)} except that
     * a whitespace-only remainder is not sent to the translator.
     */
    private  List<Atom> specialAppendText(List<Atom> list) {
        return mergeShortRuns(list, true);
    }

    /**
     * Shared implementation of the two append strategies.
     *
     * <p>Regular-text atoms are appended to a buffer. When the buffer grows beyond
     * {@code MIN_SEND_LENGTH} it is translated, the result is stored in the current atom's
     * copy and the buffer is reset; otherwise the copy gets empty data. Non-text atoms are
     * copied unchanged. Text still buffered after the loop is translated into the copy of
     * the last regular-text atom.
     *
     * @param skipBlankRemainder when true, a whitespace-only remainder is dropped instead of
     *                           being sent to the translator
     */
    private List<Atom> mergeShortRuns(List<Atom> list, boolean skipBlankRemainder) {
        StringBuilder pending = new StringBuilder();
        // Index of the most recent regular-text atom; after the loop, the last one.
        int lastTextIndex = 0;
        List<Atom> tgtSegmentAtoms = new ArrayList<Atom>();
        for(int i = 0; i < list.size(); i++){
            Atom atom = list.get(i);
            if(!StringUtils.equals(atom.getTextStyle(), REGULAR_STYLE)){
                // Non-text atoms keep their own content.
                tgtSegmentAtoms.add(copyOf(atom, atom.getData()));
                continue;
            }
            lastTextIndex = i;
            pending.append(atom.getData());
            // Stays empty while the buffer is still too short to send.
            String data = "";
            if(pending.length() > MIN_SEND_LENGTH){
                data = translator.trans(pending.toString());
                pending = new StringBuilder();
            }
            tgtSegmentAtoms.add(copyOf(atom, data));
        }

        // Leftover text replaces the content of the last regular-text atom.
        String remainder = pending.toString();
        boolean sendRemainder = skipBlankRemainder
                ? hasTranslatableText(remainder)
                : StringUtils.isNotEmpty(remainder);
        if(sendRemainder){
            tgtSegmentAtoms.get(lastTextIndex).setData(translator.trans(remainder));
        }
        return tgtSegmentAtoms;
    }

    /** Returns true when the text is non-empty and not made up solely of whitespace. */
    private static boolean hasTranslatableText(String text) {
        return StringUtils.isNotEmpty(text) && !text.matches("\\s+");
    }

    /** Creates a new atom with the given data and every other field copied from the source. */
    private static Atom copyOf(Atom source, String data) {
        Atom copy = new Atom();
        copy.setData(data);
        copy.setTagId(source.getTagId());
        copy.setTagType(source.getTagType());
        copy.setTextStyle(source.getTextStyle());
        copy.setHidden(source.isHidden());
        copy.setAtomId(source.getAtomId());
        return copy;
    }
}
  • 翻译接口
/**
 * Translation callback: turns one piece of source text into its translation.
 * Has a single abstract method, so it can be supplied as a lambda or method reference.
 */
@FunctionalInterface
public interface Translator {
    /**
     * Translates the given text.
     *
     * @param sentence text to translate
     * @return the translated text
     */
    String trans(String sentence);
}

results matching ""

    No results matching ""