html truncator in java

2022-12-23 15:28 问答作者：

Is there any utility (or sample source code) that truncates HTML (for preview) in Java? I want to do the truncation on the server and not on the client.

I'm using HTMLUnit to parse HTML.

UPDATE:

I want to be able to preview the HTML, so the truncator would maintain the HTML structure while stripping out the elements after the desired output length.

I've written another java version of truncateHTML. This function truncates a string up to a number of characters while preserving whole words and HTML tags.

/**
 * Truncates an HTML fragment to roughly {@code length} visible characters while
 * keeping the markup well-formed: tags are copied through uncounted, whole words
 * are preserved where possible, and every element still open at the cut point is
 * closed again.
 *
 * <p>The length of {@code suffix} is counted against {@code length}. A word that
 * straddles the cut point is kept whole if that costs at most 6 extra characters;
 * otherwise it is chopped. Character entities count as a single character and are
 * never split.
 *
 * @param text   the HTML to truncate; {@code null} yields an empty string
 * @param length the maximum number of visible characters, suffix included
 * @param suffix text appended when something was cut; {@code null} means "..."
 * @return {@code text} unchanged when its visible text already fits, otherwise the
 *         truncated markup followed by the closing tags and the suffix
 */
public static String truncateHTML(String text, int length, String suffix) {
    if (text == null) {
        return "";
    }
    // if the plain text is shorter than the maximum length, return the whole text
    // (DOTALL so that tags spanning several lines are stripped as well)
    if (text.replaceAll("(?s)<.*?>", "").length() <= length) {
        return text;
    }
    if (suffix == null) {
        suffix = "...";
    }

    /*
     * This pattern creates tokens, where each token starts with an optional tag.
     * For example, "One, <b>Two</b>, Three" produces the following:
     *     One,
     *     <b>Two
     *     </b>, Three
     */
    Pattern tagPattern = Pattern.compile("(<.+?>)?([^<>]*)", Pattern.DOTALL);

    /*
     * Checks for a void ("empty") element, for example img, br, etc.
     * Case-insensitive so <BR> is recognised, and anchored with \b so that
     * <colgroup> or <frameset> are not mistaken for <col> / <frame>.
     */
    Pattern emptyTagPattern = Pattern.compile(
            "^<\\s*(img|br|input|hr|area|base|basefont|col|frame|isindex|link|meta|param"
            + "|embed|source|track|wbr)\\b.*>$",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /*
     * Checks for closing tags (including H1-H6), allowing leading and trailing
     * space inside the brackets.
     */
    Pattern closingTagPattern = Pattern.compile("^<\\s*/\\s*([a-zA-Z]+[1-6]?)\\s*>$");

    /*
     * Checks for opening tags (including H1-H6), allowing leading space inside
     * the brackets and arbitrary attributes.
     */
    Pattern openingTagPattern = Pattern.compile("^<\\s*([a-zA-Z]+[1-6]?).*?>$", Pattern.DOTALL);

    /*
     * Finds character entities: named (&nbsp;), decimal (&#160;) and hexadecimal
     * (&#xA0;). The hexadecimal alternative must carry its "&#x" prefix; without it
     * plain text such as "decade;" would be counted as a single character.
     */
    Pattern entityPattern = Pattern.compile(
            "(&[0-9a-z]{2,8};|&#[0-9]{1,7};|&#x[0-9a-f]{1,6};)", Pattern.CASE_INSENSITIVE);

    StringBuilder result = new StringBuilder(text.length() + suffix.length());
    boolean trimmed = false;

    // the suffix is part of the budget, so start counting from its length
    int totalLength = suffix.length();
    // most recently opened element first, so closing them in list order nests correctly
    List<String> openTags = new ArrayList<String>();

    // set once text was cut exactly at the end of a token; the next token that
    // carries both a tag and text then ends the output
    boolean proposingChop = false;

    Matcher tagMatcher = tagPattern.matcher(text);
    while (tagMatcher.find()) {
        String tagText = tagMatcher.group(1);
        String plainText = tagMatcher.group(2);
        boolean hasTag = tagText != null && tagText.length() > 0;

        if (proposingChop && hasTag && plainText.length() != 0) {
            trimmed = true;
            break;
        }

        // if there is any html-tag in this token, handle it and add it (uncounted) to the output
        if (hasTag) {
            if (!emptyTagPattern.matcher(tagText).find()) {
                Matcher closing = closingTagPattern.matcher(tagText);
                if (closing.find()) {
                    // closing tag: drop the innermost matching open element
                    openTags.remove(closing.group(1).toLowerCase(java.util.Locale.ROOT));
                } else {
                    Matcher opening = openingTagPattern.matcher(tagText);
                    if (opening.find()) {
                        // opening tag: remember it at the front of the list
                        openTags.add(0, opening.group(1).toLowerCase(java.util.Locale.ROOT));
                    }
                }
            }
            // void elements need no bookkeeping; every tag is copied verbatim
            result.append(tagText);
        }

        // length of the plain text part of the token; an entity (e.g. &nbsp;) counts as one character
        int contentLength = entityPattern.matcher(plainText).replaceAll(" ").length();
        if (totalLength + contentLength > length) {
            // the number of characters which are left; never negative, otherwise the
            // chop position below could become an invalid substring index
            int numCharsRemaining = Math.max(0, length - totalLength);
            int entitiesLength = 0;
            Matcher entityMatcher = entityPattern.matcher(plainText);
            while (entityMatcher.find()) {
                if (numCharsRemaining <= 0) {
                    break; // no more characters left
                }
                numCharsRemaining--;
                entitiesLength += entityMatcher.group(1).length();
            }

            // keep us from chopping words in half
            int proposedChopPosition = numCharsRemaining + entitiesLength;
            int endOfWordPosition = plainText.indexOf(" ", proposedChopPosition - 1);
            if (endOfWordPosition == -1) {
                endOfWordPosition = plainText.length();
            }
            int endOfWordOffset = endOfWordPosition - proposedChopPosition;
            if (endOfWordOffset > 6) { // chop the word if it's extra long
                endOfWordOffset = 0;
            }

            proposedChopPosition = numCharsRemaining + entitiesLength + endOfWordOffset;
            if (plainText.length() >= proposedChopPosition) {
                result.append(plainText, 0, proposedChopPosition);
                proposingChop = true;
                if (proposedChopPosition < plainText.length()) {
                    trimmed = true;
                    break; // maximum length is reached, so get off the loop
                }
            } else {
                result.append(plainText);
            }
        } else {
            result.append(plainText);
            totalLength += contentLength;
        }

        // if the maximum length is reached, get off the loop
        if (totalLength >= length) {
            trimmed = true;
            break;
        }
    }

    // close whatever is still open, innermost element first
    for (String openTag : openTags) {
        result.append("</").append(openTag).append('>');
    }
    if (trimmed) {
        result.append(suffix);
    }
    return result.toString();
}

I think you're going to need to write your own XML parser to accomplish this. Pull out the body node, keep adding nodes while the binary length stays below some fixed size, and then rebuild the document. If HTMLUnit doesn't create semantic XHTML, I'd recommend tagsoup.

If you need an XML parser/handler, I'd recommend XOM.

There is a PHP function that does it here: http://snippets.dzone.com/posts/show/7125

I've made a quick and dirty Java port of the initial version, but there are subsequent improved versions in the comments that could be worth considering (especially one that deals with whole words):

/**
 * Truncates an HTML string to at most {@code l} visible characters, copying the
 * markup through uncounted and closing every element still open at the cut point.
 * A horizontal ellipsis is appended when visible text was actually dropped.
 *
 * <p>Void elements (br, img, ...), self-closing tags, comments and declarations
 * never get a closing tag. A closing tag that does not match the innermost open
 * element is copied through but otherwise ignored.
 *
 * <p>Requires Apache Commons Lang for {@code StringUtils.split} / {@code join}.
 *
 * @param s the HTML to truncate; {@code null} yields an empty string
 * @param l the maximum number of visible characters; values below 0 are treated as 0
 * @return the truncated markup, closing tags and (if text was cut) the ellipsis
 */
public static String truncateHtml(String s, int l) {
  if (s == null) {
    return "";
  }
  // one tag followed by the text run up to the next tag
  Pattern p = Pattern.compile("<[^>]+>([^<]*)");
  // elements that never have an end tag and so must not be put on the stack
  Pattern voidTag = Pattern.compile(
      "area|base|basefont|br|col|embed|frame|hr|img|input|isindex|link|meta|param|source|track|wbr",
      Pattern.CASE_INSENSITIVE);

  int i = 0; // number of markup characters seen so far (not counted against l)
  List<String> tags = new ArrayList<String>(); // open elements, innermost last

  Matcher m = p.matcher(s);
  while (m.find()) {
    // m.start(0) - i is the count of visible characters before this tag
    if (m.start(0) - i >= l) {
      break;
    }

    String markup = s.substring(m.start(0), m.start(1));
    String[] parts = StringUtils.split(markup, " \t\n\r\0\u000B>");
    // drop the leading '<'; for "<br/>" this leaves "br/", for "</p>" it leaves "/p"
    String t = parts.length > 0 ? parts[0].substring(1) : "";
    boolean selfClosing = markup.endsWith("/>");
    if (t.endsWith("/")) {
      t = t.substring(0, t.length() - 1);
    }

    if (t.isEmpty() || t.charAt(0) == '!' || t.charAt(0) == '?') {
      // "< >", comments, doctypes, processing instructions: nothing to track
    } else if (t.charAt(0) != '/') {
      if (!selfClosing && !voidTag.matcher(t).matches()) {
        tags.add(t);
      }
    } else if (!tags.isEmpty() && tags.get(tags.size() - 1).equalsIgnoreCase(t.substring(1))) {
      // only pop when something is open; a stray closing tag must not crash
      tags.remove(tags.size() - 1);
    }
    i += m.start(1) - m.start(0);
  }

  // computed in long and clamped so that a negative or huge l cannot produce a bad index
  int cut = (int) Math.max(0L, Math.min((long) s.length(), (long) l + i));
  // text was truncated only if something other than markup lies beyond the cut
  boolean truncated = !s.substring(cut).replaceAll("<[^>]+>", "").isEmpty();

  Collections.reverse(tags);
  return s.substring(0, cut)
      + (tags.isEmpty() ? "" : "</" + StringUtils.join(tags, "></") + ">")
      + (truncated ? "\u2026" : "");
}

Note: You'll need Apache Commons Lang for StringUtils (both split() and join() are used).

I can offer you a Python script I wrote to do this: http://www.ellipsix.net/ext-tmp/summarize.txt. Unfortunately I don't have a Java version, but feel free to translate it yourself and modify it to suit your needs if you want. It's not very complicated, just something I hacked together for my website, but I've been using it for a little more than a year and it generally seems to work pretty well.

If you want something robust, an XML (or SGML) parser is almost certainly a better idea than what I did.

I found this blog: dencat: Truncating HTML in Java

It contains a Java port of truncate_html_words, a template function from Django (Python):

/**
 * Shortens the visible text of an HTML fragment while leaving all of its markup in
 * place, so the result has the same element structure as the input.
 */
public class SimpleHtmlTruncator {

    /** A complete tag; quoted attribute values may themselves contain '>'. */
    private static final String HTML_TAG_PATTERN = "<(\"[^\"]*\"|'[^']*'|[^'\">])*>";

    /**
     * Splits the input into tags, character entities and text. Every alternative
     * consumes at least one character and any character matches one of them, so the
     * tokens tile the input and nothing (punctuation, non-ASCII letters, a stray
     * '&lt;' or '&amp;') is skipped.
     */
    private static final Pattern TOKEN = Pattern.compile(
            "(?<tag>" + HTML_TAG_PATTERN + ")"
            + "|(?<entity>&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);)"
            + "|(?<text>[^<&]+|[<&])");

    /**
     * Keeps the first {@code max_length} characters of visible text and every tag.
     *
     * <p>Despite the name, the limit is counted in characters, not words. Whitespace
     * counts; a character entity counts as one character and is never split. Tags are
     * copied through unchanged, including those that follow the cut, so elements
     * after the cut remain in the output but are empty.
     *
     * @param text       the HTML to shorten; it is trimmed first, {@code null} yields ""
     * @param max_length the maximum number of visible characters to keep
     * @return the trimmed input if {@code max_length} exceeds its total length, an
     *         empty string if {@code max_length} is negative, otherwise the shortened
     *         markup
     */
    public static String truncateHtmlWords(String text, int max_length) {
        if (text == null) {
            return "";
        }
        String input = text.trim();
        // cheap shortcut: the visible text can never be longer than the whole input
        if (max_length > input.length()) {
            return input;
        }
        if (max_length < 0) {
            return "";
        }

        StringBuilder output = new StringBuilder(input.length());
        int remaining = max_length; // visible characters that may still be emitted
        Matcher token = TOKEN.matcher(input);
        while (token.find()) {
            String tag = token.group("tag");
            if (tag != null) {
                output.append(tag);
            } else if (remaining > 0) {
                String entity = token.group("entity");
                if (entity != null) {
                    // rendered as a single character, so it costs one and stays whole
                    output.append(entity);
                    remaining--;
                } else {
                    String run = token.group("text");
                    int keep = Math.min(remaining, run.length());
                    output.append(run, 0, keep);
                    remaining -= keep;
                }
            }
        }
        return output.toString();
    }

    public static void main(String[] args) {
        String text = SimpleHtmlTruncator.truncateHtmlWords("<html><body><br/><p>abc</p><p>defghij</p><p>ghi</p></body></html>", 4);
        System.out.println(text);
    }
}

继续阅读：parsing truncate

html truncator in java

更多精彩内容

精彩评论

最新问答

央视是哪个频道？

请问买过的朋友，舒提啦旅行箱实际使用体验如何？？

检查不孕不育需要的费用？

海信ULED电视画质有什么不同的地方?？

钉子可以挂的住画框幕布吗？

问答排行榜

河神2九牛入海钓河妖是第几集河妖什么来历可活吞牛？

性激素六项检查的最佳时间是多久？多少钱？？

Easiest way to get words of one line from istream into a vector?

《梦在燃烧 (《三国演义》动画片主题曲)》MP3歌词-汤子星？

抽烟只抽炫赫门？