开发者

Get XPath for all td in a table using Pentaho

Is there any way to use Pentaho to parse a table's td elements from an HTML page? Let's say I have this HTML content:

<html>
  <body>
    <table>
      <tr>
        <td>info1</td>
        <td>info2</td>
      </tr>
      <tr>
        <td>info3</td>
        <td>info4</td>
      </tr>
    </table>
  </body>
</html>
In Pentaho I am using the "Get data from XML" step with the following settings:

Content::
Loop XPath: /html/body/table/tr
Fields::
Name: tableData
XPath: td
The data I would like to get is
info1 info2 info3 info4
in any form.

Any help would be truly appreciated!


I solved it by reading every row of my file in as rows. Then I added a Pentaho "User Defined Java Class" step and made it transform my table content with XSLT into a new XML file. Using that XML I was able to get the data needed to complete the task.
Here is what I wrote in the "User Defined Java Class" step:


import java.util.*;
import java.io.FileOutputStream;

import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;

// Position in the input row metadata of the field looked up via the "infilename" parameter (assigned in processRow).
private int infilenameIndex;
// Position in the input row metadata of the field looked up via the "xsltfilename" parameter (assigned in processRow).
private int xsltfilenameIndex;
// Position in the input row metadata of the field looked up via the "outfilename" parameter (assigned in processRow).
private int outfilenameIndex;


/**
 * Handles one input row: reads the input, stylesheet and output file names
 * from the row, runs the XSLT transformation and forwards the row unchanged
 * (resized to the output row layout).
 *
 * @param smi step metadata supplied by the framework (not used here)
 * @param sdi step data supplied by the framework (not used here)
 * @return false when there are no more input rows, true otherwise
 * @throws KettleException if a required field is missing from the input row
 *                         or the transformation fails
 */
public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {
  Object[] r = getRow();
  if (r == null) {
    setOutputDone();
    return false;
  }

  // 'first' is true only until it is cleared below, so the field positions
  // are resolved and validated exactly once, on the first row.
  if (first) {
     infilenameIndex = findFieldIndex("infilename");
     xsltfilenameIndex = findFieldIndex("xsltfilename");
     outfilenameIndex = findFieldIndex("outfilename");

     first = false;
  }

  String infilename = getInputRowMeta().getString(r, infilenameIndex);
  String xsltfilename = getInputRowMeta().getString(r, xsltfilenameIndex);
  String outfilename = getInputRowMeta().getString(r, outfilenameIndex);

  Object[] outputRowData = RowDataUtil.resizeArray(r, data.outputRowMeta.size());

  transform(infilename, xsltfilename, outfilename);

  putRow(data.outputRowMeta, outputRowData);

  return true;
}

/**
 * Resolves the position of an input field. The field name is taken from the
 * step parameter with the given name; when that parameter is not configured,
 * the parameter name itself is used as the field name.
 *
 * @param parameterName name of the step parameter (and default field name)
 * @return index of the field in the input row
 * @throws KettleException if the input row has no such field
 */
private int findFieldIndex(String parameterName) throws KettleException {
  String fieldName = getParameter(parameterName);
  if (fieldName == null || fieldName.length() == 0) {
    fieldName = parameterName;
  }
  int index = getInputRowMeta().indexOfValue(fieldName);
  if (index < 0) {
    throw new KettleException("Field not found in the input row, check parameter '" + parameterName + "'!");
  }
  return index;
}
/**
 * Applies an XSLT stylesheet to an input document and writes the result.
 * All three names are handed to the JAXP stream classes as system
 * identifiers, exactly as given.
 *
 * @param infilename   name of the document to transform
 * @param xsltfilename name of the XSLT stylesheet
 * @param outfilename  name of the result to write
 * @throws KettleException if any name is missing or the transformation fails
 */
public void transform(String infilename, String xsltfilename, String outfilename) throws KettleException {

    logBasic("");
    logBasic("Transformerar " +  infilename + " med " + xsltfilename + " till " + outfilename );
    logBasic("");

    // The StreamSource/StreamResult constructors only store the name and never
    // fail, so missing names are detected explicitly here instead of surfacing
    // later as an obscure transformer error.
    if (infilename == null || infilename.length() == 0) {
       logError("Infil saknas " +  infilename);
       throw new KettleException("Infil saknas " + infilename);
    }
    if (xsltfilename == null || xsltfilename.length() == 0) {
       logError("Xsltfil saknas " +  xsltfilename);
       throw new KettleException("Xsltfil saknas " + xsltfilename);
    }
    if (outfilename == null || outfilename.length() == 0) {
       logError("Outfil saknas " +  outfilename);
       throw new KettleException("Outfil saknas " + outfilename);
    }

    javax.xml.transform.stream.StreamSource inss = new javax.xml.transform.stream.StreamSource(infilename);
    javax.xml.transform.stream.StreamSource xsltss = new javax.xml.transform.stream.StreamSource(xsltfilename);
    javax.xml.transform.stream.StreamResult outss = new javax.xml.transform.stream.StreamResult(outfilename);

    try {
        // Uses the default JAXP implementation. To force Saxon instead,
        // instantiate net.sf.saxon.TransformerFactoryImpl directly.
        TransformerFactory tFactory = TransformerFactory.newInstance();

        Transformer transformer = tFactory.newTransformer(xsltss);

        // Run the transformation.
        transformer.transform(inss, outss);
    }
    catch (Exception e) {
       // Wrap so the step fails with the exception type Kettle expects.
       throw new KettleException(e);
    }
}


I saw this. For anyone who is coming here now: parsing to XML can be done with jsoup using the appropriate path. It's a simple plugin and works in the User Defined Java Class with whatever other methods you have. The path is a CSS selector.

0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜