Get XPath for all td in a table using Pentaho
Is there any way, using Pentaho, to parse a table's td elements from an HTML page? Let's say I have this HTML content:
<html>
<body>
<table>
<tr>
<td>info1</td>
<td>info2</td>
</tr>
<tr>
<td>info3</td>
<td>info4</td>
</tr>
</table>
</body>
</html>
In Pentaho I am using the "Get data from XML" step with the following settings:
Content::
Loop XPath: /html/body/table/tr
Fields::
Name: tableData
XPath: td
The data information I would like to have is
info1 info2 info3 info4
in any kind of way.
Any help would be truly appreciated! I solved it by reading every line of my file in as rows. Then I added a Pentaho "User Defined Java Class" step and had it transform my table content with XSLT into a new XML file. Using that XML I was able to get the data needed to complete the task.
Here is what I wrote in "User Defined Java Class":
import java.util.*;
import java.io.FileOutputStream;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
// Positions of the file-name fields within the input row. processRow assigns them
// from getInputRowMeta().indexOfValue(...) using the "infilename", "xsltfilename"
// and "outfilename" step parameters.
private int infilenameIndex;
private int xsltfilenameIndex;
private int outfilenameIndex;
/**
 * Handles one input row: reads the input, stylesheet and output file names from the
 * row, runs the XSLT transformation for them and passes the row on to the next step.
 *
 * On the first row the positions of the three file-name fields are looked up once and
 * cached in infilenameIndex, xsltfilenameIndex and outfilenameIndex.
 *
 * @param smi step metadata supplied by the engine (not used here)
 * @param sdi step data supplied by the engine (not used here)
 * @return true while there may be more rows to process, false once the input is exhausted
 * @throws KettleException if a file-name field is missing from the input row or the
 *         transformation fails
 */
public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {
    Object[] r = getRow();
    if (r == null) {
        setOutputDone();
        return false;
    }

    // 'first' is true for the first row only. The original tested 'first == false',
    // so this lookup and its validation never ran.
    if (first) {
        first = false;
        infilenameIndex = resolveFieldIndex("infilename");
        xsltfilenameIndex = resolveFieldIndex("xsltfilename");
        outfilenameIndex = resolveFieldIndex("outfilename");
    }

    // Read through the validated indexes so the check above and the values used agree.
    String infilename = getInputRowMeta().getString(r, infilenameIndex);
    String xsltfilename = getInputRowMeta().getString(r, xsltfilenameIndex);
    String outfilename = getInputRowMeta().getString(r, outfilenameIndex);

    Object[] outputRowData = RowDataUtil.resizeArray(r, data.outputRowMeta.size());

    transform(infilename, xsltfilename, outfilename);

    putRow(data.outputRowMeta, outputRowData);
    return true;
}

/**
 * Finds the position of a file-name field in the input row.
 *
 * The field name is taken from the step parameter with the given tag. When that
 * parameter is not set, the tag itself is used as the field name, which matches the
 * field names this step originally read directly.
 *
 * @param parameterTag tag of the step parameter that names the input field
 * @return index of the field in the input row
 * @throws KettleException if the input row has no such field
 */
private int resolveFieldIndex(String parameterTag) throws KettleException {
    String fieldName = getParameter(parameterTag);
    if (fieldName == null || fieldName.length() == 0) {
        fieldName = parameterTag;
    }
    int index = getInputRowMeta().indexOfValue(fieldName);
    if (index < 0) {
        throw new KettleException("Field not found in the input row, check parameter '" + parameterTag + "'!");
    }
    return index;
}
/**
 * Applies an XSLT stylesheet to an XML input and writes the result to an output location.
 *
 * The three arguments are passed to StreamSource / StreamResult as system identifiers.
 *
 * @param infilename location of the XML document to transform
 * @param xsltfilename location of the XSLT stylesheet
 * @param outfilename location the transformation result is written to
 * @throws KettleException if the stylesheet cannot be loaded or the transformation
 *         fails; the underlying exception is kept as the cause
 */
public void transform(String infilename, String xsltfilename, String outfilename) throws KettleException {
    logBasic("");
    logBasic("Transformerar " + infilename + " med " + xsltfilename + " till " + outfilename );
    logBasic("");

    // These constructors only record the system id; nothing is opened here, so a
    // missing file is reported by the transformer calls below, not at this point.
    javax.xml.transform.stream.StreamSource inss = new javax.xml.transform.stream.StreamSource(infilename);
    javax.xml.transform.stream.StreamSource xsltss = new javax.xml.transform.stream.StreamSource(xsltfilename);
    javax.xml.transform.stream.StreamResult outss = new javax.xml.transform.stream.StreamResult(outfilename);

    Transformer transformer;
    try {
        // Uses the default JAXP implementation. To force Saxon instead, create the
        // factory with new net.sf.saxon.TransformerFactoryImpl().
        TransformerFactory tFactory = TransformerFactory.newInstance();
        transformer = tFactory.newTransformer(xsltss);
    }
    catch (Exception e) {
        throw new KettleException("Unable to load XSLT stylesheet '" + xsltfilename + "'", e);
    }

    try {
        // Do the transformation
        transformer.transform(inss, outss);
    }
    catch (Exception e) {
        throw new KettleException("Unable to transform '" + infilename + "' with '" + xsltfilename
            + "' into '" + outfilename + "'", e);
    }
}
Saw this. For anyone who is coming here now: parsing to XML can be done with jsoup using the appropriate path. It's a simple plugin and works in the User Defined Java Class alongside whatever other methods you have. The path is a CSS selector.
精彩评论