Convert HTML DOM into a multidimensional array
Where keys are represented by element type and values are represented by `#foo` and `.bar` (spaced and ready for `explode()`). Is it possible, or does something exist for it?
I know that this question might incite some wrath, and I'm hoping nobody links to that post about parsing HTML, but I'm hoping it's not impossible. Thanks for the help.
Addendum: Ideally, PHP would be used, since it's the only scripting language I know.
Thanks for all the help :\ This function will convert a body of html into a multidimensional array that contains attributes, classes and ids.
<?php
/**
 * Converts a body of HTML into a nested array describing its element tree.
 *
 * Script blocks, style blocks and comments are stripped first. Every remaining
 * tag is then extracted, normalised by checkTags() (unrecognised tags become
 * the placeholder 'br') and nested by htmlToArray().
 *
 * @param string $raw_html The HTML markup to convert.
 * @return array Nested array as produced by htmlToArray(); an empty array when
 *               the markup contains no tags or the PCRE engine reports an error.
 */
function htmlArrayer($raw_html){
    // All three patterns are lazy. Deliberately no `U` modifier: it inverts
    // greediness, so `.*?` would become greedy and one match would swallow
    // everything between the first <style> and the last </style>.
    $strip_patterns = array(
        '@<script[^>]*?>.*?</script>@si',
        '@<style[^>]*?>.*?</style>@si',
        '/<!--.*?-->/si',
    );
    $match_open_or_closed = '/(\<(\/?[^\>]+)\>)/';

    $raw_html = preg_replace($strip_patterns, '', $raw_html);
    if ($raw_html === null) {
        // PCRE failure (e.g. backtrack limit hit): nothing reliable to parse.
        return array();
    }

    // Pad the tag delimiters with spaces so adjacent tags and text never fuse.
    $raw_html = str_replace('>', '> ', $raw_html);
    $raw_html = str_replace('<', ' <', $raw_html);
    $raw_html = str_replace('!--', '!-- ', $raw_html);
    // Turn every whitespace character into a plain space so explode(' ') works.
    $raw_html = preg_replace('/[ \t\r\n]/', ' ', $raw_html);

    // Capture group 2 holds the tag content without the angle brackets.
    if (!preg_match_all($match_open_or_closed, $raw_html, $matches)) {
        // No tags at all: there is no root element to build a tree from.
        return array();
    }

    return htmlToArray(checkTags($matches[2]), 0);
}
/**
 * Replaces every tag whose name is not on the whitelist with the string 'br'.
 *
 * The tag name is the first space-separated word of the trimmed entry. Both
 * the opening form ("div") and the closing form ("/div") of a whitelisted
 * name are accepted; accepted entries are returned untouched, keys included.
 *
 * @param array $htmlArray Tag strings without their angle brackets.
 * @return array The same array with unrecognised entries replaced by 'br'.
 */
function checkTags($htmlArray) {
    $known_tags = array('html', 'body', 'div', 'span', 'applet', 'object', 'iframe', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'blockquote', 'pre', 'a', 'abbr', 'acronym', 'address', 'big', 'cite', 'code', 'del', 'dfn', 'em', 'font', 'img', 'ins', 'kbd', 'q', 's', 'samp', 'small', 'strike', 'strong', 'sub', 'sup', 'tt', 'var', 'b', 'u', 'i', 'center', 'dl', 'dt', 'dd', 'ol', 'ul', 'li','fieldset', 'form', 'label', 'legend', 'table', 'caption', 'tbody', 'tfoot', 'thead', 'tr', 'th', 'td');

    foreach (array_keys($htmlArray) as $position) {
        $words = explode(' ', trim($htmlArray[$position]));
        $name = $words[0];

        // Opening form: the name itself is whitelisted.
        $recognised = in_array($name, $known_tags, true);

        // Closing form: exactly one leading slash followed by a whitelisted name.
        if (!$recognised && strncmp($name, '/', 1) === 0) {
            $recognised = in_array(substr($name, 1), $known_tags, true);
        }

        if (!$recognised) {
            $htmlArray[$position] = 'br';
        }
    }

    return $htmlArray;
}
/**
 * Recursively builds the nested array for the element found at $index.
 *
 * The returned array holds, at position 0, the string "<name> <css>" for the
 * element itself (css comes from attrToCSS()), followed by one array per
 * direct child: a single-entry array for a one-way (void) element, or the
 * result of a recursive call for any other opening tag.
 *
 * @param array $untiered_array Flat list of tag strings without angle brackets,
 *                              closing tags starting with '/'.
 * @param int   $index          Position of the element to use as the root.
 * @return array The nested array for that element; an empty array when no
 *               usable element exists at or after $index.
 */
function htmlToArray($untiered_array, $index){
    // Skip every leading 'br' placeholder (doctype, unknown tags) so the root
    // is a real element; stop if the list runs out first.
    $untiered_element = array('');
    while (isset($untiered_array[$index])) {
        $untiered_element = explode(' ', $untiered_array[$index]);
        if ($untiered_element[0] !== 'br') {
            break;
        }
        $index++;
    }
    if (!isset($untiered_array[$index])) {
        return array();
    }

    $element_name = $untiered_element[0];
    $closing_tag = '/' . $element_name;
    $untiered_array[$index] = $element_name . ' ' . attrToCSS($untiered_array[$index]);
    $new_array_layer = array($untiered_array[$index]);

    // Elements that never have a closing tag; built once, not per iteration.
    $one_way_elements = array('br', 'img', 'area', 'base', 'basefont', 'hr', 'input', 'link', 'meta', 'col', 'embed', 'param');

    // Nesting depth relative to this element: 0 means "direct child level".
    $tier_check = 0;

    // Walk forward until this element's own closing tag is met at depth 0.
    // The isset() bound stops the walk at the end of the list when the closing
    // tag is missing, instead of reading undefined entries forever.
    for ($i = $index + 1; isset($untiered_array[$i]) && ($untiered_array[$i] != $closing_tag || $tier_check != 0); $i++) {
        $next_element_name = explode(' ', $untiered_array[$i]);

        // One-way element: it is a leaf, and only recorded when it is a direct child.
        if (in_array($next_element_name[0], $one_way_elements, true)) {
            if ($tier_check == 0) {
                $new_array_layer[] = array($next_element_name[0] . ' ' . attrToCSS($untiered_array[$i]));
            }
            continue;
        }

        // Closing tag of some descendant: step one level back up.
        if (strpos($untiered_array[$i], '/') === 0) {
            $tier_check--;
            continue;
        }

        // Opening tag: one level deeper. Only direct children are recursed
        // into here; deeper descendants are handled by those recursive calls.
        $tier_check++;
        if ($tier_check == 1) {
            $new_array_layer[] = htmlToArray($untiered_array, $i);
        }
    }

    return $new_array_layer;
}
/**
 * Builds a space-separated, CSS-style selector string from a tag's attributes.
 *
 * The id (if any) comes first as "#id", followed by one ".class" token per
 * class name, e.g. 'div id="x" class="a b"' yields '#x .a .b'. Only
 * double-quoted attribute values are recognised.
 *
 * @param string $attr_string Tag content without angle brackets.
 * @return string Selector tokens ready for explode(' '); '' when the tag has
 *                neither an id nor a class.
 */
function attrToCSS($attr_string){
    $selectors = array();

    // (?<![\w-]) requires the attribute name to start a word, so "data-id",
    // "valid" or "subclass" are not mistaken for id / class. [^"]* keeps an
    // empty value from capturing text belonging to the next attribute.
    if (preg_match('/(?<![\w-])id\s*=\s*"([^"]*)"/i', $attr_string, $id_match)) {
        $id_value = trim($id_match[1]);
        if ($id_value !== '') {
            $selectors[] = '#' . $id_value;
        }
    }

    if (preg_match('/(?<![\w-])class\s*=\s*"([^"]*)"/i', $attr_string, $class_match)) {
        // Split on whitespace runs so repeated spaces do not yield bare "." tokens.
        foreach (preg_split('/\s+/', $class_match[1], -1, PREG_SPLIT_NO_EMPTY) as $class_name) {
            $selectors[] = '.' . $class_name;
        }
    }

    return implode(' ', $selectors);
}
?>
精彩评论