Jump to content

Understanding Patterns in preg_list()


hakhaimo

Recommended Posts

Sample:

<?php

// Demo script: escape the characters < > " ' everywhere in an HTML fragment
// EXCEPT inside tags that appear on a whitelist of known HTML elements.
// Whitelisted tags are passed through untouched; unknown tags (and all text
// between tags) have those four characters replaced by HTML entities.

// Tag-name sub-patterns, one per leading letter. Each is a regex fragment that
// matches the element name only (e.g. 'c(aption|enter|ite|...)' covers
// caption, center, cite, code, col and colgroup).
$tagNamePatterns = [
    'a(bbr|cronym|ddress|rea)?',
    'b(do|ig|lockquote|r|utton)?',
    'c(aption|enter|ite|o(de|l(group)?){1}){1}',
    'd(d|el|fn|iv|l|t){1}',
    'em',
    'font',
    'h(1|2|3|4|5|6|r){1}',
    'i(ns)?',
    'kbd',
    'map',
    'noscript',
    'o(bject|l){1}',
    'p(aram|re)?',
    'q',
    's(amp|cript|mall|pan|trong|u(b|p){1}){1}',
    't(able|body|d|head|r|t){1}',
    'u(l)?',
    'var',
];

// Build one complete, case-insensitive pattern per entry:
//   ^<        the tag must start the candidate string (no match buried inside
//             a rejected tag such as "<goober <a>")
//   \/?       optional slash for closing tags
//   NAME      one of the sub-patterns above
//   lookahead the name must be followed by whitespace, "/" or ">", so that
//             "<i>" is accepted but "<iframe>" is not
//   [^>]*>    attributes up to the closing bracket
$htmltags = [];
foreach ($tagNamePatterns as $namePattern) {
    $htmltags[] = '/^<\/?' . $namePattern . '(?=[\s\/>])[^>]*>/i';
}

// Kept for backward compatibility: the comma-separated form of the list.
$htmltaglist = implode(',', $htmltags);

$sHTML = "starting text before first tag<p>Sentence with > invalid & characters<sup>10</sup><.</p><p>This's a \"test\".</p><a name=\"test\">Test</a>.
<goober>This is not a valid tag</goober
><p><strong><gobber>More valid stuff</gobber></strong></p><acronym title=\"test\">Test Acronym</acronym>
<cite>Test Cite</cite><colgroup>text might appear between valid tags<h1>Heading</h1> text might appear after tags";

// Anything that looks like a tag: "<", optional "/", a word, then up to ">".
$regex = '/<\/?\w+[^>]*>/';

// Only these four characters are converted; "&" is deliberately left alone.
// NOTE(review): the replacement strings were lost in the copy of this script
// (they appeared as the literal characters); reconstructed here as the usual
// HTML entities - confirm against the original intent.
$entityMap = [
    '"'  => '&quot;',
    '\'' => '&#039;',
    '<'  => '&lt;',
    '>'  => '&gt;',
];

echo $sHTML."\n\n";

// Split on tag-like substrings. With PREG_SPLIT_OFFSET_CAPTURE every element
// is [text, byteOffset]; the tags themselves are NOT returned, so each tag is
// recovered below as the gap between the end of the previous text piece and
// the start of the current one.
$arrayHTML = preg_split($regex, $sHTML, -1, PREG_SPLIT_OFFSET_CAPTURE);
if ($arrayHTML === false) {
    // Regex engine failure: fall back to treating the whole input as plain
    // text so that everything gets escaped rather than passed through.
    $arrayHTML = [[$sHTML, 0]];
}

$nElements = count($arrayHTML);
$nIndex = 0;        // byte offset just past the previous text piece
$sNewString = "";

foreach ($arrayHTML as $value) {
    list($text, $offset) = $value;

    // The tag that precedes this text piece (empty string for the first piece).
    $tag = substr($sHTML, $nIndex, $offset - $nIndex);
    echo $tag."\r\n";

    // Check the tag against the whitelist.
    $found = false;
    foreach ($htmltags as $pattern) {
        if (preg_match($pattern, $tag)) {
            $found = true;
            break;
        }
    }

    // Unknown tags are neutralised; text between tags is always escaped.
    if (!$found) {
        $tag = strtr($tag, $entityMap);
    }
    $sNewString .= $tag . strtr($text, $entityMap);

    // The last text piece always runs to the end of the input, so nothing is
    // left over after the loop.
    $nIndex = $offset + strlen($text);
}

echo $sNewString;

echo "-----------------------------------------------------------------";

echo $nElements;
// Dump the split result for inspection (printf() cannot take an array).
print_r($arrayHTML);
?>

 

May I ask what the patterns, or the individual characters used in a pattern, mean?

 

I know that, for example:

 

$htmltaglist .= ',/<\/?c(aption|enter|ite|o(de|l(group)?){1}){1}[^>]*>/i';

 

refers to a tag

caption

center

cite

 

what does this mean?

 

What do "?", "\/", "o(de|l(group)?){1}){1}[^>]*>" and the other parts mean?

 

I really need this badly. Thank you very much!

Edit/Delete Message

Link to comment
https://forums.phpfreaks.com/topic/120599-understanding-patterns-in-preg_list/
Share on other sites

Please review regex.

NODE                    EXPLANATION

----------------------------------------------------------------------

  <                        '<'

----------------------------------------------------------------------

  /?                      '/' (optional (matching the most amount

                          possible))

----------------------------------------------------------------------

  c                        'c'

----------------------------------------------------------------------

  (                        group and capture to \1 (1 times):

----------------------------------------------------------------------

    aption                  'aption'

----------------------------------------------------------------------

  |                        OR

----------------------------------------------------------------------

    enter                    'enter'

----------------------------------------------------------------------

  |                        OR

----------------------------------------------------------------------

    ite                      'ite'

----------------------------------------------------------------------

  |                        OR

----------------------------------------------------------------------

    o                        'o'

----------------------------------------------------------------------

    (                        group and capture to \2 (1 times):

----------------------------------------------------------------------

      de                      'de'

----------------------------------------------------------------------

    |                        OR

----------------------------------------------------------------------

      l                        'l'

----------------------------------------------------------------------

      (                        group and capture to \3 (optional

                              (matching the most amount possible)):

----------------------------------------------------------------------

        group                    'group'

----------------------------------------------------------------------

      )?                      end of \3 (NOTE: because you're using

                              a quantifier on this capture, only the

                              LAST repetition of the captured

                              pattern will be stored in \3)

----------------------------------------------------------------------

    ){1}                    end of \2 (NOTE: because you're using a

                            quantifier on this capture, only the

                            LAST repetition of the captured pattern

                            will be stored in \2)

----------------------------------------------------------------------

  ){1}                    end of \1 (NOTE: because you're using a

                          quantifier on this capture, only the LAST

                          repetition of the captured pattern will be

                          stored in \1)

----------------------------------------------------------------------

  [^>]*                    any character except: '>' (0 or more times

                          (matching the most amount possible))

----------------------------------------------------------------------

  >                        '>'

----------------------------------------------------------------------

Archived

This topic is now archived and is closed to further replies.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.