0

I have a function that scrapes an image from an HTML web page, and this is the HTML source I want to scrape:

<div class="single-post-thumb">
        <img width="448" height="298" src="http://www.website.com/wp-content/uploads/2015/02/DSC_2803.jpg" class="attachment-660x330" alt="Description image" title="Description title" />      </div>

And this is my scrape function:

/**
 * Fetches each news page in the collection, extracts its thumbnail URL and
 * article body, and passes the completed item to insertIntoWordpress().
 *
 * Items are processed in the reverse of the order in which they are given.
 * For every item, the thumbnail URL is stored under "news_images" (empty
 * string when the page has no thumbnail block) and the article body, followed
 * by a "Sumber Berita" link back to the source page and the thumbnail URL, is
 * stored under "news_content".
 *
 * @param array $news_coll List of news items; each item must have a "news_url" key.
 */
public function process_individual_links($news_coll)
{
    // "\s*" tolerates whitespace between the tags, the optional group skips any
    // attributes (width, height, ...) that precede src, and the "s" modifier lets
    // "." cross newlines in case the page still contains them.
    $thumb_and_entry_expr = '#<div class="single-post-thumb">\s*<img\s+(?:[^>]*?\s)?src="([^"]*)"[^>]*>\s*</div>.*?<div class="entry">(.*?)</div>#s';

    // Fallback for pages without a thumbnail block: article body only.
    $entry_only_expr = '#<div class="entry">(.*?)</div><!-- .entry /-->#s';

    foreach (array_reverse($news_coll) as $news)
    {
        $news_url = $news["news_url"];
        $preview = $this->_http->request($news_url);
        $preview = $this->stripNewLine($preview);

        if (preg_match($thumb_and_entry_expr, $preview, $matches) === 1)
        {
            $news["news_images"] = $matches[1];
            $news["news_content"] = $matches[2];
            echo" $news[news_images] ";
        }
        else
        {
            // No thumbnail found, so there is no image to record for this item.
            $news["news_images"] = "";
            $news["news_content"] = preg_match($entry_only_expr, $preview, $matches) === 1
                ? $matches[1]
                : "";
        }

        $news["news_content"] = $news["news_content"]
            . "<p><a href='" . $news_url . "'>Sumber Berita</a></p>"
            . $news["news_images"];

        if ($this->insertIntoWordpress($news, "TNI") == "-1")
            echo " ";
        else
            echo "Fetching Content - " . $news["news_url"] . "" . $news["news_images"] . "";
    }
}

I tried it on another site and it works there, where the tag looks like <img src=""> with no width and height attributes before src.

This is the expression I use to scrape the code:

// Pattern used to capture the thumbnail src (group 1) and the entry body (group 2).
// NOTE: as written it allows no whitespace between the opening <div> and <img>,
// nor between "/>" and "</div>", and without the "s" modifier "." does not match newlines.
$expr = '#<div class="single-post-thumb"><img .*? src="(.*?)".*?/></div>.*?<div class="entry">(.*?)</div>#';
Cœur
  • 32,421
  • 21
  • 173
  • 232
  • Add `s (PCRE_DOTALL)` [flag](http://php.net/manual/en/reference.pcre.pattern.modifiers.php) after closing delimiter to make the dot also match newlines: ...`#s` and spaces `\s*` between tags, [see test at regex101](https://regex101.com/r/cC2kK1/3). Also note, that `img .*? src` would require 2 spaces if order is ` – Jonny 5 Feb 21 '15 at 14:13
  • I have try the code but still not work – Andy Nugroho Feb 21 '15 at 14:42
  • there is width="640" height="330" so – Andy Nugroho Feb 21 '15 at 15:07
  • Andy see [Regex FAQ](http://stackoverflow.com/questions/22937618/reference-what-does-this-regex-mean/22944075#22944075), replace that part with such as `width="\d*" height="\d*"` – Jonny 5 Feb 21 '15 at 15:10
  • Thanks jonny its work now...... – Andy Nugroho Feb 21 '15 at 15:41

0 Answers0