<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>Tpoxa&#039;s blog &#187; XSLT</title>
	<atom:link href="http://tpoxa.com/category/xslt/feed/" rel="self" type="application/rss+xml" />
	<link>http://tpoxa.com</link>
	<description></description>
	<lastBuildDate>Sun, 19 Jun 2011 21:11:15 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.org/?v=3.3.1</generator>
		<item>
		<title>market.yandex.ru парсер</title>
		<link>http://tpoxa.com/2008/09/09/marketyandexru-parser/</link>
		<comments>http://tpoxa.com/2008/09/09/marketyandexru-parser/#comments</comments>
		<pubDate>Tue, 09 Sep 2008 18:48:36 +0000</pubDate>
		<dc:creator>admin</dc:creator>
				<category><![CDATA[PHP]]></category>
		<category><![CDATA[XSLT]]></category>

		<guid isPermaLink="false">http://tpoxa.com/?p=28</guid>
		<description><![CDATA[Недавно появилась задача парсить странички. Точнее, market.yandex.ru. Собрать все описания в готовом к вставке в базу виде, а также доступные изображения к товару. Исторически сложилось, что с регулярками у меня хуже, чем с XSLT. Все хорошо, но маркет не дает валидный код, и парсить его напрямую у меня не получилось. На помощь пришел Tidy. &#60;?php [...]]]></description>
			<content:encoded><![CDATA[<p>Недавно появилась задача парсить странички. Точнее, market.yandex.ru.</p>
<p>Собрать все описания в готовом к вставке в базу виде, а также доступные изображения к товару.</p>
<p>Исторически сложилось, что с регулярками у меня хуже, чем с XSLT.</p>
<p>Все хорошо, но маркет не дает валидный код, и парсить его напрямую у меня не получилось.</p>
<p>На помощь пришел Tidy.</p>
<p><span id="more-28"></span></p>


<div class="wp-geshi-highlight-wrap5"><div class="wp-geshi-highlight-wrap4"><div class="wp-geshi-highlight-wrap3"><div class="wp-geshi-highlight-wrap2"><div class="wp-geshi-highlight-wrap"><div class="wp-geshi-highlight"><div class="php"><pre class="de1"><span class="kw2">&lt;?php</span>
&nbsp;
<span class="re0">$YMI</span><span class="sy0">=</span><span class="br0">&#40;</span><span class="kw3">isset</span><span class="br0">&#40;</span><span class="re0">$_GET</span><span class="br0">&#91;</span><span class="st_h">'ymid'</span><span class="br0">&#93;</span><span class="br0">&#41;</span><span class="br0">&#41;</span> ? <span class="kw3">urldecode</span><span class="br0">&#40;</span><span class="re0">$_GET</span><span class="br0">&#91;</span><span class="st_h">'ymid'</span><span class="br0">&#93;</span><span class="br0">&#41;</span><span class="sy0">:</span> <span class="st0">&quot;&quot;</span><span class="sy0">;</span> <span class="co1">// переменная хранит адрес страницы с описанием на Яндексе</span>
<span class="re0">$content</span> <span class="sy0">=</span> <span class="kw3">file_get_contents</span><span class="br0">&#40;</span><span class="re0">$YMI</span><span class="br0">&#41;</span><span class="sy0">;</span>
&nbsp;
<span class="co1">// $content = iconv(&quot;CP1251&quot;, &quot;UTF-8//IGNORE&quot;, $content); // эта строка не нужна, так как Яндекс маркет перешел на UTF-8</span>
&nbsp;
<span class="re0">$config</span> <span class="sy0">=</span> <span class="kw3">array</span><span class="br0">&#40;</span>
<span class="st0">&quot;indent&quot;</span>        <span class="sy0">=&gt;</span> <span class="kw4">true</span><span class="sy0">,</span>
<span class="st0">&quot;output-xml&quot;</span>    <span class="sy0">=&gt;</span> <span class="kw4">true</span><span class="sy0">,</span>
<span class="st0">&quot;wrap&quot;</span>          <span class="sy0">=&gt;</span> <span class="nu0">200</span><span class="br0">&#41;</span><span class="sy0">;</span>
&nbsp;
<span class="re0">$tidy</span> <span class="sy0">=</span> <span class="kw2">new</span> tidy<span class="sy0">;</span>
<span class="re0">$tidy</span><span class="sy0">-&gt;</span><span class="me1">parseString</span><span class="br0">&#40;</span><span class="re0">$content</span><span class="sy0">,</span> <span class="re0">$config</span><span class="sy0">,</span> <span class="st0">&quot;utf8&quot;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="re0">$tidy</span><span class="sy0">-&gt;</span><span class="me1">cleanRepair</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="sy0">;</span>
&nbsp;
<span class="re0">$xml</span><span class="sy0">=</span><span class="kw3">simplexml_load_string</span> <span class="br0">&#40;</span><span class="re0">$tidy</span><span class="br0">&#41;</span><span class="sy0">;</span>
&nbsp;
<span class="re0">$groups</span> <span class="sy0">=</span> <span class="re0">$xml</span><span class="sy0">-&gt;</span><span class="me1">xpath</span><span class="br0">&#40;</span><span class="st0">&quot;//table[@class='modelProperties']//tr/td[@class='title']/b&quot;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="re0">$strings</span> <span class="sy0">=</span> <span class="re0">$xml</span><span class="sy0">-&gt;</span><span class="me1">xpath</span><span class="br0">&#40;</span><span class="st0">&quot;//table[@class='modelProperties']//tr/td[@class='title']/b|//table[@class='modelProperties']//td[@class='label']/span/text()|//table[@class='modelProperties']//tr/td[position()=2]/text()&quot;</span><span class="br0">&#41;</span><span class="sy0">;</span>
&nbsp;
<span class="re0">$results_groups</span><span class="sy0">=</span><span class="kw3">array</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="re0">$results</span><span class="sy0">=</span><span class="kw3">array</span><span class="br0">&#40;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="re0">$i</span><span class="sy0">=</span><span class="nu0">0</span><span class="sy0">;</span>
<span class="re0">$GI</span><span class="sy0">=</span><span class="nu0">0</span><span class="sy0">;</span>
&nbsp;
<span class="co1">//print_r($strings);</span>
<span class="kw1">while</span><span class="br0">&#40;</span><span class="re0">$i</span><span class="sy0">&lt;</span><span class="kw3">sizeof</span> <span class="br0">&#40;</span><span class="re0">$strings</span><span class="br0">&#41;</span><span class="br0">&#41;</span>
<span class="br0">&#123;</span>
<span class="kw1">if</span><span class="br0">&#40;</span><span class="kw3">in_array</span><span class="br0">&#40;</span><span class="re0">$strings</span><span class="br0">&#91;</span><span class="re0">$i</span><span class="br0">&#93;</span><span class="sy0">,</span><span class="re0">$groups</span><span class="br0">&#41;</span><span class="br0">&#41;</span>
<span class="br0">&#123;</span>
<span class="re0">$GI</span><span class="sy0">=</span><span class="kw3">sizeof</span><span class="br0">&#40;</span><span class="re0">$results_groups</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="re0">$results_groups</span><span class="br0">&#91;</span><span class="br0">&#93;</span><span class="sy0">=</span><span class="br0">&#40;</span>string<span class="br0">&#41;</span><span class="re0">$strings</span><span class="br0">&#91;</span><span class="re0">$i</span><span class="br0">&#93;</span><span class="sy0">;</span>
<span class="re0">$results</span><span class="br0">&#91;</span><span class="re0">$GI</span><span class="br0">&#93;</span><span class="br0">&#91;</span><span class="br0">&#93;</span><span class="sy0">=</span><span class="kw3">array</span><span class="br0">&#40;</span><span class="br0">&#40;</span>string<span class="br0">&#41;</span><span class="re0">$strings</span><span class="br0">&#91;</span><span class="re0">$i</span><span class="sy0">+</span><span class="nu0">1</span><span class="br0">&#93;</span><span class="sy0">,</span><span class="br0">&#40;</span>string<span class="br0">&#41;</span><span class="re0">$strings</span><span class="br0">&#91;</span><span class="re0">$i</span><span class="sy0">+</span><span class="nu0">2</span><span class="br0">&#93;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="re0">$i</span><span class="sy0">=</span><span class="re0">$i</span><span class="sy0">+</span><span class="nu0">3</span><span class="sy0">;</span>
<span class="br0">&#125;</span>
<span class="kw1">else</span>
<span class="br0">&#123;</span>
<span class="re0">$results</span><span class="br0">&#91;</span><span class="re0">$GI</span><span class="br0">&#93;</span><span class="br0">&#91;</span><span class="br0">&#93;</span><span class="sy0">=</span><span class="kw3">array</span><span class="br0">&#40;</span><span class="br0">&#40;</span>string<span class="br0">&#41;</span><span class="re0">$strings</span><span class="br0">&#91;</span><span class="re0">$i</span><span class="br0">&#93;</span><span class="sy0">,</span><span class="br0">&#40;</span>string<span class="br0">&#41;</span><span class="re0">$strings</span><span class="br0">&#91;</span><span class="re0">$i</span><span class="sy0">+</span><span class="nu0">1</span><span class="br0">&#93;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="re0">$i</span><span class="sy0">=</span><span class="re0">$i</span><span class="sy0">+</span><span class="nu0">2</span><span class="sy0">;</span>
<span class="br0">&#125;</span>
<span class="br0">&#125;</span>
&nbsp;
<span class="re0">$photos</span> <span class="sy0">=</span> <span class="re0">$xml</span><span class="sy0">-&gt;</span><span class="me1">xpath</span><span class="br0">&#40;</span><span class="st0">&quot;//table[@class='modelpict']//a/@href&quot;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="kw1">if</span><span class="br0">&#40;</span><span class="kw3">empty</span><span class="br0">&#40;</span><span class="re0">$photos</span><span class="br0">&#41;</span><span class="br0">&#41;</span>
<span class="br0">&#123;</span>
<span class="re0">$photos</span> <span class="sy0">=</span> <span class="re0">$xml</span><span class="sy0">-&gt;</span><span class="me1">xpath</span><span class="br0">&#40;</span><span class="st0">&quot;//table[@class='modelpict']//img/@src&quot;</span><span class="br0">&#41;</span><span class="sy0">;</span>
<span class="br0">&#125;</span>
&nbsp;
<span class="kw1">foreach</span><span class="br0">&#40;</span> <span class="re0">$photos</span> <span class="kw1">as</span> <span class="re0">$photo</span><span class="br0">&#41;</span>
<span class="br0">&#123;</span>
<span class="kw1">print</span> <span class="st0">&quot;&lt;img src=<span class="es1">\&quot;</span>&quot;</span><span class="sy0">;</span>
<span class="kw1">print</span> <span class="br0">&#40;</span>string<span class="br0">&#41;</span> <span class="re0">$photo</span><span class="sy0">;</span>
<span class="kw1">print</span> <span class="st0">&quot;<span class="es1">\&quot;</span> /&gt;&quot;</span><span class="sy0">;</span>
<span class="br0">&#125;</span>
&nbsp;
<span class="kw1">print</span> <span class="st0">&quot;&lt;hr /&gt;&lt;table&gt;&lt;caption&gt;Технические характеристики&lt;/caption&gt;&quot;</span><span class="sy0">;</span>
&nbsp;
<span class="kw1">foreach</span><span class="br0">&#40;</span><span class="re0">$results_groups</span> <span class="kw1">as</span> <span class="re0">$k</span><span class="sy0">=&gt;</span><span class="re0">$v</span><span class="br0">&#41;</span>
<span class="br0">&#123;</span>
<span class="kw1">print</span> <span class="st0">&quot;&lt;tr&gt;&lt;td colspan=<span class="es1">\&quot;</span>2<span class="es1">\&quot;</span> align=<span class="es1">\&quot;</span>center<span class="es1">\&quot;</span> style=<span class="es1">\&quot;</span>background-color:#ccc<span class="es1">\&quot;</span>&gt;&quot;</span><span class="sy0">.</span><span class="re0">$v</span><span class="sy0">.</span><span class="st0">&quot;&lt;/td&gt;&lt;/tr&gt;&quot;</span><span class="sy0">;</span>
&nbsp;
<span class="kw1">foreach</span><span class="br0">&#40;</span><span class="re0">$results</span><span class="br0">&#91;</span><span class="re0">$k</span><span class="br0">&#93;</span> <span class="kw1">as</span> <span class="re0">$arr</span><span class="br0">&#41;</span>
<span class="br0">&#123;</span>
<span class="kw1">print</span> <span class="st0">&quot;&lt;tr&gt;&lt;td&gt;&quot;</span><span class="sy0">.</span><span class="re0">$arr</span><span class="br0">&#91;</span><span class="nu0">0</span><span class="br0">&#93;</span><span class="sy0">.</span><span class="st0">&quot;&lt;/td&gt;&lt;td&gt;&quot;</span><span class="sy0">.</span><span class="re0">$arr</span><span class="br0">&#91;</span><span class="nu0">1</span><span class="br0">&#93;</span><span class="sy0">.</span><span class="st0">&quot;&lt;/td&gt;&lt;/tr&gt;&quot;</span><span class="sy0">;</span>
<span class="br0">&#125;</span>
<span class="br0">&#125;</span>
<span class="kw1">print</span> <span class="st0">&quot;&lt;/table&gt;&quot;</span><span class="sy0">;</span>
&nbsp;
<span class="sy1">?&gt;</span></pre></div></div></div></div></div></div></div>


]]></content:encoded>
			<wfw:commentRss>http://tpoxa.com/2008/09/09/marketyandexru-parser/feed/</wfw:commentRss>
		<slash:comments>18</slash:comments>
		</item>
	</channel>
</rss>

