<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>Tpoxa&#039;s blog &#187; XSLT</title>
	<atom:link href="http://tpoxa.com/category/xslt/feed/" rel="self" type="application/rss+xml" />
	<link>http://tpoxa.com</link>
	<description></description>
	<lastBuildDate>Tue, 17 Aug 2010 07:06:28 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.org/?v=3.0.1</generator>
		<item>
		<title>market.yandex.ru парсер</title>
		<link>http://tpoxa.com/2008/09/09/marketyandexru-parser/</link>
		<comments>http://tpoxa.com/2008/09/09/marketyandexru-parser/#comments</comments>
		<pubDate>Tue, 09 Sep 2008 18:48:36 +0000</pubDate>
		<dc:creator>admin</dc:creator>
				<category><![CDATA[PHP]]></category>
		<category><![CDATA[XSLT]]></category>

		<guid isPermaLink="false">http://tpoxa.com/?p=28</guid>
		<description><![CDATA[Недавно появилась задача парсить странички. Точнее market.yandex.ru. Собрать все описания в готовом к вставке в базу виде а также доступные изображения к товару. Исторически сложилось что с регулярками у меня хуже чем с XSLT. Все хорошо но маркет не дает валидный код и парсить его напрямую у меня не получилось. На помощь пришел Tidy. &#60;?php [...]]]></description>
			<content:encoded><![CDATA[<p>Недавно появилась задача парсить странички. Точнее market.yandex.ru.</p>
<p>Собрать все описания в готовом к вставке в базу виде а также доступные изображения к товару.</p>
<p>Исторически сложилось что с регулярками у меня хуже чем с XSLT.</p>
<p>Все хорошо но маркет не дает валидный код и парсить его напрямую у меня не получилось.</p>
<p>На помощь пришел Tidy.</p>
<p><span id="more-28"></span></p>

<div class="wp_syntax"><div class="code"><pre class="php" style="font-family:monospace;"><span style="color: #000000; font-weight: bold;">&lt;?php</span>
&nbsp;
<span style="color: #000088;">$YMI</span><span style="color: #339933;">=</span><span style="color: #009900;">&#40;</span><span style="color: #990000;">isset</span><span style="color: #009900;">&#40;</span><span style="color: #000088;">$_GET</span><span style="color: #009900;">&#91;</span><span style="color: #0000ff;">'ymid'</span><span style="color: #009900;">&#93;</span><span style="color: #009900;">&#41;</span><span style="color: #009900;">&#41;</span> ? <span style="color: #990000;">urldecode</span><span style="color: #009900;">&#40;</span><span style="color: #000088;">$_GET</span><span style="color: #009900;">&#91;</span><span style="color: #0000ff;">'ymid'</span><span style="color: #009900;">&#93;</span><span style="color: #009900;">&#41;</span><span style="color: #339933;">:</span> <span style="color: #0000ff;">&quot;&quot;</span><span style="color: #339933;">;</span> <span style="color: #666666; font-style: italic;">// переменная хранит адрес страницы с описанием на Яндексе</span>
<span style="color: #000088;">$content</span> <span style="color: #339933;">=</span> <span style="color: #990000;">file_get_contents</span><span style="color: #009900;">&#40;</span><span style="color: #000088;">$YMI</span><span style="color: #009900;">&#41;</span><span style="color: #339933;">;</span>
&nbsp;
<span style="color: #666666; font-style: italic;">// $content = iconv(&quot;CP1251&quot;, &quot;UTF-8//IGNORE&quot;, $content); // это строка ненужна така Яндекс маркет перешел на UTF-8</span>
&nbsp;
<span style="color: #000088;">$config</span> <span style="color: #339933;">=</span> <span style="color: #990000;">array</span><span style="color: #009900;">&#40;</span>
<span style="color: #0000ff;">&quot;indent&quot;</span>        <span style="color: #339933;">=&gt;</span> <span style="color: #009900; font-weight: bold;">true</span><span style="color: #339933;">,</span>
<span style="color: #0000ff;">&quot;output-xml&quot;</span>    <span style="color: #339933;">=&gt;</span> <span style="color: #009900; font-weight: bold;">true</span><span style="color: #339933;">,</span>
<span style="color: #0000ff;">&quot;wrap&quot;</span>          <span style="color: #339933;">=&gt;</span> <span style="color: #cc66cc;">200</span><span style="color: #009900;">&#41;</span><span style="color: #339933;">;</span>
&nbsp;
<span style="color: #000088;">$tidy</span> <span style="color: #339933;">=</span> <span style="color: #000000; font-weight: bold;">new</span> tidy<span style="color: #339933;">;</span>
<span style="color: #000088;">$tidy</span><span style="color: #339933;">-&gt;</span><span style="color: #004000;">parseString</span><span style="color: #009900;">&#40;</span><span style="color: #000088;">$content</span><span style="color: #339933;">,</span> <span style="color: #000088;">$config</span><span style="color: #339933;">,</span> <span style="color: #0000ff;">&quot;utf8&quot;</span><span style="color: #009900;">&#41;</span><span style="color: #339933;">;</span>
<span style="color: #000088;">$tidy</span><span style="color: #339933;">-&gt;</span><span style="color: #004000;">cleanRepair</span><span style="color: #009900;">&#40;</span><span style="color: #009900;">&#41;</span><span style="color: #339933;">;</span>
&nbsp;
<span style="color: #000088;">$xml</span><span style="color: #339933;">=</span><span style="color: #990000;">simplexml_load_string</span> <span style="color: #009900;">&#40;</span><span style="color: #000088;">$tidy</span><span style="color: #009900;">&#41;</span><span style="color: #339933;">;</span>
&nbsp;
<span style="color: #000088;">$groups</span> <span style="color: #339933;">=</span> <span style="color: #000088;">$xml</span><span style="color: #339933;">-&gt;</span><span style="color: #004000;">xpath</span><span style="color: #009900;">&#40;</span><span style="color: #0000ff;">&quot;//table[@class=&quot;</span>modelProperties<span style="color: #0000ff;">&quot;]//tr/td[@class=&quot;</span>title<span style="color: #0000ff;">&quot;]/b&quot;</span><span style="color: #009900;">&#41;</span><span style="color: #339933;">;</span>
<span style="color: #000088;">$strings</span> <span style="color: #339933;">=</span> <span style="color: #000088;">$xml</span><span style="color: #339933;">-&gt;</span><span style="color: #004000;">xpath</span><span style="color: #009900;">&#40;</span><span style="color: #0000ff;">&quot;//table[@class=&quot;</span>modelProperties<span style="color: #0000ff;">&quot;]//tr/td[@class=&quot;</span>title<span style="color: #0000ff;">&quot;]/b|//table[@class=\'modelProperties\']//td[@class=\'label\']/span/text()|//table[@class=\'modelProperties\']//tr/td[position()=2]/text()&quot;</span><span style="color: #009900;">&#41;</span><span style="color: #339933;">;</span>
&nbsp;
<span style="color: #000088;">$results_groups</span><span style="color: #339933;">=</span><span style="color: #990000;">array</span><span style="color: #009900;">&#40;</span><span style="color: #009900;">&#41;</span><span style="color: #339933;">;</span>
<span style="color: #000088;">$results</span><span style="color: #339933;">=</span><span style="color: #990000;">array</span><span style="color: #009900;">&#40;</span><span style="color: #009900;">&#41;</span><span style="color: #339933;">;</span>
<span style="color: #000088;">$i</span><span style="color: #339933;">=</span><span style="color: #cc66cc;">0</span><span style="color: #339933;">;</span>
<span style="color: #000088;">$GI</span><span style="color: #339933;">=</span><span style="color: #cc66cc;">0</span><span style="color: #339933;">;</span>
&nbsp;
<span style="color: #666666; font-style: italic;">//print_r($strings);</span>
<span style="color: #b1b100;">while</span><span style="color: #009900;">&#40;</span><span style="color: #000088;">$i</span><span style="color: #339933;">&lt;</span>sizeof <span style="color: #009900;">&#40;</span><span style="color: #000088;">$strings</span><span style="color: #009900;">&#41;</span><span style="color: #009900;">&#41;</span>
<span style="color: #009900;">&#123;</span>
<span style="color: #b1b100;">if</span><span style="color: #009900;">&#40;</span><span style="color: #990000;">in_array</span><span style="color: #009900;">&#40;</span><span style="color: #000088;">$strings</span><span style="color: #009900;">&#91;</span><span style="color: #000088;">$i</span><span style="color: #009900;">&#93;</span><span style="color: #339933;">,</span><span style="color: #000088;">$groups</span><span style="color: #009900;">&#41;</span><span style="color: #009900;">&#41;</span>
<span style="color: #009900;">&#123;</span>
<span style="color: #000088;">$GI</span><span style="color: #339933;">=</span><span style="color: #990000;">sizeof</span><span style="color: #009900;">&#40;</span><span style="color: #000088;">$results_groups</span><span style="color: #009900;">&#41;</span><span style="color: #339933;">;</span>
<span style="color: #000088;">$results_groups</span><span style="color: #009900;">&#91;</span><span style="color: #009900;">&#93;</span><span style="color: #339933;">=</span><span style="color: #009900;">&#40;</span>string<span style="color: #009900;">&#41;</span><span style="color: #000088;">$strings</span><span style="color: #009900;">&#91;</span><span style="color: #000088;">$i</span><span style="color: #009900;">&#93;</span><span style="color: #339933;">;</span>
<span style="color: #000088;">$results</span><span style="color: #009900;">&#91;</span><span style="color: #000088;">$GI</span><span style="color: #009900;">&#93;</span><span style="color: #009900;">&#91;</span><span style="color: #009900;">&#93;</span><span style="color: #339933;">=</span><span style="color: #990000;">array</span><span style="color: #009900;">&#40;</span><span style="color: #009900;">&#40;</span>string<span style="color: #009900;">&#41;</span><span style="color: #000088;">$strings</span><span style="color: #009900;">&#91;</span><span style="color: #000088;">$i</span><span style="color: #339933;">+</span><span style="color: #cc66cc;">1</span><span style="color: #009900;">&#93;</span><span style="color: #339933;">,</span><span style="color: #009900;">&#40;</span>string<span style="color: #009900;">&#41;</span><span style="color: #000088;">$strings</span><span style="color: #009900;">&#91;</span><span style="color: #000088;">$i</span><span style="color: #339933;">+</span><span style="color: #cc66cc;">2</span><span style="color: #009900;">&#93;</span><span style="color: #009900;">&#41;</span><span style="color: #339933;">;</span>
<span style="color: #000088;">$i</span><span style="color: #339933;">=</span><span style="color: #000088;">$i</span><span style="color: #339933;">+</span><span style="color: #cc66cc;">3</span><span style="color: #339933;">;</span>
<span style="color: #009900;">&#125;</span>
<span style="color: #b1b100;">else</span>
<span style="color: #009900;">&#123;</span>
<span style="color: #000088;">$results</span><span style="color: #009900;">&#91;</span><span style="color: #000088;">$GI</span><span style="color: #009900;">&#93;</span><span style="color: #009900;">&#91;</span><span style="color: #009900;">&#93;</span><span style="color: #339933;">=</span><span style="color: #990000;">array</span><span style="color: #009900;">&#40;</span><span style="color: #009900;">&#40;</span>string<span style="color: #009900;">&#41;</span><span style="color: #000088;">$strings</span><span style="color: #009900;">&#91;</span><span style="color: #000088;">$i</span><span style="color: #009900;">&#93;</span><span style="color: #339933;">,</span><span style="color: #009900;">&#40;</span>string<span style="color: #009900;">&#41;</span><span style="color: #000088;">$strings</span><span style="color: #009900;">&#91;</span><span style="color: #000088;">$i</span><span style="color: #339933;">+</span><span style="color: #cc66cc;">1</span><span style="color: #009900;">&#93;</span><span style="color: #009900;">&#41;</span><span style="color: #339933;">;</span>
<span style="color: #000088;">$i</span><span style="color: #339933;">=</span><span style="color: #000088;">$i</span><span style="color: #339933;">+</span><span style="color: #cc66cc;">2</span><span style="color: #339933;">;</span>
<span style="color: #009900;">&#125;</span>
<span style="color: #009900;">&#125;</span>
&nbsp;
<span style="color: #000088;">$photos</span> <span style="color: #339933;">=</span> <span style="color: #000088;">$xml</span><span style="color: #339933;">-&gt;</span><span style="color: #004000;">xpath</span><span style="color: #009900;">&#40;</span><span style="color: #0000ff;">&quot;//table[@class=&quot;</span>modelpict<span style="color: #0000ff;">&quot;]//a/@href&quot;</span><span style="color: #009900;">&#41;</span><span style="color: #339933;">;</span>
<span style="color: #b1b100;">if</span><span style="color: #009900;">&#40;</span><span style="color: #990000;">empty</span><span style="color: #009900;">&#40;</span><span style="color: #000088;">$photos</span><span style="color: #009900;">&#41;</span><span style="color: #009900;">&#41;</span>
<span style="color: #009900;">&#123;</span>
<span style="color: #000088;">$photos</span> <span style="color: #339933;">=</span> <span style="color: #000088;">$xml</span><span style="color: #339933;">-&gt;</span><span style="color: #004000;">xpath</span><span style="color: #009900;">&#40;</span><span style="color: #0000ff;">&quot;//table[@class=&quot;</span>modelpict<span style="color: #0000ff;">&quot;]//img/@src&quot;</span><span style="color: #009900;">&#41;</span><span style="color: #339933;">;</span>
<span style="color: #009900;">&#125;</span>
&nbsp;
<span style="color: #b1b100;">foreach</span><span style="color: #009900;">&#40;</span> <span style="color: #000088;">$photos</span> <span style="color: #b1b100;">as</span> <span style="color: #000088;">$photo</span><span style="color: #009900;">&#41;</span>
<span style="color: #009900;">&#123;</span>
<span style="color: #b1b100;">print</span> <span style="color: #0000ff;">&quot;&lt;img src=<span style="color: #000099; font-weight: bold;">\&quot;</span>&quot;</span><span style="color: #339933;">;</span>
<span style="color: #b1b100;">print</span> <span style="color: #009900;">&#40;</span>string<span style="color: #009900;">&#41;</span> <span style="color: #000088;">$photo</span><span style="color: #339933;">;</span>
<span style="color: #b1b100;">print</span> <span style="color: #0000ff;">&quot;<span style="color: #000099; font-weight: bold;">\&quot;</span> /&gt;&quot;</span><span style="color: #339933;">;</span>
<span style="color: #009900;">&#125;</span>
&nbsp;
<span style="color: #b1b100;">print</span> <span style="color: #0000ff;">&quot;&lt;hr /&gt;&lt;table&gt;&lt;caption&gt;Технические характеристики&lt;/caption&gt;&quot;</span><span style="color: #339933;">;</span>
&nbsp;
<span style="color: #b1b100;">foreach</span><span style="color: #009900;">&#40;</span><span style="color: #000088;">$results_groups</span> <span style="color: #b1b100;">as</span> <span style="color: #000088;">$k</span><span style="color: #339933;">=&gt;</span><span style="color: #000088;">$v</span><span style="color: #009900;">&#41;</span>
<span style="color: #009900;">&#123;</span>
<span style="color: #b1b100;">print</span> <span style="color: #0000ff;">&quot;&lt;tr&gt;&lt;td colspan=<span style="color: #000099; font-weight: bold;">\&quot;</span>2<span style="color: #000099; font-weight: bold;">\&quot;</span> align=<span style="color: #000099; font-weight: bold;">\&quot;</span>center<span style="color: #000099; font-weight: bold;">\&quot;</span> style=<span style="color: #000099; font-weight: bold;">\&quot;</span>background-color:#ccc<span style="color: #000099; font-weight: bold;">\&quot;</span>&gt;&quot;</span><span style="color: #339933;">.</span><span style="color: #000088;">$v</span><span style="color: #339933;">.</span><span style="color: #0000ff;">&quot;&lt;/td&gt;&lt;/tr&gt;&quot;</span><span style="color: #339933;">;</span>
&nbsp;
<span style="color: #b1b100;">foreach</span><span style="color: #009900;">&#40;</span><span style="color: #000088;">$results</span><span style="color: #009900;">&#91;</span><span style="color: #000088;">$k</span><span style="color: #009900;">&#93;</span> <span style="color: #b1b100;">as</span> <span style="color: #000088;">$arr</span><span style="color: #009900;">&#41;</span>
<span style="color: #009900;">&#123;</span>
<span style="color: #b1b100;">print</span> <span style="color: #0000ff;">&quot;&lt;tr&gt;&lt;td&gt;&quot;</span><span style="color: #339933;">.</span><span style="color: #000088;">$arr</span><span style="color: #009900;">&#91;</span><span style="color: #cc66cc;">0</span><span style="color: #009900;">&#93;</span><span style="color: #339933;">.</span><span style="color: #0000ff;">&quot;&lt;/td&gt;&lt;td&gt;&quot;</span><span style="color: #339933;">.</span><span style="color: #000088;">$arr</span><span style="color: #009900;">&#91;</span><span style="color: #cc66cc;">1</span><span style="color: #009900;">&#93;</span><span style="color: #339933;">.</span><span style="color: #0000ff;">&quot;&lt;/td&gt;&lt;/tr&gt;&quot;</span><span style="color: #339933;">;</span>
<span style="color: #009900;">&#125;</span>
<span style="color: #009900;">&#125;</span>
<span style="color: #b1b100;">print</span> <span style="color: #0000ff;">&quot;&lt;/table&gt;&quot;</span><span style="color: #339933;">;</span>
&nbsp;
<span style="color: #000000; font-weight: bold;">?&gt;</span></pre></div></div>

]]></content:encoded>
			<wfw:commentRss>http://tpoxa.com/2008/09/09/marketyandexru-parser/feed/</wfw:commentRss>
		<slash:comments>16</slash:comments>
		</item>
	</channel>
</rss>
