Python HTMLParser

py_vk

How to spent two days if you know nothing about Python:

  • need parse HTML page code, where VK id and username of every person who shared post stores

 

with open('test.html', 'r', encoding='utf-8') as content_file:
    read_data = content_file.read()

from html.parser import HTMLParser
import re

class MyHTMLParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        vk_id = str(attrs)
        for line in vk_id:
            vk = re.findall('/\S+$', vk_id)
        vk_fnd = str(vk)
        if re.search('/\w+\'\)\]', vk_fnd):
            global vk_read
            vk_read = vk_fnd
            for ch in ['/', ')', '[', ']', '"', "'"]:
                if ch in vk_read:
                    vk_read = vk_read.replace(ch, "")
    def handle_data(self, data):
        global vk_name
        vk_name = str(data)
        assert isinstance(data, object)
        for line in vk_name:
            if re.match('\S+\s+\S+$', vk_name):
                print("@{0} - {1}".format(vk_read, vk_name))
                break


parser = MyHTMLParser()
parser.feed(read_data)

Now I know more.

First bug and first fix:

UnicodeEncodeError: ‘charmap’ codec can’t encode character ‘\u0406’ in position 15: character maps to <undefined>

with open('test.html', 'r', encoding='utf-8') as content_file:
    read_data = content_file.read()
'''
1. Replased error with charset by replase character
'''


from html.parser import HTMLParser
import re, sys

class MyHTMLParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        vk_id = str(attrs)
        for line in vk_id:
            vk = re.findall('/\S+$', vk_id)
        vk_fnd = str(vk)
        if re.search('/\w+\'\)\]', vk_fnd):
            global vk_read
            vk_read = vk_fnd
            for ch in ['/', ')', '[', ']', '"', "'"]:
                if ch in vk_read:
                    vk_read = vk_read.replace(ch, "")
    def handle_data(self, data):
        global vk_name
        vk_name = str(data)
        for line in vk_name:
            if re.match('\S+\s+\S+$', vk_name):
                for ch in ['\u0456', '\u0406']:
                    if ch in vk_name:
                        vk_name = vk_name.replace(ch, "?")
                print("@{0} - {1}".format(vk_read, vk_name))
                break


parser = MyHTMLParser()
parser.feed(read_data)

UPD2: I have found another one solution for this code, just re-thinking it’s logic. That way I use in vk = re.findall(‘/\S+$’, vk_id) can be simplified. I use vk_id = str(attrs) to convert list to string and then found there something matches regex.

! BUT I should just address to value from list !

example:

attrs =&nbsp;[('href', '/id168265578'), ('class', 'like_img_cont')]
print "attrs[1]: ", attrs[1]

Will upgrade this section later.

 

 

About trianglesis

Александр Брюндтзвельт - гений, филантроп, 100 гривен в кармане. Этот блог - "сток" моих мыслей и заметок. Достаточно одного взгляда на него, чтобы понять, что такой же бардак творится у меня в голове. Если вам этот бардак интересен - милости прошу.
Bookmark the permalink.