Extracting Cricket Player's statistics using Python and BeautifulSoup

While trying out BeautifulSoup, an excellent Python library for pulling data out of HTML and XML files, I wrote a simple Python script to test BeautifulSoup's might! The script below extracts a cricket player's full details, along with batting and bowling stats, from a famous cricket website so that they can be used conveniently in Python. Check it out:

from bs4 import BeautifulSoup
import requests, re

class CricketPlayerInfo():
	"""Scrape player profiles and statistics from ESPNcricinfo player pages.

	The CSS class names and cell positions used here are the ones this
	script was written against; if the site layout differs, the affected
	fields fall back to ``MISSING`` (or stay empty) instead of raising.
	"""

	# Placeholder used for any value that could not be found on the page.
	MISSING = 'NA'

	# Names given to the value cells of a stats row, in order, after the
	# row's leading label cell.
	BATTING_COLUMNS = ('matches', 'innings', 'no', 'runs', 'hs', 'ave', 'bf', 'sr', '100', '50', '4s', '6s', 'Ct', 'St')
	BOWLING_COLUMNS = ('matches', 'innings', 'balls', 'runs', 'wickets', 'bbi', 'bbm', 'ave', 'economy', 'sr', '4w', '5w', '10')

	# Profile paragraphs are read positionally.
	# NOTE(review): assumes every player page lists these paragraphs in this
	# order -- confirm against pages that omit a field.
	PROFILE_FIELDS = ('full_name', 'born', 'age', 'major_teams', 'playing_role', 'batting_style', 'bowling_style', 'relation')

	# (result key, index of the <td> inside the career table).
	# NOTE(review): the indices assume Test, ODI and T20I rows are all
	# present -- confirm for players who did not play every format.
	CAREER_CELLS = (
		('test_debut', 1),
		('last_test', 3),
		('odi_debut', 7),
		('last_odi', 9),
		('t20i_debut', 13),
		('last_t20i', 15),
	)

	# Raw strings so the regex escapes are not parsed as string escapes.
	PLAYER_LINK = re.compile(r'(/content/player/)[0-9]+\.html$')
	COUNTRY_LINK = re.compile(r'country\.html\?country=[0-9]+$')

	def __init__(self, timeout=30):
		"""Create a scraper.

		timeout -- seconds passed to ``requests.get`` for each page, so a
		stalled connection cannot hang the crawl forever.
		"""
		self.main_site = 'http://www.espncricinfo.com'
		self.timeout = timeout

	def _fetch_soup(self, url):
		"""Download *url* and return its content parsed with lxml."""
		response = requests.get(url, timeout=self.timeout)
		return BeautifulSoup(response.content, 'lxml')

	@classmethod
	def _text_of(cls, node):
		"""Return the stripped text of *node*, or ``MISSING`` if it is None."""
		if node is None:
			return cls.MISSING
		return node.get_text().strip()

	@staticmethod
	def _strip_scorecard(text):
		"""Remove a trailing 'scorecard' word and surrounding whitespace.

		``str.strip(' scorecard')`` is not used because it strips a *set of
		characters* from both ends and would also eat into the match text
		itself (e.g. '... at Leeds' -> '... at L').
		"""
		text = text.strip()
		suffix = 'scorecard'
		if text.endswith(suffix):
			text = text[:-len(suffix)].rstrip()
		return text

	@staticmethod
	def _data_rows(table):
		"""Return the cell texts of every ``tr.data1`` row in *table*."""
		return [
			[cell.get_text() for cell in row.findAll('td')]
			for row in table.findAll('tr', {'class': 'data1'})
		]

	@staticmethod
	def _parse_stat_rows(rows, columns):
		"""Turn rows of cell texts into ``{row_label: {column: value}}``.

		rows    -- iterable of lists of strings; the first string of each
		           row is its label (e.g. 'ODIs'), the rest are values.
		columns -- column names paired positionally with the values.

		Values are paired by position, so placeholders such as '-' keep
		their column instead of being mistaken for a new row label. Extra
		cells beyond *columns* are ignored; empty rows are skipped.
		"""
		stats = {}
		for cells in rows:
			if not cells:
				continue
			label = cells[0].strip().lower().replace(' ', '-')
			values = (cell.strip() for cell in cells[1:])
			stats[label] = dict(zip(columns, values))
		return stats

	@classmethod
	def _parse_career(cls, cells):
		"""Pick debut/last-match texts out of the career table's cell texts.

		Only the entries whose index exists in *cells* are returned.
		"""
		return {
			field: cls._strip_scorecard(cells[index])
			for field, index in cls.CAREER_CELLS
			if index < len(cells)
		}

	@classmethod
	def _summarize(cls, details):
		"""Format the one-line summary returned by ``extractPlayer``."""
		country = details.get('country', cls.MISSING)
		name = details.get('name', cls.MISSING)
		avg = details.get('batting', {}).get('odis', {}).get('ave', cls.MISSING)
		return country + " --- " + name + ' -- Batting AVG: ' + avg

	def extractPlayerDetails(self, url):
		"""Scrape everything this script knows how to read from a player page.

		Returns a dict with the keys 'name', 'country', 'profile',
		'batting', 'bowling', 'career', 'description' and 'news'. Parts of
		the page that cannot be found keep their defaults (``MISSING``,
		empty dict or empty list) rather than raising.
		"""
		details = {
			'name': self.MISSING,
			'country': self.MISSING,
			'profile': dict.fromkeys(self.PROFILE_FIELDS, self.MISSING),
			'batting': {},
			'bowling': {},
			'career': {},
			'description': self.MISSING,
			'news': [],
		}

		soup = self._fetch_soup(url)
		panel = soup.find('div', {'class': 'pnl490M'})
		if panel is None:
			# Not a player page in the expected layout: nothing to read.
			return details

		# Basic information
		details['name'] = self._text_of(panel.find('h1'))
		details['country'] = self._text_of(panel.find('h3', {'class': 'PlayersSearchLink'}))
		paragraphs = panel.findAll('p', {'class': 'ciPlayerinformationtxt'})
		for field, paragraph in zip(self.PROFILE_FIELDS, paragraphs):
			details['profile'][field] = self._text_of(paragraph.find('span'))

		# Stats tables appear in the order batting, bowling, career.
		tables = panel.findAll('table', {'class': 'engineTable'})
		if len(tables) > 0:
			details['batting'] = self._parse_stat_rows(self._data_rows(tables[0]), self.BATTING_COLUMNS)
		if len(tables) > 1:
			details['bowling'] = self._parse_stat_rows(self._data_rows(tables[1]), self.BOWLING_COLUMNS)
		if len(tables) > 2:
			career_cells = [cell.get_text() for cell in tables[2].findAll('td')]
			details['career'] = self._parse_career(career_cells)

		# Player description
		description = panel.find('p', {'class': 'ciPlayerprofiletext1'})
		if description is not None:
			details['description'] = description.get_text()

		# Latest news on player
		headline = panel.find('div', {'class': 'headline'})
		news_list = headline.find('ul') if headline is not None else None
		if news_list is not None:
			details['news'] = [
				self.main_site + anchor['href']
				for anchor in news_list.findAll('a', href=True)
			]

		return details

	def extractPlayer(self, url):
		"""Return a one-line summary of the player page at *url*.

		The result has the form
		'<country> --- <name> -- Batting AVG: <ODI batting average>',
		with 'NA' standing in for anything that could not be found.
		"""
		return self._summarize(self.extractPlayerDetails(url))

	def getPlayers(self, url):
		"""Print the summary of every distinct player linked from *url*."""
		soup = self._fetch_soup(url)

		# A set keeps the "already visited" check O(1) per link.
		seen = set()
		for anchor in soup.findAll('a', {'href': self.PLAYER_LINK}):
			link = self.main_site + anchor['href']
			if link in seen:
				continue
			seen.add(link)
			print(self.extractPlayer(link))

	def getPlayersByCountry(self):
		"""Walk the player index and print every player, country by country."""
		soup = self._fetch_soup(self.main_site + '/ci/content/player/index.html')

		for country in soup.findAll('a', {'href': self.COUNTRY_LINK}):
			link = self.main_site + country['href']
			print('-------')
			print(link)
			self.getPlayers(link)

# Run the full crawl only when this file is executed as a script, so that
# importing it to reuse CricketPlayerInfo does not start hitting the site.
# The crawl prints each country's index link followed by one summary line
# per player found on that index.
if __name__ == '__main__':
	get_info = CricketPlayerInfo()
	get_info.getPlayersByCountry()

Have fun :-)

New Comment