diff options
author | Eduardo Pedroni <e.pedroni91@gmail.com> | 2015-06-01 17:17:30 +0200 |
---|---|---|
committer | Eduardo Pedroni <e.pedroni91@gmail.com> | 2015-06-01 17:17:30 +0200 |
commit | c072ed0ac345a7e421cb0612048d4cd534bf6e73 (patch) | |
tree | 96761c4f1ab9c4f8cf159b1cfe69477c4e745fe6 | |
parent | 5fd864718f4cd9e1e67835063a1efc24dd85e0c4 (diff) |
Forget scraping; there are already APIs for querying card data. I'm moving on to the collection management stage. The scraping work will be set aside on a different branch for future reference, if needed.
-rwxr-xr-x | cardbase.py | 69 | ||||
-rw-r--r-- | cardparser.py | 55 |
2 files changed, 87 insertions, 37 deletions
diff --git a/cardbase.py b/cardbase.py index c4bc17e..bb831b0 100755 --- a/cardbase.py +++ b/cardbase.py @@ -1,34 +1,73 @@ #!/usr/bin/env python3 +import cardparser import requests import sys from lxml import html +import yaml +import re -database = "" +def exit(msg=""): + if msg != "": + print(msg) + + #database.close() -def parseInput(raw): - if raw == "help": - print("Need help? too bad") - elif raw == " + sys.exit() def main(args): try: - database = open(args[1], "w") - except: + dataFile = args[1] + #database = open(args[1], "w") + except Exception as e: print("Please provide a valid database file as the first argument.") - sys.exit(1) + exit(e) print("Welcome to cardbase") print("For a list of commands, type \"help\"") - exit = False - while(not exit): - try: - raw = input("> ") - parseInput(raw) - except: - exit = True + globalSet = "" + + while(True): + #try: + raw = input("(" + globalSet + ")> ").strip() + args = re.split("[\t ]+", raw) + + if args[0] == "help": + print("Need help? 
try google.com") + + elif args[0] == "exit": + exit() + + elif args[0] == "set": + if args[1] and args[1] != "": + globalSet = re.sub("[^0-9A-Za-z]", "", args[1]) + else: + globalSet = "" + + elif args[0] == "save": + if args[1] and args[1] != "": + dataFile = args[1] + + elif args[0]: + if globalSet != "": + cardNo = re.sub("[^0-9A-Za-z]", "", args[0]) + print("Fetching card " + cardNo) + try: + pass + newCard = cardparser.fetchCard(globalSet, args[0]) + print(newCard.title) + except cardparser.CardNotFoundException as e: + print("Card not found.") + else: + print("Select a set with the \"set\" command before adding cards.") + + else: + print("Invalid input") + + #except Exception as e: + # exit(e) # The entry point diff --git a/cardparser.py b/cardparser.py index 759ad47..4a9de71 100644 --- a/cardparser.py +++ b/cardparser.py @@ -1,4 +1,5 @@ import re +import requests from lxml import html class Card(): @@ -20,29 +21,28 @@ class Card(): self.toughness = "" self.loyalty = "" +class CardNotFoundException(Exception): + pass + # fetching functions def makeUrl(cardSet, cardNo): return "http://magiccards.info/" + cardSet + "/en/" + cardNo + ".html" def remoteFetch(url): - return requests.get(url).text + return html.fromstring(requests.get(url).text) -def fetchCard(cardSet, cardNo): - # build object - card = Card() - card.edition = cardSet - card.scan = "http://magiccards.info/scans/en/" + cardSet + "/" + cardNo + ".jpg" - card.number = cardNo - - #setRemoteData(card, makeUrl(cardSet, cardNo)) - - return card +def isValid(page): + notFound = page.xpath("/html/body/h1/text()") + response404 = page.xpath("/html/body/h1/text()") + if notFound: + raise CardNotFoundException() -def setRemoteData(card, url, fetchCallback=remoteFetch): +def setRemoteData(card, url): # fetch card from upstream - page = html.fromstring(fetchCallback(url)) + page = html.fromstring(requests.get(url).text) + isValid(page) - # set remote data + # parse and set data card.title = getTitle(page) 
card.cost = getCost(page) card.convertedCost = getConvertedCost(page) @@ -57,6 +57,18 @@ def setRemoteData(card, url, fetchCallback=remoteFetch): card.toughness = getToughness(page) card.loyalty = getLoyalty(page) + +def fetchCard(cardSet, cardNo): + # build object + card = Card() + card.edition = cardSet + card.scan = "http://magiccards.info/scans/en/" + cardSet + "/" + cardNo + ".jpg" + card.number = cardNo + + setRemoteData(card, makeUrl(cardSet, cardNo)) + + return card + # parsing functions def getTitle(page): return page.xpath("/html/body/table[3]/tr/td[2]/span/a/text()")[0] @@ -65,7 +77,9 @@ def extractSubTitle(page): line = page.xpath("/html/body/table[3]/tr/td[2]/p[1]/text()")[0] line = re.sub("\n", "", line) line = re.sub(" +", " ", line) - return line.strip() + line = line.strip() + + return line def getCost(page): cost = extractSubTitle(page) @@ -88,18 +102,13 @@ def getConvertedCost(page): def getColour(page): colours = extractSubTitle(page) colours = re.search(" [0-9X]*([WGRBU\{\}/]*) ", colours) + if colours: colours = colours.group(1) - - colours = re.sub("U+", "U", colours) - colours = re.sub("W+", "W", colours) - colours = re.sub("R+", "R", colours) - colours = re.sub("B+", "B", colours) - colours = re.sub("G+", "G", colours) colours = re.sub("[\{\}/]*", "", colours) + colours = re.sub(r"(.)\1+", r"\1", colours) return colours - else: return "" @@ -121,10 +130,12 @@ def getSubType(page): def getArtist(page): artist = page.xpath("/html/body/table[3]/tr/td[2]/p[4]/text()")[0] artist = re.sub("Illus. ", "", artist) + return artist def getText(page): text = page.xpath("/html/body/table[3]/tr/td[2]/p[2]/b/text()") + return text def getFlavour(page): |