# This code is from this web page: https://www.mssqltips.com/sqlservertip/6694/x/
# written by Nai Biao Zhou
# January 2021

import sys, re, pyodbc
from bs4 import BeautifulSoup
from datetime import date, datetime
from urllib.parse import urljoin, urlparse
from urllib.error import URLError, HTTPError
from urllib.request import urlopen, Request


# Create a SQL Server database connection.
def createDBConnection(server, database, user, password):
    '''
    Take inputs server instance name, database name, username and password
    Return a SQL Server database connection
    '''
    connection = pyodbc.connect(Driver="{ODBC Driver 17 for SQL Server}",
                                Server=server,
                                Database=database,
                                UID=user,
                                PWD=password)
    return connection


# Close the database connection
def closeDBConnection(connection):
    '''Take input connection and close the database connection'''
    try:
        connection.close()
    except pyodbc.ProgrammingError:
        pass


# Get table summary from the database
def getSummary(connection):
    '''Take input connection and retrieve summary information about the three tables'''
    cursor = connection.cursor()
    # Execute the SQL statements
    cursor.execute('''
        SELECT [Author],[AuthorSinceYear],[TipContribution] FROM [dbo].[Author]
        SELECT YEAR([ArticleLastUpdate]) AS [YEAR], COUNT(*) AS SubTotal
        FROM [dbo].[Article] GROUP BY YEAR([ArticleLastUpdate])
        SELECT COUNT(*) AS TotalLinks FROM [dbo].[Hyperlink]
        ''')
    print('Fetch rows from first set')
    rows = cursor.fetchall()
    for row in rows:
        print('Author: {0} Since Year: {1} Contributions: {2}'.format(row[0], row[1], row[2]))
    print('Fetch rows from second set')
    if cursor.nextset():
        # fetch rows from next set, discarding first set
        rows = cursor.fetchall()
        for row in rows:
            print('Year: {0} Contributions Subtotal: {1}'.format(row[0], row[1]))
    print('Fetch rows from third set')
    if cursor.nextset():
        # fetch a single value from next set
        count = cursor.fetchval()
        print('Total links: {0}'.format(count))
    # Close the cursor
    cursor.close()

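# Illustrative sketch (added; not part of the original tip): one way to drive getSummary() so
# that the connection is always released, using contextlib.closing from the standard library.
# closing() calls connection.close() on exit, so a separate closeDBConnection() call is not
# needed in this pattern. The helper below is defined for illustration only and is never called.
def printSummaryReport(server, database, user, password):
    '''Open a connection, print the summary report, and close the connection even on error.'''
    from contextlib import closing
    with closing(createDBConnection(server, database, user, password)) as conn:
        getSummary(conn)
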
# Helper function to delete a row in a database table
def deleteHyperlinkIdInArticle(connection, articleId, hyperlinkId):
    '''Take input database connection, articleId and hyperlinkId. Delete the row accordingly'''
    cursor = connection.cursor()
    # Execute the query
    cursor.execute('''DELETE FROM [dbo].[HyperlinkIdInArticle]
                      WHERE [ArticleId] = ? and [HyperlinkId] = ?''', articleId, hyperlinkId)
    print('Number of rows deleted: ', cursor.rowcount)
    connection.commit()
    # Close the cursor
    cursor.close()


# Execute multiple SQL statements
def clearDatabase(connection):
    '''Take input connection and delete all data in the database'''
    cursor = connection.cursor()
    # Execute the SQL statements
    cursor.execute('''DELETE FROM [dbo].[HyperlinkIdInArticle]
                      DELETE FROM [dbo].[Hyperlink]
                      DBCC CHECKIDENT ('dbo.Hyperlink', RESEED, 0)
                      DELETE FROM [dbo].[Article]
                      DBCC CHECKIDENT ('dbo.Article', RESEED, 0)
                      DELETE FROM [dbo].[Author]''')
    connection.commit()
    # Close the cursor
    cursor.close()


# Access an Internet address to check the accessibility
def testHyperlinkAccessibility(url):
    '''Take input URL, and return the status code, response message and the date visited.'''
    statusCode, responseMessage = 'N/A', 'N/A'
    try:
        req = Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0')
        response = urlopen(req)
    except HTTPError as e:
        statusCode = e.code
    except URLError as e:
        responseMessage = str(e.reason)
    except:
        responseMessage = str(sys.exc_info()[1])
    else:
        statusCode = response.getcode()
    finally:
        visitedDate = date.today()
    return statusCode, responseMessage, visitedDate


# Update the accessibility of an Internet address
def updateHyperlinkAccessibility(connection, hyperlinkUrl, statusCode, responseMessage, visitedDate):
    '''
    Take input database connection, URL, status code, response message and date visited.
    Update the row in the database table [dbo].[Hyperlink]
    '''
    cursor = connection.cursor()
    # Execute the query
    cursor.execute('''UPDATE [dbo].[Hyperlink]
                      SET [StatusCode] = ?
                         ,[ResponseMessage] = ?
                         ,[VisitedDate] = ?
                      WHERE HyperlinkUrl = ?''', statusCode, responseMessage, visitedDate, hyperlinkUrl)
    print('Number of rows updated: ', cursor.rowcount)
    connection.commit()
    # Close the cursor
    cursor.close()


# Find all hyperlinks an author referenced
def findAllHyperlinkUrlbyAuthor(connection, author):
    '''Take input author name, return a list of row objects.'''
    cursor = connection.cursor()
    # Execute the query
    cursor.execute('''SELECT DISTINCT h.HyperlinkUrl
                      FROM [Demo].[dbo].[Article] a
                      INNER JOIN [dbo].[HyperlinkIdInArticle] ha ON a.ArticleId = ha.ArticleId
                      INNER JOIN [dbo].[Hyperlink] h ON ha.HyperlinkId = h.HyperlinkId
                      WHERE [Author] = ?''', author)
    # All rows are stored in memory
    rows = cursor.fetchall()
    # Close the cursor
    cursor.close()
    return rows

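# Illustrative sketch (added; not part of the original tip): a pyodbc cursor is itself an
# iterator, so a result set can also be streamed row by row without first loading every row
# into memory with fetchall(). This generator is defined for illustration only; the functions
# below demonstrate the fetchone() and fetchmany() techniques covered in the tip instead.
def iterateHyperlinkUrlbyAuthor(connection, author):
    '''Take input author name, yield the referenced hyperlink URLs one row at a time.'''
    cursor = connection.cursor()
    cursor.execute('''SELECT DISTINCT h.HyperlinkUrl
                      FROM [Demo].[dbo].[Article] a
                      INNER JOIN [dbo].[HyperlinkIdInArticle] ha ON a.ArticleId = ha.ArticleId
                      INNER JOIN [dbo].[Hyperlink] h ON ha.HyperlinkId = h.HyperlinkId
                      WHERE [Author] = ?''', author)
    for row in cursor:              # iterate the cursor directly; rows stream from the driver
        yield row.HyperlinkUrl      # columns can be accessed by name
    cursor.close()
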
# Find all hyperlinks an author referenced and process each row's data
def findAllHyperlinkUrlbyAuthor2(connection, author):
    '''Take input author name, process data in each row, return a list of row objects.'''
    rows = list()
    cursor = connection.cursor()
    # Execute the query
    cursor.execute('''SELECT h.HyperlinkId, h.HyperlinkUrl
                      FROM [Demo].[dbo].[Article] a
                      INNER JOIN [dbo].[HyperlinkIdInArticle] ha ON a.ArticleId = ha.ArticleId
                      INNER JOIN [dbo].[Hyperlink] h ON ha.HyperlinkId = h.HyperlinkId
                      WHERE [Author] = ?''', author)
    while True:
        # Access the query result
        row = cursor.fetchone()
        if not row:
            break  # fetchone() returns None when all remaining rows have been retrieved.
        hyperlinkId = row[0]             # Access by column index (zero-based)
        hyperlinkUrl = row.HyperlinkUrl  # Access by name
        rows.append(row)
        # Process data
        print('HyperlinkId: {0} URL: {1}'.format(hyperlinkId, hyperlinkUrl))
    # Close the cursor
    cursor.close()
    return rows


# Find all hyperlinks an author referenced and process row objects in batches
def findAllHyperlinkUrlbyAuthor3(connection, author):
    '''Take input author name, process row objects in batches, return a list of objects.'''
    rows = list()
    cursor = connection.cursor()
    # Execute the query
    cursor.execute('''SELECT h.HyperlinkId, h.HyperlinkUrl
                      FROM [Demo].[dbo].[Article] a
                      INNER JOIN [dbo].[HyperlinkIdInArticle] ha ON a.ArticleId = ha.ArticleId
                      INNER JOIN [dbo].[Hyperlink] h ON ha.HyperlinkId = h.HyperlinkId
                      WHERE [Author] = ?''', author)
    while True:
        # Access the query result
        records = cursor.fetchmany(5)  # The maximum number of rows in records is 5
        if not records:
            break
        for row in records:
            hyperlinkId = row[0]             # Access by column index (zero-based)
            hyperlinkUrl = row.HyperlinkUrl  # Access by name
            rows.append(row)
            print('HyperlinkId: {0} URL: {1}'.format(hyperlinkId, hyperlinkUrl))
        print('Loaded {0} rows from the database to a batch'.format(len(records)))
    # Close the cursor
    cursor.close()
    return rows


# Save all hyperlink data into the database
def saveArticleHyperLinks(connection, articleUrl, hyperlinks):
    '''
    Take input database connection, article URL and a list of hyperlinks.
    Insert data and their relationships into the database tables
    '''
    cursor = connection.cursor()
    for link in hyperlinks:
        cursor.execute('''
            DECLARE @ArticleUrl nvarchar(1024) = ?
                   ,@LinkTitle nvarchar(1024) = ?
                   ,@HyperlinkUrl nvarchar(1024) = ?
                   ,@ArticleId int = 0
                   ,@HyperlinkId int = 0

            SELECT @ArticleId = [ArticleId] FROM [dbo].[Article]
            WHERE [ArticleUrl] = @ArticleUrl

            SELECT @HyperlinkId = [HyperlinkId] FROM [dbo].[Hyperlink]
            WHERE [HyperlinkUrl] = @HyperlinkUrl AND [LinkTitle] = @LinkTitle

            IF NOT EXISTS (SELECT * FROM HyperlinkIdInArticle
                           WHERE ArticleId = @ArticleId AND HyperlinkId = @HyperlinkId)
            BEGIN
                INSERT INTO [dbo].[Hyperlink] ([LinkTitle],[HyperlinkUrl],[StatusCode]
                    ,[ResponseMessage],[VisitedDate])
                VALUES (@LinkTitle,@HyperlinkUrl,'N/A','N/A','1900-01-01')

                SELECT @HyperlinkId = MAX(HyperlinkId) FROM [dbo].[Hyperlink]

                INSERT INTO [dbo].[HyperlinkIdInArticle]([ArticleId],[HyperlinkId])
                VALUES(@ArticleId,@HyperlinkId)
            END
            ''', articleUrl, link[0], link[1])
    connection.commit()
    cursor.close()


# Web Scraping with Python, Second Edition by Ryan Mitchell (O’Reilly).
# Copyright 2018 Ryan Mitchell, 978-1-491-98557-1.

# Find all internal hyperlinks in an article
def findInternalLinks(bsObj, articleUrl):
    '''
    Takes a BeautifulSoup object and the article URL
    Return a set of internal hyperlinks in the article
    '''
    internalLinks = set()
    siteUrl = urlparse(articleUrl).scheme + '://' + urlparse(articleUrl).netloc
    # Finds all links that begin with a "/" or with the site URL
    links = bsObj.article.findAll('a', href=re.compile('^(/|' + siteUrl + ')'))
    for link in links:
        if link.attrs['href'].startswith('/cdn-cgi/l/email-protection'):
            pass
        elif link.attrs['href'].startswith('/'):
            internalLinks.add((link.get_text(), urljoin(articleUrl, link.attrs['href'])))
        else:
            internalLinks.add((link.get_text(), link.attrs['href']))
    return internalLinks


# Find all external hyperlinks in an article
def findExternalLinks(bsObj, articleUrl):
    '''
    Takes a BeautifulSoup object and the article URL
    Return a set of external hyperlinks in the article
    '''
    externalLinks = set()
    siteDomain = urlparse(articleUrl).netloc
    # Finds all links that start with "http" or "www" that do
    # not contain the current domain
    links = bsObj.article.findAll('a',
        href=re.compile('^(http|https|www)((?!' + siteDomain + ').)*$'))
    for link in links:
        externalLinks.add((link.get_text(), link.attrs['href']))
    return externalLinks


# Find all hyperlinks in an article
def findAllHyperLinks(articleUrl):
    '''Takes an article URL. Return a set of hyperlinks in the article'''
    req = Request(articleUrl)
    req.add_header('User-Agent', 'Mozilla/5.0')
    response = urlopen(req)
    bsObj = BeautifulSoup(response, 'html.parser')
    # Decompose the task into smaller tasks
    internalLinks = findInternalLinks(bsObj, articleUrl)
    externalLinks = findExternalLinks(bsObj, articleUrl)
    return externalLinks.union(internalLinks)


# Save multiple articles into the database
def saveAuthorArticles(connection, articles):
    '''Takes input database connection and a list of tuples containing article data'''
    cursor = connection.cursor()
    cursor.fast_executemany = True
    cursor.executemany('''
        INSERT INTO [dbo].[Article]([Title],[Author],[ArticleUrl],[ArticleLastUpdate])
        VALUES(?,?,?,?)''', articles)
    connection.commit()
    cursor.close()


# Find all author's articles on the profile page
def findAuthorArticles(author, authorProfileUrl):
    '''
    Takes author name and author's profile page URL
    Return a list of articles contributed by a specific author
    '''
    rtnList = list()
    req = Request(authorProfileUrl)
    req.add_header('User-Agent', 'Mozilla/5.0')
    response = urlopen(req)
    bsObj = BeautifulSoup(response, "html.parser")
    tags = bsObj.findAll(lambda tag: tag.name == 'a'
                         and len(tag.attrs) == 1
                         and 'href' in tag.attrs
                         and tag.attrs['href'].startswith('/sqlservertip/'))
    for tag in tags:
        m = re.search(r'[\d]{1,2}/[\d]{1,2}/[\d]{4}', tag.next_sibling)
        articleLastUpdate = datetime.strptime(m.group(0), '%m/%d/%Y')
        title = tag.get_text()
        articleUrl = urljoin(authorProfileUrl, tag.get('href'))
        rtnList.append((title, author, articleUrl, articleLastUpdate,))
    return rtnList


# Create an author row in the database table
def saveAuthor(connection, author, authorProfileUrl, authorSinceYear, tipContribution):
    '''
    Take input database connection and profile data
    Save author profile into the table [dbo].[Author]
    '''
    cursor = connection.cursor()
    params = (author, authorProfileUrl, authorSinceYear, tipContribution)
    cursor.execute("{CALL dbo.sp_add_author (?,?,?,?)}", params)
    connection.commit()
    cursor.close()

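# Note (added; not part of the original tip): dbo.sp_add_author is a stored procedure that is
# assumed to exist in the Demo database; its body is not shown in this script. Judging only
# from the call above, it accepts four input parameters and inserts one row into [dbo].[Author].
# A hypothetical T-SQL sketch consistent with that call might look like the following; the
# @AuthorProfileUrl parameter and column names are assumptions, not taken from the article:
#
#   CREATE PROCEDURE dbo.sp_add_author
#       @Author nvarchar(128), @AuthorProfileUrl nvarchar(1024),
#       @AuthorSinceYear int, @TipContribution int
#   AS
#   BEGIN
#       INSERT INTO [dbo].[Author]([Author],[AuthorProfileUrl],[AuthorSinceYear],[TipContribution])
#       VALUES(@Author, @AuthorProfileUrl, @AuthorSinceYear, @TipContribution);
#   END
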
# Call a stored procedure with output parameters and return values
def saveAuthor2(connection, author, authorProfileUrl, authorSinceYear, tipContribution):
    '''
    Take input database connection and author profile data
    Save author profile data into the table [dbo].[Author]
    '''
    cursor = connection.cursor()
    params = (author, authorProfileUrl, authorSinceYear, tipContribution)
    cursor.execute('''SET NOCOUNT ON;
                      DECLARE @retValue int, @rowCountVar int;
                      EXEC @retValue = dbo.sp_add_author2 ?,?,?,?, @rowCountVar OUTPUT;
                      SELECT @retValue, @rowCountVar;''', params)
    row = cursor.fetchone()
    connection.commit()
    # For illustrative purposes only
    print('Return Value: {0}, Output Value: {1}'.format(row[0], row[1]))
    cursor.close()


# Search author's profile
def findAuthor(author, authorsPageUrl):
    '''
    Take input author name and authors' page URL
    Return the individual profile page URL, the participation year, and the number of tips
    '''
    req = Request(authorsPageUrl)
    req.add_header('User-Agent', 'Mozilla/5.0')
    response = urlopen(req)
    bsObj = BeautifulSoup(response, 'html.parser')
    authorLink = bsObj.find('a', href=True, text=author)
    authorProfileUrl = urljoin(authorsPageUrl, authorLink.get('href'))
    authorSinceYear = authorLink.parent.parent.strong.get_text()
    tipContribution = authorLink.parent.parent.strong.find_next('strong').get_text()
    return authorProfileUrl, authorSinceYear, tipContribution

# The entry point of the program
if __name__ == "__main__":
    # 1, Initialize variables
    # Specify server name, database, user and password
    server = '192.168.2.17'
    database = 'Demo'
    user = 'user'
    password = 'password'
    # Specify the author name for data collection
    author = 'Nai Biao Zhou'
    # Specify the authors page url
    authorsPageUrl = 'https://www.mssqltips.com/sql-server-mssqltips-authors/'

    # 2, Remove all testing data in the database
    connection = createDBConnection(server, database, user, password)
    clearDatabase(connection)
    closeDBConnection(connection)

    # 3, Extract the author's data and save it into the database
    authorProfileUrl, authorSinceYear, tipContribution = findAuthor(author, authorsPageUrl)
    connection = createDBConnection(server, database, user, password)
    saveAuthor(connection, author, authorProfileUrl, authorSinceYear, tipContribution)
    closeDBConnection(connection)

    # 4, Extract the list of the author's articles and save them into the database
    authorArticles = findAuthorArticles(author, authorProfileUrl)
    connection = createDBConnection(server, database, user, password)
    saveAuthorArticles(connection, authorArticles)
    closeDBConnection(connection)

    # 5, Extract all hyperlinks in the author's articles and save them into the database
    for article in authorArticles:
        hyperLinks = findAllHyperLinks(article[2])
        connection = createDBConnection(server, database, user, password)
        saveArticleHyperLinks(connection, article[2], hyperLinks)
        closeDBConnection(connection)

    # 6, Read all the hyperlinks in the author's articles from the database
    connection = createDBConnection(server, database, user, password)
    rows = findAllHyperlinkUrlbyAuthor(connection, author)
    closeDBConnection(connection)

    # 7, Update the accessibility in the database
    for row in rows:
        hyperlinkUrl = row.HyperlinkUrl  # Access by name
        statusCode, responseMessage, visitedDate = testHyperlinkAccessibility(hyperlinkUrl)
        print('checking: ', hyperlinkUrl)
        connection = createDBConnection(server, database, user, password)
        updateHyperlinkAccessibility(connection, hyperlinkUrl, statusCode,
                                     responseMessage, visitedDate)
        closeDBConnection(connection)

    # 8, Output summary report
    connection = createDBConnection(server, database, user, password)
    getSummary(connection)
    closeDBConnection(connection)
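
# Illustrative note (added; not part of the original tip): steps 5 to 7 above open and close a
# new database connection for every article or hyperlink. pyodbc also allows reusing a single
# connection for an entire loop, which avoids the repeated connection overhead, for example:
#
#   connection = createDBConnection(server, database, user, password)
#   for row in rows:
#       statusCode, responseMessage, visitedDate = testHyperlinkAccessibility(row.HyperlinkUrl)
#       updateHyperlinkAccessibility(connection, row.HyperlinkUrl, statusCode,
#                                    responseMessage, visitedDate)
#   closeDBConnection(connection)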