# This code is from this web page: https://www.mssqltips.com/sqlservertip/6694/x/
# written by Nai Biao Zhou
# January 2021

import sys, re, pyodbc
from bs4 import BeautifulSoup
from datetime import date, datetime
from urllib.parse import urljoin, urlparse
from urllib.error import URLError, HTTPError
from urllib.request import urlopen, Request


# Create a SQL Server database connection.
def createDBConnection(server, database, user, password):
    '''
    Take inputs server instance name, database name, username and password
    Return a SQL Server database connection
    '''
    connection = pyodbc.connect(Driver="{ODBC Driver 17 for SQL Server}",
                                Server=server,
                                Database=database,
                                UID=user,
                                PWD=password)
    return connection


# Close the database connection
def closeDBConnection(connection):
    '''Take input connection and close the database connection'''
    try:
        connection.close()
    except pyodbc.ProgrammingError:
        pass


# Get table summary from the database
def getSummary(connection):
    '''Take input connection and retrieve summary information about the three tables'''
    cursor = connection.cursor()
    # Execute the SQL statements
    cursor.execute('''
        SELECT [Author],[AuthorSinceYear],[TipContribution] FROM [dbo].[Author]
        SELECT YEAR([ArticleLastUpdate]) AS [YEAR], COUNT(*) AS SubTotal
        FROM [dbo].[Article] GROUP BY YEAR([ArticleLastUpdate])
        SELECT COUNT(*) AS TotalLinks FROM [dbo].[Hyperlink]
        ''')
    print('Fetch rows from first set')
    rows = cursor.fetchall()
    for row in rows:
        print('Author: {0} Since Year: {1} Contributions: {2}'.format(row[0], row[1], row[2]))
    print('Fetch rows from second set')
    if cursor.nextset():
        # fetch rows from next set, discarding first set
        rows = cursor.fetchall()
        for row in rows:
            print('Year: {0} Contributions Subtotal: {1}'.format(row[0], row[1]))
    print('Fetch rows from third set')
    if cursor.nextset():
        # fetch a single value from next set
        count = cursor.fetchval()
        print('Total links: {0}'.format(count))
    # Close the cursor
    cursor.close()

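# Illustrative sketch (added; not part of the original tip): one way to drive getSummary() so
# that the connection is always released, using contextlib.closing from the standard library.
# closing() calls connection.close() on exit, so a separate closeDBConnection() call is not
# needed in this pattern. The helper below is defined for illustration only and is never called.
def printSummaryReport(server, database, user, password):
    '''Open a connection, print the summary report, and close the connection even on error.'''
    from contextlib import closing
    with closing(createDBConnection(server, database, user, password)) as conn:
        getSummary(conn)
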
# Helper function to delete a row in a database table
def deleteHyperlinkIdInArticle(connection, articleId, hyperlinkId):
    '''Take input database connection, articleId and hyperlinkId. Delete the row accordingly'''
    cursor = connection.cursor()
    # Execute the query
    cursor.execute('''DELETE FROM [dbo].[HyperlinkIdInArticle]
                      WHERE [ArticleId] = ? and [HyperlinkId] = ?''', articleId, hyperlinkId)
    print('Number of rows deleted: ', cursor.rowcount)
    connection.commit()
    # Close the cursor
    cursor.close()


# Execute multiple SQL statements
def clearDatabase(connection):
    '''Take input connection and delete all data in the database'''
    cursor = connection.cursor()
    # Execute the SQL statements
    cursor.execute('''DELETE FROM [dbo].[HyperlinkIdInArticle]
                      DELETE FROM [dbo].[Hyperlink]
                      DBCC CHECKIDENT ('dbo.Hyperlink', RESEED, 0)
                      DELETE FROM [dbo].[Article]
                      DBCC CHECKIDENT ('dbo.Article', RESEED, 0)
                      DELETE FROM [dbo].[Author]''')
    connection.commit()
    # Close the cursor
    cursor.close()


# Access an Internet address to check the accessibility
def testHyperlinkAccessibility(url):
    '''Take input URL, and return the status code, response message and the date visited.'''
    statusCode, responseMessage = 'N/A', 'N/A'
    try:
        req = Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0')
        response = urlopen(req)
    except HTTPError as e:
        statusCode = e.code
    except URLError as e:
        responseMessage = str(e.reason)
    except:
        responseMessage = str(sys.exc_info()[1])
    else:
        statusCode = response.getcode()
    finally:
        visitedDate = date.today()
    return statusCode, responseMessage, visitedDate


# Update the accessibility of an Internet address
def updateHyperlinkAccessibility(connection, hyperlinkUrl, statusCode, responseMessage, visitedDate):
    '''
    Take input database connection, URL, status code, response message and date visited.
    Update the row in the database table [dbo].[Hyperlink]
    '''
    cursor = connection.cursor()
    # Execute the query
    cursor.execute('''UPDATE [dbo].[Hyperlink]
                      SET [StatusCode] = ?
                         ,[ResponseMessage] = ?
                         ,[VisitedDate] = ?
                      WHERE HyperlinkUrl = ?''', statusCode, responseMessage, visitedDate, hyperlinkUrl)
    print('Number of rows updated: ', cursor.rowcount)
    connection.commit()
    # Close the cursor
    cursor.close()


# Find all hyperlinks an author referenced
def findAllHyperlinkUrlbyAuthor(connection, author):
    '''Take input author name, return a list of row objects.'''
    cursor = connection.cursor()
    # Execute the query
    cursor.execute('''SELECT DISTINCT h.HyperlinkUrl
                      FROM [Demo].[dbo].[Article] a
                      INNER JOIN [dbo].[HyperlinkIdInArticle] ha ON a.ArticleId = ha.ArticleId
                      INNER JOIN [dbo].[Hyperlink] h ON ha.HyperlinkId = h.HyperlinkId
                      WHERE [Author] = ?''', author)
    # All rows are stored in memory
    rows = cursor.fetchall()
    # Close the cursor
    cursor.close()
    return rows

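# Illustrative sketch (added; not part of the original tip): a pyodbc cursor is itself an
# iterator, so a result set can also be streamed row by row without first loading every row
# into memory with fetchall(). This generator is defined for illustration only; the functions
# below demonstrate the fetchone() and fetchmany() techniques covered in the tip instead.
def iterateHyperlinkUrlbyAuthor(connection, author):
    '''Take input author name, yield the referenced hyperlink URLs one row at a time.'''
    cursor = connection.cursor()
    cursor.execute('''SELECT DISTINCT h.HyperlinkUrl
                      FROM [Demo].[dbo].[Article] a
                      INNER JOIN [dbo].[HyperlinkIdInArticle] ha ON a.ArticleId = ha.ArticleId
                      INNER JOIN [dbo].[Hyperlink] h ON ha.HyperlinkId = h.HyperlinkId
                      WHERE [Author] = ?''', author)
    for row in cursor:              # iterate the cursor directly; rows stream from the driver
        yield row.HyperlinkUrl      # columns can be accessed by name
    cursor.close()
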
# Find all hyperlinks an author referenced and process each row's data
def findAllHyperlinkUrlbyAuthor2(connection, author):
    '''Take input author name, process data in each row, return a list of row objects.'''
    rows = list()
    cursor = connection.cursor()
    # Execute the query
    cursor.execute('''SELECT h.HyperlinkId, h.HyperlinkUrl
                      FROM [Demo].[dbo].[Article] a
                      INNER JOIN [dbo].[HyperlinkIdInArticle] ha ON a.ArticleId = ha.ArticleId
                      INNER JOIN [dbo].[Hyperlink] h ON ha.HyperlinkId = h.HyperlinkId
                      WHERE [Author] = ?''', author)
    while True:
        # Access the query result
        row = cursor.fetchone()
        if not row:
            break  # fetchone() returns None when all remaining rows have been retrieved.
        hyperlinkId = row[0]             # Access by column index (zero-based)
        hyperlinkUrl = row.HyperlinkUrl  # Access by name
        rows.append(row)
        # Process data
        print('HyperlinkId: {0} URL: {1}'.format(hyperlinkId, hyperlinkUrl))
    # Close the cursor
    cursor.close()
    return rows


# Find all hyperlinks an author referenced and process row objects in batches
def findAllHyperlinkUrlbyAuthor3(connection, author):
    '''Take input author name, process row objects in batches, return a list of objects.'''
    rows = list()
    cursor = connection.cursor()
    # Execute the query
    cursor.execute('''SELECT h.HyperlinkId, h.HyperlinkUrl
                      FROM [Demo].[dbo].[Article] a
                      INNER JOIN [dbo].[HyperlinkIdInArticle] ha ON a.ArticleId = ha.ArticleId
                      INNER JOIN [dbo].[Hyperlink] h ON ha.HyperlinkId = h.HyperlinkId
                      WHERE [Author] = ?''', author)
    while True:
        # Access the query result
        records = cursor.fetchmany(5)  # The maximum number of rows in records is 5
        if not records:
            break
        for row in records:
            hyperlinkId = row[0]             # Access by column index (zero-based)
            hyperlinkUrl = row.HyperlinkUrl  # Access by name
            rows.append(row)
            print('HyperlinkId: {0} URL: {1}'.format(hyperlinkId, hyperlinkUrl))
        print('Loaded {0} rows from the database to a batch'.format(len(records)))
    # Close the cursor
    cursor.close()
    return rows


# Save all hyperlink data into the database
def saveArticleHyperLinks(connection, articleUrl, hyperlinks):
    '''
    Take input database connection, article URL and a list of hyperlinks.
    Insert data and their relationships into the database tables
    '''
    cursor = connection.cursor()
    for link in hyperlinks:
        cursor.execute('''
            DECLARE @ArticleUrl nvarchar(1024) = ?
                   ,@LinkTitle nvarchar(1024) = ?
                   ,@HyperlinkUrl nvarchar(1024) = ?
                   ,@ArticleId int = 0
                   ,@HyperlinkId int = 0

            SELECT @ArticleId = [ArticleId] FROM [dbo].[Article]
            WHERE [ArticleUrl] = @ArticleUrl

            SELECT @HyperlinkId = [HyperlinkId] FROM [dbo].[Hyperlink]
            WHERE [HyperlinkUrl] = @HyperlinkUrl AND [LinkTitle] = @LinkTitle

            IF NOT EXISTS (SELECT * FROM HyperlinkIdInArticle
                           WHERE ArticleId = @ArticleId AND HyperlinkId = @HyperlinkId)
            BEGIN
                INSERT INTO [dbo].[Hyperlink] ([LinkTitle],[HyperlinkUrl],[StatusCode]
                    ,[ResponseMessage],[VisitedDate])
                VALUES (@LinkTitle,@HyperlinkUrl,'N/A','N/A','1900-01-01')

                SELECT @HyperlinkId = MAX(HyperlinkId) FROM [dbo].[Hyperlink]

                INSERT INTO [dbo].[HyperlinkIdInArticle]([ArticleId],[HyperlinkId])
                VALUES(@ArticleId,@HyperlinkId)
            END
            ''', articleUrl, link[0], link[1])
    connection.commit()
    cursor.close()


# Web Scraping with Python, Second Edition by Ryan Mitchell (O’Reilly).
# Copyright 2018 Ryan Mitchell, 978-1-491-98557-1.

# Find all internal hyperlinks in an article
def findInternalLinks(bsObj, articleUrl):
    '''
    Takes a BeautifulSoup object and the article URL
    Return a set of internal hyperlinks in the article
    '''
    internalLinks = set()
    siteUrl = urlparse(articleUrl).scheme + '://' + urlparse(articleUrl).netloc
    # Finds all links that begin with a "/" or with the site URL
    links = bsObj.article.findAll('a', href=re.compile('^(/|' + siteUrl + ')'))
    for link in links:
        if link.attrs['href'].startswith('/cdn-cgi/l/email-protection'):
            pass
        elif link.attrs['href'].startswith('/'):
            internalLinks.add((link.get_text(), urljoin(articleUrl, link.attrs['href'])))
        else:
            internalLinks.add((link.get_text(), link.attrs['href']))
    return internalLinks


# Find all external hyperlinks in an article
def findExternalLinks(bsObj, articleUrl):
    '''
    Takes a BeautifulSoup object and the article URL
    Return a set of external hyperlinks in the article
    '''
    externalLinks = set()
    siteDomain = urlparse(articleUrl).netloc
    # Finds all links that start with "http" or "www" that do
    # not contain the current domain
    links = bsObj.article.findAll('a',
        href=re.compile('^(http|https|www)((?!' + siteDomain + ').)*$'))
    for link in links:
        externalLinks.add((link.get_text(), link.attrs['href']))
    return externalLinks


# Find all hyperlinks in an article
def findAllHyperLinks(articleUrl):
    '''Takes an article URL. Return a set of hyperlinks in the article'''
    req = Request(articleUrl)
    req.add_header('User-Agent', 'Mozilla/5.0')
    response = urlopen(req)
    bsObj = BeautifulSoup(response, 'html.parser')
    # Decompose the task into smaller tasks
    internalLinks = findInternalLinks(bsObj, articleUrl)
    externalLinks = findExternalLinks(bsObj, articleUrl)
    return externalLinks.union(internalLinks)


# Save multiple articles into the database
def saveAuthorArticles(connection, articles):
    '''Takes input database connection and a list of tuples containing article data'''
    cursor = connection.cursor()
    cursor.fast_executemany = True
    cursor.executemany('''
        INSERT INTO [dbo].[Article]([Title],[Author],[ArticleUrl],[ArticleLastUpdate])
        VALUES(?,?,?,?)''', articles)
    connection.commit()
    cursor.close()


# Find all author's articles on the profile page
def findAuthorArticles(author, authorProfileUrl):
    '''
    Takes author name and author's profile page URL
    Return a list of articles contributed by a specific author
    '''
    rtnList = list()
    req = Request(authorProfileUrl)
    req.add_header('User-Agent', 'Mozilla/5.0')
    response = urlopen(req)
    bsObj = BeautifulSoup(response, "html.parser")
    tags = bsObj.findAll(lambda tag: tag.name == 'a'
                         and len(tag.attrs) == 1
                         and 'href' in tag.attrs
                         and tag.attrs['href'].startswith('/sqlservertip/'))
    for tag in tags:
        m = re.search(r'[\d]{1,2}/[\d]{1,2}/[\d]{4}', tag.next_sibling)
        articleLastUpdate = datetime.strptime(m.group(0), '%m/%d/%Y')
        title = tag.get_text()
        articleUrl = urljoin(authorProfileUrl, tag.get('href'))
        rtnList.append((title, author, articleUrl, articleLastUpdate,))
    return rtnList


# Create an author row in the database table
def saveAuthor(connection, author, authorProfileUrl, authorSinceYear, tipContribution):
    '''
    Take input database connection and profile data
    Save author profile into the table [dbo].[Author]
    '''
    cursor = connection.cursor()
    params = (author, authorProfileUrl, authorSinceYear, tipContribution)
    cursor.execute("{CALL dbo.sp_add_author (?,?,?,?)}", params)
    connection.commit()
    cursor.close()

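# Note (added; not part of the original tip): dbo.sp_add_author is a stored procedure that is
# assumed to exist in the Demo database; its body is not shown in this script. Judging only
# from the call above, it accepts four input parameters and inserts one row into [dbo].[Author].
# A hypothetical T-SQL sketch consistent with that call might look like the following; the
# @AuthorProfileUrl parameter and column names are assumptions, not taken from the article:
#
#   CREATE PROCEDURE dbo.sp_add_author
#       @Author nvarchar(128), @AuthorProfileUrl nvarchar(1024),
#       @AuthorSinceYear int, @TipContribution int
#   AS
#   BEGIN
#       INSERT INTO [dbo].[Author]([Author],[AuthorProfileUrl],[AuthorSinceYear],[TipContribution])
#       VALUES(@Author, @AuthorProfileUrl, @AuthorSinceYear, @TipContribution);
#   END
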
# Call a stored procedure with output parameters and return values
def saveAuthor2(connection, author, authorProfileUrl, authorSinceYear, tipContribution):
    '''
    Take input database connection and author profile data
    Save author profile data into the table [dbo].[Author]
    '''
    cursor = connection.cursor()
    params = (author, authorProfileUrl, authorSinceYear, tipContribution)
    cursor.execute('''SET NOCOUNT ON;
                      DECLARE @retValue int, @rowCountVar int;
                      EXEC @retValue = dbo.sp_add_author2 ?,?,?,?, @rowCountVar OUTPUT;
                      SELECT @retValue, @rowCountVar;''', params)
    row = cursor.fetchone()
    connection.commit()
    # For illustrative purposes only
    print('Return Value: {0}, Output Value: {1}'.format(row[0], row[1]))
    cursor.close()


# Search author's profile
def findAuthor(author, authorsPageUrl):
    '''
    Take input author name and authors' page URL
    Return the individual profile page URL, the participation year, and the number of tips
    '''
    req = Request(authorsPageUrl)
    req.add_header('User-Agent', 'Mozilla/5.0')
    response = urlopen(req)
    bsObj = BeautifulSoup(response, 'html.parser')
    authorLink = bsObj.find('a', href=True, text=author)
    authorProfileUrl = urljoin(authorsPageUrl, authorLink.get('href'))
    authorSinceYear = authorLink.parent.parent.strong.get_text()
    tipContribution = authorLink.parent.parent.strong.find_next('strong').get_text()
    return authorProfileUrl, authorSinceYear, tipContribution

# The entry point of the program
if __name__ == "__main__":
    # 1, Initialize variables
    # Specify server name, database, user and password
    server = '192.168.2.17'
    database = 'Demo'
    user = 'user'
    password = 'password'
    # Specify the author name for data collection
    author = 'Nai Biao Zhou'
    # Specify the authors page url
    authorsPageUrl = 'https://www.mssqltips.com/sql-server-mssqltips-authors/'

    # 2, Remove all testing data in the database
    connection = createDBConnection(server, database, user, password)
    clearDatabase(connection)
    closeDBConnection(connection)

    # 3, Extract the author's data and save it into the database
    authorProfileUrl, authorSinceYear, tipContribution = findAuthor(author, authorsPageUrl)
    connection = createDBConnection(server, database, user, password)
    saveAuthor(connection, author, authorProfileUrl, authorSinceYear, tipContribution)
    closeDBConnection(connection)

    # 4, Extract the list of the author's articles and save them into the database
    authorArticles = findAuthorArticles(author, authorProfileUrl)
    connection = createDBConnection(server, database, user, password)
    saveAuthorArticles(connection, authorArticles)
    closeDBConnection(connection)

    # 5, Extract all hyperlinks in the author's articles and save them into the database
    for article in authorArticles:
        hyperLinks = findAllHyperLinks(article[2])
        connection = createDBConnection(server, database, user, password)
        saveArticleHyperLinks(connection, article[2], hyperLinks)
        closeDBConnection(connection)

    # 6, Read all the hyperlinks in the author's articles from the database
    connection = createDBConnection(server, database, user, password)
    rows = findAllHyperlinkUrlbyAuthor(connection, author)
    closeDBConnection(connection)

    # 7, Update the accessibility in the database
    for row in rows:
        hyperlinkUrl = row.HyperlinkUrl  # Access by name
        statusCode, responseMessage, visitedDate = testHyperlinkAccessibility(hyperlinkUrl)
        print('checking: ', hyperlinkUrl)
        connection = createDBConnection(server, database, user, password)
        updateHyperlinkAccessibility(connection, hyperlinkUrl, statusCode,
                                     responseMessage, visitedDate)
        closeDBConnection(connection)

    # 8, Output summary report
    connection = createDBConnection(server, database, user, password)
    getSummary(connection)
    closeDBConnection(connection)
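
# Illustrative note (added; not part of the original tip): steps 5 to 7 above open and close a
# new database connection for every article or hyperlink. pyodbc also allows reusing a single
# connection for an entire loop, which avoids the repeated connection overhead, for example:
#
#   connection = createDBConnection(server, database, user, password)
#   for row in rows:
#       statusCode, responseMessage, visitedDate = testHyperlinkAccessibility(row.HyperlinkUrl)
#       updateHyperlinkAccessibility(connection, row.HyperlinkUrl, statusCode,
#                                    responseMessage, visitedDate)
#   closeDBConnection(connection)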