Monday, June 22, 2015

BCS Models - Part 6: How to eat this elephant?

This post is part of an eight part series describing the process I have followed to be able create a BCS model capable of indexing well over 10 million items.

Series Index:
BCS Models - Part 1: Target Database
BCS Models - Part 2: Initial BCS External Content Types
BCS Models - Part 3: Crawl Results
BCS Models - Part 4: Bigger Database
BCS Models - Part 5: The Bigger Database
BCS Models - Part 6: How to eat this elephant? <-- You are here
BCS Models - Part 7: Changes to the BCS Model to support segmented crawl
BCS Models - Part 8: Crawl Results

In BCS Models - Part 5: The Bigger Database I decided to create a model to be able to crawl a database created from a StackOverflow extract.

The key to this solution is to come up with a way to take smaller bites out of this elephant.  I chose a rather big hammer approach.  I decided to segment the uv_All% views into 100,000 row segments using these tables.

This SegmentType table documents what the segment types will be.
Create Table SegmentType (
id integer not null identity primary key clustered,
segmentType varchar(20) not null
)
go

This Segment table holds all the segments that are needed.  Each row will hold the upper and lower IDs for each segment.
create table Segment (
segmentTypeID integer not null references SegmentType(id),
segmentNumber integer not null, 
lowerID integer not null,
upperID integer not null
) 
go
create unique clustered index IX1 on Segment(segmentTypeID, segmentNumber)
go

I used this little bit of DML to populate the Segments Tables:
insert into SegmentType(segmentType) values('Users'),('Questions'),('Responses'),('Answers'),('QuestionComments'),
('ResponseComments'),('AnswerComments')
go

I then used this T-SQL to populate the Segments table, running it for each of the values specified in the SegmentType table and changing the view:
set nocount on
declare @segmentType varchar(20) = 'Users'
declare @segmentNumber integer = 0
declare @segmentTypeID int
select @segmentTypeID = id from SegmentType where segmenttype = @segmentType
declare @lowerID integer = 0
declare @upperID integer = 0

while (1=1) begin
set @lowerID = @upperID

select @lowerID = min(ID), @upperID = Max(ID) from (
--
--  Change ID to CommentID for the Comments views
select top 100000 ID as ID
  from uv_AllUsers 
where ID > @lowerID
order by id
) a
if @upperID is null
break
insert into Segment(segmentTypeID, segmentNumber, lowerID, UpperID) 
values (@segmentTypeID, @segmentNumber, @lowerID, @upperID)
print 'Segment: ' + convert(varchar,@segmentNumber) + 
  ' LowerID: ' + convert(varchar,@lowerID) + 
  ' UpperID: ' + convert(varchar,@UpperID)
set @segmentNumber = @segmentNumber + 1
end

There's probably better ways to do that, so if you're doing it for real, it should be optimized.

Then I created the stored procedures to bring in the segments by View:
create procedure usp_getQuestionSegments as 
SELECT s.segmentTypeid,
s.segmentNumber,
s.lowerID,
s.upperID
  FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'Questions'
go
grant execute on usp_getQuestionSegments to spSearchCrawl
go

create procedure usp_getAnswerSegments as 
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID
  FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'Answers'
go
grant execute on usp_getAnswerSegments to spSearchCrawl
go

create procedure usp_getResponseSegments as 
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID
  FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'Responses'
go
grant execute on usp_getResponseSegments to spSearchCrawl
go

create procedure usp_getQuestionCommentSegments as 
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID
  FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'QuestionComments'
go
grant execute on usp_getQuestionCommentSegments to spSearchCrawl
go

create procedure usp_getAnswerCommentSegments as 
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID
  FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'AnswerComments'
go
grant execute on usp_getAnswerCommentSegments to spSearchCrawl
go

create procedure usp_getResponseCommentSegments as 
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID
  FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'ResponseComments'
go
grant execute on usp_getResponseCommentSegments to spSearchCrawl
go

create procedure usp_getUserSegments as 
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID
  FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'Users'
go
grant execute on usp_getUserSegments to spSearchCrawl
go

Procedures to bring back a single segment of each type.  The additional column is in support of incremental crawls:
create procedure usp_getQuestionSegment @segmentNumber integer as 
SELECT s.segmentTypeid,
s.segmentNumber,
s.lowerID,
s.upperID,
cast( 0 as bigint) as DeletedCount
  FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'Questions'
  and s.segmentNumber = @segmentNumber
go
grant execute on usp_getQuestionSegment to spSearchCrawl
go

create procedure usp_getAnswerSegment @segmentNumber integer as 
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID,
cast( 0 as bigint) as DeletedCount
  FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'Answers'
  and s.segmentNumber = @segmentNumber
go
grant execute on usp_getAnswerSegment to spSearchCrawl
go

create procedure usp_getResponseSegment @segmentNumber integer as 
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID,
cast( 0 as bigint) as DeletedCount
  FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'Responses'
  and s.segmentNumber = @segmentNumber
go
grant execute on usp_getResponseSegment to spSearchCrawl
go

create procedure usp_getQuestionCommentSegment @segmentNumber integer as 
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID,
cast( 0 as bigint) as DeletedCount
  FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'QuestionComments'
  and s.segmentNumber = @segmentNumber
go
grant execute on usp_getQuestionCommentSegment to spSearchCrawl
go

create procedure usp_getAnswerCommentSegment @segmentNumber integer as 
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID,
cast( 0 as bigint) as DeletedCount
  FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'AnswerComments'
  and s.segmentNumber = @segmentNumber
go
grant execute on usp_getAnswerCommentSegment to spSearchCrawl
go

create procedure usp_getResponseCommentSegment @segmentNumber integer as 
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID,
cast( 0 as bigint) as DeletedCount
  FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'ResponseComments'
  and s.segmentNumber = @segmentNumber
go
grant execute on usp_getResponseCommentSegment to spSearchCrawl
go

create procedure usp_getUserSegment @segmentNumber integer as 
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID,
cast( 0 as bigint) as DeletedCount
  FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'Users'
  and s.segmentNumber = @segmentNumber
go
grant execute on usp_getUserSegment to spSearchCrawl

go



Now I need stored procedures to bring back the rows for based upon the lower and upper IDs in each segment.

Here they are:
CREATE procedure [dbo].[usp_getAcceptedAnswersBySegment] @segmentNumber integer, @lastRunDate datetime as
  SELECT aaa.ID,
ParentId,
AnswerCount,
Body,
ClosedDate,
CommentCount,
CommunityOwnedDate,
CreationDate,
FavoriteCount,
LastActivityDate,
coalesce(LastEditDate,CreationDate) LastEditDate,
LastEditorDisplayName,
Score,
Tags,
Title,
ViewCount,
DisplayName,
PostType,
@segmentNumber segmentNumber
    FROM uv_AllAcceptedAnswers aaa
join segment s on aaa.id >= s.lowerID and aaa.id <= s.upperID
join SegmentType st on st.id = s.segmentTypeID
   where st.segmentType = 'Answers'
and coalesce(LastEditDate,CreationDate) > @lastRunDate
and s.segmentNumber = @segmentNumber


go
grant execute on usp_getAcceptedAnswerBySegment to spSearchCrawl
go

CREATE procedure [dbo].[usp_getQuestionsBySegment] @segmentNumber integer, @lastRunDate datetime as
select aq.ID,
ParentId,
AnswerCount,
Body,
ClosedDate,
CommentCount,
CommunityOwnedDate,
CreationDate,
FavoriteCount,
LastActivityDate,
coalesce(LastEditDate,CreationDate) LastEditDate,
LastEditorDisplayName,
Score,
Tags,
Title,
ViewCount,
DisplayName,
PostType,
  @segmentNumber segmentNumber
from uv_AllQuestions aq
join segment s on aq.id >= s.lowerID and aq.id <= s.upperID
join SegmentType st on st.id = s.segmentTypeID
   where st.segmentType = 'Questions'
and coalesce(LastEditDate,CreationDate) > @lastRunDate
and s.segmentNumber = @segmentNumber


go
grant execute on usp_getQuestionsBySegment to spSearchCrawl
go
CREATE procedure [dbo].[usp_getResponsesBySegment] @segmentNumber integer, @lastRunDate datetime as
select ar.ID,
ParentId,
AnswerCount,
Body,
ClosedDate,
CommentCount,
CommunityOwnedDate,
CreationDate,
FavoriteCount,
LastActivityDate,
coalesce(LastEditDate,CreationDate) LastEditDate,
LastEditorDisplayName,
Score,
Tags,
Title,
ViewCount,
DisplayName,
PostType,
  @segmentNumber segmentNumber
from uv_AllResponses ar
join segment s on ar.id >= s.lowerID and ar.id <= s.upperID
join SegmentType st on st.id = s.segmentTypeID
   where st.segmentType = 'Responses'
and coalesce(LastEditDate,CreationDate) > @lastRunDate

and s.segmentNumber = @segmentNumber

go
grant execute on usp_getResponsesBySegment to spSearchCrawl
go
CREATE procedure [dbo].[usp_getQuestionCommentsBySegment] @segmentNumber integer, @lastRunDate datetime as
select CommentID,
CreationDate,
PostId,
Score,
[Text],
UserId,
DisplayName,
  @segmentNumber segmentNumber
from uv_AllQuestionComments qc
join segment s on Commentid >= s.lowerID and Commentid <= s.upperID
join SegmentType st on st.id = s.segmentTypeID
   where st.segmentType = 'QuestionComments'
and CreationDate > @lastRunDate

and s.segmentNumber = @segmentNumber

go
grant execute on usp_getQuestionCommentsBySegment to spSearchCrawl
go
CREATE procedure [dbo].[usp_getResponseCommentsBySegment] @segmentNumber integer, @lastRunDate datetime as
select CommentID,
CreationDate,
PostId,
Score,
[Text],
UserId,
DisplayName,
  @segmentNumber segmentNumber
from uv_AllResponseComments
join segment s on Commentid >= s.lowerID and Commentid <= s.upperID
join SegmentType st on st.id = s.segmentTypeID
   where st.segmentType = 'ResponseComments'
and CreationDate > @lastRunDate

and s.segmentNumber = @segmentNumber

go
grant execute on usp_getReponseCommentsBySegment to spSearchCrawl
go
CREATE procedure [dbo].[usp_getAnswerCommentsBySegment] @segmentNumber integer, @lastRunDate datetime as
select ac.CommentID,
CreationDate,
PostId,
Score,
[Text],
UserId,
DisplayName,
  @segmentNumber segmentNumber
from uv_AllAnswerComments ac
join segment s on ac.Commentid >= s.lowerID and ac.Commentid <= s.upperID
join SegmentType st on st.id = s.segmentTypeID
   where st.segmentType = 'AnswerComments'
and CreationDate > @lastRunDate

and s.segmentNumber = @segmentNumber

go
grant execute on usp_getAnswerCommentsBySegment to spSearchCrawl
go

CREATE procedure [dbo].[usp_getUsersBySegment] @segmentNumber integer, @lastRunDate datetime as
select au.Id,
          AboutMe,
          Age,
          CreationDate,
          DisplayName,
          DownVotes,
          LastAccessDate,
          Location,
          Reputation,
          UpVotes,
          [Views],
          WebsiteUrl,
  @segmentNumber segmentNumber
from uv_AllUsers au
join segment s on au.id >= s.lowerID and au.id <= s.upperID
join SegmentType st on st.id = s.segmentTypeID
   where st.segmentType = 'Users'
and CreationDate > @lastRunDate

and s.segmentNumber = @segmentNumber

go
grant execute on usp_getUsersBySegment to spSearchCrawl
go

Next up is the actual BCS Module to bring all this together. 

No comments: