Series Index:
BCS Models - Part 1: Target Database
BCS Models - Part 2: Initial BCS External Content Types
BCS Models - Part 3: Crawl Results
BCS Models - Part 4: Bigger Database
BCS Models - Part 5: The Bigger Database
BCS Models - Part 6: How to eat this elephant? <-- You are here
BCS Models - Part 7: Changes to the BCS Model to support segmented crawl
BCS Models - Part 8: Crawl Results
The key to this solution is to come up with a way to take smaller bites out of this elephant. I chose a rather big hammer approach. I decided to segment the uv_All% views into 100,000 row segments using these tables.
This SegmentType table documents what the segment types will be.
Create Table SegmentType (
id integer not null identity primary key clustered,
segmentType varchar(20) not null
)
go
This Segment table holds all the segments that are needed. Each row will hold the upper and lower IDs for each segment.
create table Segment (
segmentTypeID integer not null references SegmentType(id),
segmentNumber integer not null,
lowerID integer not null,
upperID integer not null
)
go
create unique clustered index IX1 on Segment(segmentTypeID, segmentNumber)
go
insert into SegmentType(segmentType) values('Users'),('Questions'),('Responses'),('Answers'),('QuestionComments'),
('ResponseComments'),('AnswerComments')
go
set nocount on
declare @segmentType varchar(20) = 'Users'
declare @segmentNumber integer = 0
declare @segmentTypeID int
select @segmentTypeID = id from SegmentType where segmenttype = @segmentType
declare @lowerID integer = 0
declare @upperID integer = 0
while (1=1) begin
set @lowerID = @upperID
select @lowerID = min(ID), @upperID = Max(ID) from (
--
-- Change ID to CommentID for the Comments views
select top 100000 ID as ID
from uv_AllUsers
where ID > @lowerID
order by id
) a
if @upperID is null
break
insert into Segment(segmentTypeID, segmentNumber, lowerID, UpperID)
values (@segmentTypeID, @segmentNumber, @lowerID, @upperID)
print 'Segment: ' + convert(varchar,@segmentNumber) +
' LowerID: ' + convert(varchar,@lowerID) +
' UpperID: ' + convert(varchar,@UpperID)
set @segmentNumber = @segmentNumber + 1
end
Then I created the stored procedures to bring in the segments by View:
create procedure usp_getQuestionSegments as
SELECT s.segmentTypeid,
s.segmentNumber,
s.lowerID,
s.upperID
FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'Questions'
go
grant execute on usp_getQuestionSegments to spSearchCrawl
go
create procedure usp_getAnswerSegments as
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID
FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'Answers'
go
grant execute on usp_getAnswerSegments to spSearchCrawl
go
create procedure usp_getResponseSegments as
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID
FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'Responses'
go
grant execute on usp_getResponseSegments to spSearchCrawl
go
create procedure usp_getQuestionCommentSegments as
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID
FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'QuestionComments'
go
grant execute on usp_getQuestionCommentSegments to spSearchCrawl
go
create procedure usp_getAnswerCommentSegments as
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID
FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'AnswerComments'
go
grant execute on usp_getAnswerCommentSegments to spSearchCrawl
go
create procedure usp_getResponseCommentSegments as
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID
FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'ResponseComments'
go
grant execute on usp_getResponseCommentSegments to spSearchCrawl
go
create procedure usp_getUserSegments as
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID
FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'Users'
go
grant execute on usp_getUserSegments to spSearchCrawl
go
Procedures to bring back a single segment of each type. The additional column is in support of incremental crawls:
create procedure usp_getQuestionSegment @segmentNumber integer as
SELECT s.segmentTypeid,
s.segmentNumber,
s.lowerID,
s.upperID,
cast( 0 as bigint) as DeletedCount
FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'Questions'
and s.segmentNumber = @segmentNumber
go
grant execute on usp_getQuestionSegment to spSearchCrawl
go
create procedure usp_getAnswerSegment @segmentNumber integer as
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID,
cast( 0 as bigint) as DeletedCount
FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'Answers'
and s.segmentNumber = @segmentNumber
go
grant execute on usp_getAnswerSegment to spSearchCrawl
go
create procedure usp_getResponseSegment @segmentNumber integer as
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID,
cast( 0 as bigint) as DeletedCount
FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'Responses'
and s.segmentNumber = @segmentNumber
go
grant execute on usp_getResponseSegment to spSearchCrawl
go
create procedure usp_getQuestionCommentSegment @segmentNumber integer as
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID,
cast( 0 as bigint) as DeletedCount
FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'QuestionComments'
and s.segmentNumber = @segmentNumber
go
grant execute on usp_getQuestionCommentSegment to spSearchCrawl
go
create procedure usp_getAnswerCommentSegment @segmentNumber integer as
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID,
cast( 0 as bigint) as DeletedCount
FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'AnswerComments'
and s.segmentNumber = @segmentNumber
go
grant execute on usp_getAnswerCommentSegment to spSearchCrawl
go
create procedure usp_getResponseCommentSegment @segmentNumber integer as
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID,
cast( 0 as bigint) as DeletedCount
FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'ResponseComments'
and s.segmentNumber = @segmentNumber
go
grant execute on usp_getResponseCommentSegment to spSearchCrawl
go
create procedure usp_getUserSegment @segmentNumber integer as
SELECT s.segmentTypeID,
s.segmentNumber,
s.lowerID,
s.upperID,
cast( 0 as bigint) as DeletedCount
FROM segment s
JOIN segmentType st ON st.id = s.segmentTypeID
WHERE st.segmentType = 'Users'
and s.segmentNumber = @segmentNumber
go
grant execute on usp_getUserSegment to spSearchCrawl
go
Here they are:
CREATE procedure [dbo].[usp_getAcceptedAnswersBySegment] @segmentNumber integer, @lastRunDate datetime as
SELECT aaa.ID,
ParentId,
AnswerCount,
Body,
ClosedDate,
CommentCount,
CommunityOwnedDate,
CreationDate,
FavoriteCount,
LastActivityDate,
coalesce(LastEditDate,CreationDate) LastEditDate,
LastEditorDisplayName,
Score,
Tags,
Title,
ViewCount,
DisplayName,
PostType,
@segmentNumber segmentNumber
FROM uv_AllAcceptedAnswers aaa
join segment s on aaa.id >= s.lowerID and aaa.id <= s.upperID
join SegmentType st on st.id = s.segmentTypeID
where st.segmentType = 'Answers'
and coalesce(LastEditDate,CreationDate) > @lastRunDate
and s.segmentNumber = @segmentNumber
go
grant execute on usp_getAcceptedAnswerBySegment to spSearchCrawl
go
CREATE procedure [dbo].[usp_getQuestionsBySegment] @segmentNumber integer, @lastRunDate datetime as
select aq.ID,
ParentId,
AnswerCount,
Body,
ClosedDate,
CommentCount,
CommunityOwnedDate,
CreationDate,
FavoriteCount,
LastActivityDate,
coalesce(LastEditDate,CreationDate) LastEditDate,
LastEditorDisplayName,
Score,
Tags,
Title,
ViewCount,
DisplayName,
PostType,
@segmentNumber segmentNumber
from uv_AllQuestions aq
join segment s on aq.id >= s.lowerID and aq.id <= s.upperID
join SegmentType st on st.id = s.segmentTypeID
where st.segmentType = 'Questions'
and coalesce(LastEditDate,CreationDate) > @lastRunDate
and s.segmentNumber = @segmentNumber
go
grant execute on usp_getQuestionsBySegment to spSearchCrawl
go
CREATE procedure [dbo].[usp_getResponsesBySegment] @segmentNumber integer, @lastRunDate datetime as
select ar.ID,
ParentId,
AnswerCount,
Body,
ClosedDate,
CommentCount,
CommunityOwnedDate,
CreationDate,
FavoriteCount,
LastActivityDate,
coalesce(LastEditDate,CreationDate) LastEditDate,
LastEditorDisplayName,
Score,
Tags,
Title,
ViewCount,
DisplayName,
PostType,
@segmentNumber segmentNumber
from uv_AllResponses ar
join segment s on ar.id >= s.lowerID and ar.id <= s.upperID
join SegmentType st on st.id = s.segmentTypeID
where st.segmentType = 'Responses'
and coalesce(LastEditDate,CreationDate) > @lastRunDate
and s.segmentNumber = @segmentNumber
go
grant execute on usp_getResponsesBySegment to spSearchCrawl
go
CREATE procedure [dbo].[usp_getQuestionCommentsBySegment] @segmentNumber integer, @lastRunDate datetime as
select CommentID,
CreationDate,
PostId,
Score,
[Text],
UserId,
DisplayName,
@segmentNumber segmentNumber
from uv_AllQuestionComments qc
join segment s on Commentid >= s.lowerID and Commentid <= s.upperID
join SegmentType st on st.id = s.segmentTypeID
where st.segmentType = 'QuestionComments'
and CreationDate > @lastRunDate
and s.segmentNumber = @segmentNumber
go
grant execute on usp_getQuestionCommentsBySegment to spSearchCrawl
go
CREATE procedure [dbo].[usp_getResponseCommentsBySegment] @segmentNumber integer, @lastRunDate datetime as
select CommentID,
CreationDate,
PostId,
Score,
[Text],
UserId,
DisplayName,
@segmentNumber segmentNumber
from uv_AllResponseComments
join segment s on Commentid >= s.lowerID and Commentid <= s.upperID
join SegmentType st on st.id = s.segmentTypeID
where st.segmentType = 'ResponseComments'
and CreationDate > @lastRunDate
and s.segmentNumber = @segmentNumber
go
grant execute on usp_getReponseCommentsBySegment to spSearchCrawl
go
CREATE procedure [dbo].[usp_getAnswerCommentsBySegment] @segmentNumber integer, @lastRunDate datetime as
select ac.CommentID,
CreationDate,
PostId,
Score,
[Text],
UserId,
DisplayName,
@segmentNumber segmentNumber
from uv_AllAnswerComments ac
join segment s on ac.Commentid >= s.lowerID and ac.Commentid <= s.upperID
join SegmentType st on st.id = s.segmentTypeID
where st.segmentType = 'AnswerComments'
and CreationDate > @lastRunDate
and s.segmentNumber = @segmentNumber
go
grant execute on usp_getAnswerCommentsBySegment to spSearchCrawl
go
CREATE procedure [dbo].[usp_getUsersBySegment] @segmentNumber integer, @lastRunDate datetime as
select au.Id,
AboutMe,
Age,
CreationDate,
DisplayName,
DownVotes,
LastAccessDate,
Location,
Reputation,
UpVotes,
[Views],
WebsiteUrl,
@segmentNumber segmentNumber
from uv_AllUsers au
join segment s on au.id >= s.lowerID and au.id <= s.upperID
join SegmentType st on st.id = s.segmentTypeID
where st.segmentType = 'Users'
and CreationDate > @lastRunDate
and s.segmentNumber = @segmentNumber
go
grant execute on usp_getUsersBySegment to spSearchCrawl
go
Next up is the actual BCS Module to bring all this together.
No comments:
Post a Comment