Merge lp:~vkolesnikov/pbxt/pbxt-07-low-disk-index-flush into lp:pbxt/1.0.07-rc

Proposed by Vladimir Kolesnikov
Status: Rejected
Rejected by: Paul McCullagh
Proposed branch: lp:~vkolesnikov/pbxt/pbxt-07-low-disk-index-flush
Merge into: lp:pbxt/1.0.07-rc
Diff against target: None lines
To merge this branch: bzr merge lp:~vkolesnikov/pbxt/pbxt-07-low-disk-index-flush
Reviewer Review Type Date Requested Status
Paul McCullagh Abstain
Review via email: mp+10141@code.launchpad.net
To post a comment you must log in.
Revision history for this message
Paul McCullagh (paul-mccullagh) wrote :
Download full text (4.8 KiB)

Hi Vlad,

There is a problem with this change. Actually a serious bug.

See below:

On Aug 14, 2009, at 11:30 AM, Vladimir Kolesnikov wrote:

> Vladimir Kolesnikov has proposed merging lp:~vkolesnikov/pbxt/
> pbxt-07-low-disk-index-flush into lp:pbxt/1.0.07-rc.
>
> Requested reviews:
> PBXT Core (pbxt-core)
> --
> https://code.launchpad.net/~vkolesnikov/pbxt/pbxt-07-low-disk-index-
> flush/+merge/10141
> Your team PBXT Core is subscribed to branch lp:pbxt/1.0.07-rc.
> === modified file 'ChangeLog'
> --- ChangeLog 2009-08-11 09:17:53 +0000
> +++ ChangeLog 2009-08-14 09:23:20 +0000
> @@ -1,6 +1,10 @@
> PBXT Release Notes
> ==================
>
> +------- 1.0.07r RC - 2009-08-14
> +
> +RN267: Fixed an assertion failure: if checkpointer failed to
> successully flush indices (e.g. due to low disk state) then next
> time xt_flush_indices is called it would generate an assertion failure
> +
> RN266: Fixed a crash: when initialization failed with an exception
> THD structure was released twice
>
> ------- 1.0.07q RC - 2009-08-09
>
> === modified file 'src/index_xt.cc'
> --- src/index_xt.cc 2009-07-07 14:21:07 +0000
> +++ src/index_xt.cc 2009-08-13 13:49:21 +0000
> @@ -2785,7 +2785,6 @@
> *bytes_flushed += (dirty_blocks * XT_INDEX_PAGE_SIZE);
>
> curr_flush_seq = tab->tab_ind_flush_seq;
> - tab->tab_ind_flush_seq++;
>
> /* Write the dirty pages: */
> indp = tab->tab_dic.dic_keys;
> @@ -2981,6 +2980,13 @@
> }
> }
>
> + /* At this point all dirty blocks should have been successully
> flushed to disk,
> + * otherwise we must not increment this value as an assertion
> above in this
> + * function ASSERT_NS(block->cp_flush_seq == curr_flush_seq)
> + * will fail.
> + */
> + tab->tab_ind_flush_seq++;
> +
> indp = tab->tab_dic.dic_keys;
> for (i=0; i<tab->tab_dic.dic_key_count; i++, indp++) {
> ind = *indp;

I think you have moved tab_ind_flush_seq++ too far down. All locks on
the index are released before the if (wrote_something) block:

 indp = tab->tab_dic.dic_keys;
 for (i=0; i<tab->tab_dic.dic_key_count; i++, indp++) {
  ind = *indp;
  XT_INDEX_UNLOCK(ind, ot);
 }

 if (wrote_something) {
            ....

After that point, the index can be updated again. Then it is important
that the updates use a different flush sequence number. If not, the
page will be freed by the code that occurred just above this point:

  /* Free up flushed pages: */
  indp = tab->tab_dic.dic_keys;
  for (i=0; i<tab->tab_dic.dic_key_count; i++, indp++) {
                    ...

And the changes in the cache will be lost!

>
> === modified file 'src/lock_xt.h'
> --- src/lock_xt.h 2008-11-17 10:14:17 +0000
> +++ src/lock_xt.h 2009-08-13 13:49:21 +0000
> @@ -234,6 +234,38 @@
> return val;
> }
>
> +inline void xt_atomic_inc4(volatile xtWord4 *mptr)
> +{
> +#ifdef XT_ATOMIC_WIN32_X86
> + __asm MOV ECX, mptr
> + __asm LOCK INC DWORD PTR [ECX]
> +#elif defined(XT_ATOMIC_GNUC_X86)
> + asm volatile ("lock; incl %0" : : "m" (*mptr) : "memory");
> +#elif defined(XT_ATOMIC_GCC_OPS)
> + __sync_fetch_and_add(mptr, 1);
> +#elif defined(XT_ATOMIC_SOLARIS_LIB)
> + atomic_inc_32_nv(mptr);
> +#else
> + ++(*mptr);
> +#endif
> +}
> +
> +inline void x...

Read more...

Revision history for this message
Vladimir Kolesnikov (vkolesnikov) wrote :

Hi Paul,

I pushed fix update. Please check the latest rev.

558. By Vladimir Kolesnikov

out-of-disk handling improvements: keep and retry ilog

Revision history for this message
Paul McCullagh (paul-mccullagh) wrote :
Download full text (3.2 KiB)

Hi Vlad,

Some points to look at. I have pasted the code below, and added
references.

(1) The variable "curr_flush_seq" is now only used for debug code. It
should probably be place in: #ifdef DEBUG

(2) The tab_ind_flush_lock should not be held across calls to this
function, because there is no guarantee that the thread that failed
the first flush will return.

(3) goto failed_2 here is way to early to save the il!

(4) the il should be saved at this point. I recommend, setting tab-
 >tab_ind_unflushed_ilog = il at this point, and setting, tab-
 >tab_ind_unflushed_ilog = NULL, later after success.

(5) This must be done before the checkpoint is ended.

-----------------------------
(1) xtWord2 curr_flush_seq;
 XTIndFreeListPtr list_ptr;
 u_int dirty_blocks;
 XTCheckPointTablePtr cp_tab;
 XTCheckPointStatePtr cp = NULL;
 xtBool retry_old_ilog = FALSE;

 restart:
 if (!xt_begin_checkpoint(tab->tab_db, have_table_lock, ot->ot_thread))
  return FAILED;

 if (tab->tab_ind_unflushed_ilog) {
  wrote_something = TRUE;
  retry_old_ilog = TRUE;
  il = tab->tab_ind_unflushed_ilog;
  goto previous_ilog;
 }

#ifdef DEBUG_CHECK_IND_CACHE
 xt_ind_check_cache(NULL);
#endif
 xt_lock_mutex_ns(&tab->tab_ind_flush_lock);

 if (!tab->tab_db->db_indlogs.ilp_get_log(&il, ot->ot_thread))
  goto failed_3;

 il->il_reset(tab);
 if (!il->il_write_byte(ot, XT_DT_FREE_LIST))
  goto failed_2;
 if (!il->il_write_word4(ot, tab->tab_id))
  goto failed_2;
 if (!il->il_write_word4(ot, 0))
(3) goto failed_2;

----------------------

 previous_ilog:
 if (wrote_something) {
(4) tab->tab_ind_unflushed_ilog = il;
  il = NULL;

----------------------
 if (!retry_old_ilog)
  xt_unlock_mutex_ns(&tab->tab_ind_flush_lock);

#ifdef DEBUG_CHECK_IND_CACHE
 xt_ind_check_cache((XTIndex *) 1);
#endif
#ifdef TRACE_FLUSH
 printf("FLUSH --end-- %s\n", tab->tab_name->ps_path);
 fflush(stdout);
#endif
 if (!xt_end_checkpoint(tab->tab_db, ot->ot_thread, NULL))
  return FAILED;

(5) if (retry_old_ilog) {
  tab->tab_ind_unflushed_ilog = NULL;
  retry_old_ilog = FALSE;
  goto restart;
 }

 return OK;

 failed:
 indp = tab->tab_dic.dic_keys;
 for (i=0; i<tab->tab_dic.dic_key_count; i++, indp++) {
  ind = *indp;
  XT_INDEX_UNLOCK(ind, ot);
 }

 /* we failed with a partially written ilog. keep it and try to write
it next time this function is called */
 failed_2:
 ASSERT_NS(il);
 /* either there was no unflushed ilog, or our current ilog is the
unflushed one */
 ASSERT_NS(tab->tab_ind_unflushed_ilog == NULL || tab-
 >tab_ind_unflushed_ilog == il);
 tab->tab_ind_unflushed_ilog = il;

 /* this label is for the case when there's no [yet] ilog, otherwise
must preserve ilog to be flushed later
  * (see failed_2)
  */
 failed_3:
(2) if (!retry_old_ilog)
  xt_unlock_mutex_ns(&tab->tab_ind_flush_lock);
#ifdef DEBUG_CHECK_IND_CACHE
 xt_ind_check_cache(NULL);
#endif
 return FAILED;
---------------------------

On Aug 18, 2009, at 10:40 PM, Vladimir Kolesnikov wrote:

> Hi Paul,
>
> I pushed fix update. Please check the latest rev.
> --
> https://code.launchpad.net/~vkolesnikov/pbxt/pbxt-07-low-disk-index-
> flush/+merge/10141
> Your team PBXT Core is subscribed to branch lp:p...

Read more...

Revision history for this message
Paul McCullagh (paul-mccullagh) wrote :

And one other thing:

tab_ind_unflushed_ilog must be freed when the table object is freed.

On Aug 18, 2009, at 10:40 PM, Vladimir Kolesnikov wrote:

> Hi Paul,
>
> I pushed fix update. Please check the latest rev.
> --
> https://code.launchpad.net/~vkolesnikov/pbxt/pbxt-07-low-disk-index-
> flush/+merge/10141
> Your team PBXT Core is subscribed to branch lp:pbxt/1.0.07-rc.

--
Paul McCullagh
PrimeBase Technologies
www.primebase.org
www.blobstreaming.org
pbxt.blogspot.com

559. By Vladimir Kolesnikov

more improvements for the index flush fix

Revision history for this message
Paul McCullagh (paul-mccullagh) wrote :

Hi Vlad,

The patch is looking a lot better. Some more comments:

xt_xlog_flush_log() should not return FALSE unless it raises an error.

So I think we just have to return OK, although the flush is not done. I have a better solution for this in 1.0.08 now, I pass in the db pointer to the function.

xtPublic xtBool xt_xlog_flush_log(XTThreadPtr thread)
{
 if (thread->st_database)
  return thread->st_database->db_xlog.xlog_flush(thread);
 else
  return FALSE;
}

"il_flush_seq = tab->tab_ind_flush_seq" cannot be done here. It must rather be done where the old assignment ("curr_flush_seq = tab->tab_ind_flush_seq") took place. This is after all indexes have been locked:

void XTIndexLog::il_reset(XTTable *tab)
{
 il_tab_id = tab->tab_id;
 il_log_eof = 0;
 il_buffer_len = 0;
 il_buffer_offset = 0;
 il_flush_seq = tab->tab_ind_flush_seq;
}

I don't like "ilp_db = NULL". Could we rather move the ilp_exit() to after xt_tab_exit_db(). If that works then add a comment to ensure that we know that xt_tab_exit_db() is dependent on db->db_indlogs.

void XTIndexLogPool::ilp_exit(struct XTThread *self)
{
 ilp_close(self, FALSE);
 ASSERT_NS(il_pool_count == 0);
 xt_free_mutex(&ilp_lock);
 ilp_db = NULL; /* reset backpointer to flag the "exit" state */
}

lock_xt.h: Something is wrong with the line endings.

review: Needs Fixing
Revision history for this message
Paul McCullagh (paul-mccullagh) wrote :

This fix has been superseded by lp:~vkolesnikov/pbxt/pbxt-ilog-out-of-disk.

review: Abstain

Unmerged revisions

559. By Vladimir Kolesnikov

more improvements for the index flush fix

558. By Vladimir Kolesnikov

out-of-disk handling improvements: keep and retry ilog

557. By Vladimir Kolesnikov

updated version to 1.0.07r

556. By Vladimir Kolesnikov

updated ChangeLog

555. By Vladimir Kolesnikov

fixed an checkpointer index flush problem when running out of disk space

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'ChangeLog'
2--- ChangeLog 2009-08-11 09:17:53 +0000
3+++ ChangeLog 2009-08-14 09:23:20 +0000
4@@ -1,6 +1,10 @@
5 PBXT Release Notes
6 ==================
7
8+------- 1.0.07r RC - 2009-08-14
9+
10+RN267: Fixed an assertion failure: if checkpointer failed to successully flush indices (e.g. due to low disk state) then next time xt_flush_indices is called it would generate an assertion failure
11+
12 RN266: Fixed a crash: when initialization failed with an exception THD structure was released twice
13
14 ------- 1.0.07q RC - 2009-08-09
15
16=== modified file 'src/index_xt.cc'
17--- src/index_xt.cc 2009-07-07 14:21:07 +0000
18+++ src/index_xt.cc 2009-08-13 13:49:21 +0000
19@@ -2785,7 +2785,6 @@
20 *bytes_flushed += (dirty_blocks * XT_INDEX_PAGE_SIZE);
21
22 curr_flush_seq = tab->tab_ind_flush_seq;
23- tab->tab_ind_flush_seq++;
24
25 /* Write the dirty pages: */
26 indp = tab->tab_dic.dic_keys;
27@@ -2981,6 +2980,13 @@
28 }
29 }
30
31+ /* At this point all dirty blocks should have been successully flushed to disk,
32+ * otherwise we must not increment this value as an assertion above in this
33+ * function ASSERT_NS(block->cp_flush_seq == curr_flush_seq)
34+ * will fail.
35+ */
36+ tab->tab_ind_flush_seq++;
37+
38 indp = tab->tab_dic.dic_keys;
39 for (i=0; i<tab->tab_dic.dic_key_count; i++, indp++) {
40 ind = *indp;
41
42=== modified file 'src/lock_xt.h'
43--- src/lock_xt.h 2008-11-17 10:14:17 +0000
44+++ src/lock_xt.h 2009-08-13 13:49:21 +0000
45@@ -234,6 +234,38 @@
46 return val;
47 }
48
49+inline void xt_atomic_inc4(volatile xtWord4 *mptr)
50+{
51+#ifdef XT_ATOMIC_WIN32_X86
52+ __asm MOV ECX, mptr
53+ __asm LOCK INC DWORD PTR [ECX]
54+#elif defined(XT_ATOMIC_GNUC_X86)
55+ asm volatile ("lock; incl %0" : : "m" (*mptr) : "memory");
56+#elif defined(XT_ATOMIC_GCC_OPS)
57+ __sync_fetch_and_add(mptr, 1);
58+#elif defined(XT_ATOMIC_SOLARIS_LIB)
59+ atomic_inc_32_nv(mptr);
60+#else
61+ ++(*mptr);
62+#endif
63+}
64+
65+inline void xt_atomic_dec4(volatile xtWord4 *mptr)
66+{
67+#ifdef XT_ATOMIC_WIN32_X86
68+ __asm MOV ECX, mptr
69+ __asm LOCK DEC DWORD PTR [ECX]
70+#elif defined(XT_ATOMIC_GNUC_X86)
71+ asm volatile ("lock; decl %0" : : "m" (*mptr) : "memory");
72+#elif defined(XT_ATOMIC_GCC_OPS)
73+ __sync_fetch_and_sub(mptr, 1);
74+#elif defined(XT_ATOMIC_SOLARIS_LIB)
75+ atomic_dec_32_nv(mptr);
76+#else
77+ --(*mptr);
78+#endif
79+}
80+
81 inline void xt_atomic_set4(volatile xtWord4 *mptr, xtWord4 val)
82 {
83 #ifdef XT_SPL_WIN32_ASM
84
85=== modified file 'src/strutil_xt.cc'
86--- src/strutil_xt.cc 2009-08-09 15:46:45 +0000
87+++ src/strutil_xt.cc 2009-08-14 09:23:20 +0000
88@@ -367,7 +367,7 @@
89
90 xtPublic c_char *xt_get_version(void)
91 {
92- return "1.0.07q RC";
93+ return "1.0.07r RC";
94 }
95
96 /* Copy and URL decode! */
97
98=== modified file 'src/tabcache_xt.cc'
99--- src/tabcache_xt.cc 2009-01-30 10:23:54 +0000
100+++ src/tabcache_xt.cc 2009-08-13 13:49:21 +0000
101@@ -345,7 +345,7 @@
102 if (!tc_fetch(file, ref_id, &seg, &page, offset, thread))
103 return FAILED;
104 #endif
105- page->tcp_lock_count++;
106+ xt_atomic_inc4(&page->tcp_lock_count);
107 xt_rwmutex_unlock(&seg->tcs_lock, thread->t_id);
108 *ret_page = page;
109 return OK;
110@@ -376,7 +376,7 @@
111 #endif
112
113 if (page->tcp_lock_count > 0)
114- page->tcp_lock_count--;
115+ xt_atomic_dec4(&page->tcp_lock_count);
116
117 xt_rwmutex_unlock(&seg->tcs_lock, thread->t_id);
118 }

Subscribers

People subscribed via source and target branches

to all changes: