/*
* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
* Author: Robert Smith, Jonathan Kans, Michael Kornbluh
*
* File Description:
*   Basic and Extended Cleanup of CSeq_entries, etc.
*
* ===========================================================================
*/

// All this functionality is packed into this one file for ease of 
// searching.  If it gets big enough, it will be broken up in the future.

#include <ncbi_pch.hpp>

#include <corelib/ncbi_autoinit.hpp>

#include <objects/misc/sequence_macros.hpp>
#include <objmgr/annot_ci.hpp>
#include <objmgr/feat_ci.hpp>
#include <objmgr/seqdesc_ci.hpp>
#include <objmgr/scope.hpp>
#include <objmgr/util/seq_loc_util.hpp>
#include <objtools/cleanup/cleanup_change.hpp>

#include <objtools/cleanup/cleanup.hpp>
#include "newcleanupp.hpp"

#include "cleanup_utils.hpp"

#include <objmgr/bioseq_ci.hpp>
#include <objmgr/object_manager.hpp>
#include <objmgr/scope.hpp>

#include <objects/medline/Medline_entry.hpp>
#include <objtools/edit/struc_comm_field.hpp>
#include <objtools/edit/gb_block_field.hpp>
#include <objects/valid/Comment_rule.hpp>

#include <util/ncbi_cache.hpp>
#include <util/sequtil/sequtil_convert.hpp>
#include <util/sequtil/sequtil_manip.hpp>
#include <util/xregexp/regexp.hpp>
#include <util/strsearch.hpp>

#include "autogenerated_cleanup.hpp"
#include "autogenerated_extended_cleanup.hpp"

BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)

// Out-of-class definition of the static cleanup version constant.
const int CNewCleanup_imp::NCBI_CLEANUP_VERSION = 1;

// We don't want to use CompressSpaces inside the likes of
// COMPRESS_STRING_MEMBER; we prefer our own version.  From here on, every
// textual use of "CompressSpaces" in this file is redirected to
// x_CompressSpaces.
#define CompressSpaces x_CompressSpaces

namespace {

    // A CRegexp bundled with its own mutex.  It also derives from CObject
    // so that it can be held through a CRef.
    class CRegexpWithLock : public CRegexp, public CObject {
    public:
        // Compiles "pattern" with the given compile flags.
        CRegexpWithLock( const CTempStringEx & pattern,
            CRegexp::TCompile flags )
            : CRegexp(pattern, flags)
        {
        }

        // Acquire the mutex associated with this regexp.
        void Lock(void)
        {
            m_mutex.Lock();
        }

        // Release the mutex associated with this regexp.
        void Unlock(void)
        {
            m_mutex.Unlock();
        }

    private:
        CMutex m_mutex;
    };
    typedef CRef<CRegexpWithLock> TRegexpWithLockRef;

    // Scoped lock holder: the wrapped object is locked when a CLockingRef
    // is constructed and unlocked when that CLockingRef is destroyed, so
    // the object can only be used (via operator->) while it is locked.
    //
    // TLockableObj must provide Lock() and Unlock() and be usable with CRef.
    template<typename TLockableObj>
    class CLockingRef {
    public:
        // Takes a reference to pLockableObj and locks it.
        explicit 
        CLockingRef(TLockableObj *pLockableObj) :
        m_pLockableObj(pLockableObj) 
        {
            m_pLockableObj->Lock();
        }

        // A copy acquires its own lock so that every destructor's Unlock()
        // is matched by exactly one Lock().  Without this, a copy (e.g. a
        // non-elided return by value) would unlock the object twice.
        // NOTE(review): copying on the thread that already holds the lock
        // assumes Lock() is re-entrant (as with a recursive mutex such as
        // CMutex) - confirm for any other TLockableObj.
        CLockingRef(const CLockingRef & other) :
        m_pLockableObj(other.m_pLockableObj)
        {
            m_pLockableObj->Lock();
        }

        // Releases the lock taken by the constructor.
        ~CLockingRef(void) { 
            m_pLockableObj->Unlock();
        }

        // Access to the locked object.
        TLockableObj * operator->(void) { return m_pLockableObj.GetPointer(); }

    private:
        // Assignment would have to release one lock and take another;
        // nothing needs it, so it is declared but intentionally never
        // defined.
        CLockingRef & operator=(const CLockingRef &);

        CRef<TLockableObj> m_pLockableObj;
    };
    typedef CLockingRef<CRegexpWithLock> CCachedRegexp;

    // Cache key: (pattern, compile flags).
    // Careful!  The pattern part of the key is a "const char *", so keys
    // are compared by *pointer value*, NOT by string contents.
    // For safety, only ever pass string literals as patterns.
    typedef pair<const char *, CRegexp::TCompile> TRegexpKey;
    typedef TRegexpWithLockRef TRegexpValue;

    // Tells the cache how to build the value for a key it has not seen.
    class CRegexpCacheHandler : 
        public CCacheElement_Handler<TRegexpKey, TRegexpValue>
    {
    public:
        // Compiles the pattern stored in the key, using the key's flags.
        TRegexpValue CreateValue(const TRegexpKey & regexp_key )
        {
            const char * const pattern = regexp_key.first;
            const CRegexp::TCompile flags = regexp_key.second;
            TRegexpValue compiled( new CRegexpWithLock(pattern, flags) );
            return compiled;
        }
    };
    
    // Cache of compiled regular expressions, keyed by (pattern, flags).
    class CRegexpCache {
    public:

        // The underlying cache is constructed with a capacity argument
        // of 100.
        CRegexpCache(void)
            : m_Cache(100)
        {
        }

        // Returns the cached regexp for (pattern, flags), locked for as
        // long as the returned CCachedRegexp lives.  "pattern" should be a
        // string literal because keys are compared by pointer.
        CCachedRegexp Get( const char * pattern, 
            CRegexp::TCompile flags = CRegexp::fCompile_default )
        {
            const TRegexpKey cache_key(pattern, flags);
            TRegexpWithLockRef cached_regexp = m_Cache[cache_key];
            return CCachedRegexp(cached_regexp.GetPointer());
        }

    private:
        typedef CCache<TRegexpKey, TRegexpValue,
            CRegexpCacheHandler> TUnderlyingCache;

        TUnderlyingCache m_Cache;
    };

    // the one cache instance used by this file
    CRegexpCache regexpCache;
}

// Constructor
//
// changes: where detected changes get recorded (see ChangeMade)
// options: bit flags; eClean_GpipeMode and eClean_SyncGenCodes are
//          decoded here into m_IsGpipe and m_SyncGenCodes.
CNewCleanup_imp::CNewCleanup_imp (CRef<CCleanupChange> changes, Uint4 options)
    : m_Changes(changes),
      m_Options(options),
      m_Objmgr(NULL),
      m_Scope(NULL),
      m_IsGpipe( (options & CCleanup::eClean_GpipeMode) != 0 ),
      m_SyncGenCodes( (options & CCleanup::eClean_SyncGenCodes) != 0 )
{
    // every instance works in its own fresh scope
    m_Objmgr = CObjectManager::GetInstance ();
    m_Scope.Reset (new CScope (*m_Objmgr));

    // Seed the stack with a default entry so that it is never empty
    // and "top()" always works.
    m_SeqEntryInfoStack.push( SSeqEntryInfo() );
}

// Destructor
CNewCleanup_imp::~CNewCleanup_imp (void)
{
    // no explicit teardown is performed here
}

// Main methods

void CNewCleanup_imp::BasicCleanupSeqEntry (
    CSeq_entry& se
)
{
    // CAutogeneratedCleanup is auto-generated code (created by datatool
    // from autogenerated_cleanup.txt).  It walks the CSeq_entry and calls
    // back into the functions of this class, which saves us from
    // hand-writing the error-prone traversal code.
    CAutogeneratedCleanup traverser( *m_Scope, *this );
    traverser.BasicCleanupSeqEntry( se );

    x_PostProcessing();

    // finally, give every Bioseq inside the entry a chance to have its
    // genetic codes synchronized
    EXPLORE_ALL_BIOSEQS_WITHIN_SEQENTRY (bioseq_it, se) {
        SetGeneticCode (*bioseq_it);
    }
}

void CNewCleanup_imp::BasicCleanupSeqSubmit (
    CSeq_submit& ss
)

{
    // Traverse the Seq-submit with the auto-generated traversal code,
    // which calls back into this class, then run post-processing.
    CAutogeneratedCleanup auto_cleanup( *m_Scope, *this );
    auto_cleanup.BasicCleanupSeqSubmit( ss );
    x_PostProcessing();

    // Only an "entrys" submission carries Seq-entries.  Asking for
    // SetEntrys() on any other variant would re-select the choice (and so
    // discard the submission's real data), and front() on an empty list is
    // undefined, so both cases are filtered out first.
    if ( ! ss.IsSetData() || ! ss.GetData().IsEntrys() ) {
        return;
    }
    if ( ss.GetData().GetEntrys().empty() ) {
        return;
    }

    // As before, genetic codes are synchronized for the first entry only.
    CRef<CSeq_entry> se (ss.SetData().SetEntrys().front());
    if (se.NotEmpty()) {
        EXPLORE_ALL_BIOSEQS_WITHIN_SEQENTRY (bit, *se) {
            CBioseq& bs = *bit;
            SetGeneticCode (bs);
        }
    }
}

void CNewCleanup_imp::BasicCleanupSeqAnnot (
    CSeq_annot& sa
)
{
    // A bare Seq-annot has no Seq-entry context, so the setup function
    // is skipped.
    CAutogeneratedCleanup traverser( *m_Scope, *this );
    traverser.BasicCleanupSeqAnnot( sa );

    x_PostProcessing();
}

void CNewCleanup_imp::BasicCleanupBioseq (
    CBioseq& bs
)
{
    // A bare Bioseq has no Seq-entry context, so the setup function
    // is skipped.
    CAutogeneratedCleanup traverser( *m_Scope, *this );
    traverser.BasicCleanupBioseq( bs );

    x_PostProcessing();

    // synchronize genetic codes last (no-op unless that option is on)
    SetGeneticCode (bs);
}

void CNewCleanup_imp::BasicCleanupBioseqSet (
    CBioseq_set& bss
)
{
    // A bare Bioseq-set has no Seq-entry context, so the setup function
    // is skipped.
    CAutogeneratedCleanup traverser( *m_Scope, *this );
    traverser.BasicCleanupBioseqSet( bss );

    x_PostProcessing();

    // then visit every Bioseq inside the set for genetic-code syncing
    EXPLORE_ALL_BIOSEQS_WITHIN_SEQSET (bioseq_it, bss) {
        SetGeneticCode (*bioseq_it);
    }
}

void CNewCleanup_imp::BasicCleanupSeqFeat (
    CSeq_feat& sf
)
{
    // A bare Seq-feat has no Seq-entry context, so the setup function
    // is skipped.
    CAutogeneratedCleanup traverser( *m_Scope, *this );
    traverser.BasicCleanupSeqFeat( sf );

    x_PostProcessing();
}


void CNewCleanup_imp::BasicCleanupBioSource (
    CBioSource& src
)
{
    // There is no Seq-entry context here, so the setup function is skipped.
    // The BioSource is cleaned by copying it into a temporary biosrc
    // feature, cleaning that feature, and copying the result back.
    CAutogeneratedCleanup traverser( *m_Scope, *this );

    CRef<CSeq_feat> carrier_feat( new CSeq_feat() );
    carrier_feat->SetData().SetBiosrc().Assign( src );

    traverser.BasicCleanupSeqFeat( *carrier_feat );
    x_PostProcessing();

    src.Assign( carrier_feat->GetData().GetBiosrc() );
}


void CNewCleanup_imp::BasicCleanupSeqEntryHandle (
    CSeq_entry_Handle& seh
)
{
    // Strategy: deep-copy the entry, clean the copy, then push the
    // cleaned content back through the edit handle.
    CRef<CSeq_entry> cleaned_entry( new CSeq_entry );
    cleaned_entry->Assign( *seh.GetCompleteSeq_entry() );

    CSeq_entry_EditHandle editor( seh );

    BasicCleanupSeqEntry( *cleaned_entry );

    // empty the handle's entry, then re-select whichever variant the
    // cleaned copy holds
    editor.SelectNone();
    if( cleaned_entry->IsSeq() ) {
        editor.SelectSeq( cleaned_entry->SetSeq() );
        return;
    }
    if( cleaned_entry->IsSet() ) {
        editor.SelectSet( cleaned_entry->SetSet() );
    }
}

void CNewCleanup_imp::BasicCleanupBioseqHandle (
    CBioseq_Handle& bsh
)
{
    // Strategy: deep-copy the Bioseq, clean the copy, then transfer the
    // cleaned parts one by one through the edit handle.
    CRef<CBioseq> cleaned( new CBioseq );
    cleaned->Assign( *bsh.GetCompleteBioseq() );

    CBioseq_EditHandle editor( bsh );

    BasicCleanupBioseq( *cleaned );

    // --- ids: drop the old ones, add those of the cleaned copy
    editor.ResetId();
    FOR_EACH_SEQID_ON_BIOSEQ( id_it, *cleaned ) {
        editor.AddId( CSeq_id_Handle::GetHandle(**id_it) );
    }

    // --- descr: drop, then set only if the cleaned copy has one
    editor.ResetDescr();
    if( cleaned->IsSetDescr() ) {
        editor.SetDescr( cleaned->SetDescr() );
    }

    // --- inst
    editor.SetInst( cleaned->SetInst() );

    // --- annots: remove the existing ones one at a time (re-checking the
    // live Bioseq each round), then attach the cleaned ones
    while( ! RAW_FIELD_IS_EMPTY_OR_UNSET( *bsh.GetCompleteBioseq(), Annot ) )  {
        CSeq_annot_CI first_annot( bsh );
        CSeq_annot_EditHandle( *first_annot ).Remove();
    }
    EDIT_EACH_SEQANNOT_ON_BIOSEQ( annot_it, *cleaned ) {
        editor.AttachAnnot( **annot_it );
    }
}

void CNewCleanup_imp::BasicCleanupBioseqSetHandle (
    CBioseq_set_Handle& bssh
)
{
    // Strategy: deep-copy the Bioseq-set, clean the copy, then transfer
    // the cleaned parts one by one through the edit handle.
    CRef<CBioseq_set> cleaned( new CBioseq_set );
    cleaned->Assign( *bssh.GetCompleteBioseq_set() );

    CBioseq_set_EditHandle editor( bssh );

    BasicCleanupBioseqSet( *cleaned );

    // Simple fields: reset on the handle, then copy over if the cleaned
    // set has the field.
#define BC_COPY_FIELD(Fld) \
    editor.Reset##Fld(); \
    if( cleaned->IsSet##Fld() ) { \
        editor.Set##Fld( cleaned->Set##Fld() ); \
    }

    BC_COPY_FIELD(Id);
    BC_COPY_FIELD(Coll);
    BC_COPY_FIELD(Level);
    BC_COPY_FIELD(Class);
    BC_COPY_FIELD(Release);
    BC_COPY_FIELD(Date);
    BC_COPY_FIELD(Descr);

#undef BC_COPY_FIELD

    // seq-set field: remove the existing entries one at a time
    // (re-checking the live set each round), then attach the cleaned ones
    while( ! RAW_FIELD_IS_EMPTY_OR_UNSET( *bssh.GetCompleteBioseq_set(), Seq_set ) )  {
        CSeq_entry_CI first_entry( bssh );
        CSeq_entry_EditHandle( *first_entry ).Remove();
    }
    EDIT_EACH_SEQENTRY_ON_SEQSET( entry_it, *cleaned ) {
        editor.AttachEntry( **entry_it );
    }

    // annot field: same remove-then-attach approach
    while( ! RAW_FIELD_IS_EMPTY_OR_UNSET( *bssh.GetCompleteBioseq_set(), Annot ) )  {
        CSeq_annot_CI first_annot( bssh );
        CSeq_annot_EditHandle( *first_annot ).Remove();
    }
    EDIT_EACH_SEQANNOT_ON_SEQSET( annot_it, *cleaned ) {
        editor.AttachAnnot( **annot_it );
    }
}

void CNewCleanup_imp::BasicCleanupSeqAnnotHandle (
    CSeq_annot_Handle& sah
)
{
    // Strategy: deep-copy the annot, clean the copy, then put the cleaned
    // copy in place of the original.
    CRef<CSeq_annot> cleaned_annot( new CSeq_annot );
    cleaned_annot->Assign( *sah.GetCompleteSeq_annot() );

    CSeq_annot_EditHandle editor( sah );

    BasicCleanupSeqAnnot( *cleaned_annot );

    // CSeq_annot_EditHandle has neither ".Set[Fld]()" methods nor a
    // Replace() method, so this is a little trickier than the other
    // handle-based cleanups.
    CSeq_entry_EditHandle parent_entry = editor.GetParentEntry();
    if( ! parent_entry ) {
        // not part of anything else: a simple swap will do
        CSeq_annot_Handle replacement = m_Scope->AddSeq_annot( *cleaned_annot );
        editor.Swap( replacement );
        return;
    }

    // attached to an entry: detach the old annot, attach the cleaned one,
    // and hand the new handle back to the caller
    editor.Remove();
    sah = parent_entry.AttachAnnot( *cleaned_annot );
}

void CNewCleanup_imp::BasicCleanupSeqFeatHandle (
    CSeq_feat_Handle& sfh
)
{
    // Strategy: deep-copy the original feature, clean the copy, then
    // replace the feature behind the handle with the cleaned copy.
    CRef<CSeq_feat> cleaned_feat( new CSeq_feat );
    cleaned_feat->Assign( *sfh.GetOriginalSeq_feat() );

    CSeq_feat_EditHandle editor( sfh );

    BasicCleanupSeqFeat( *cleaned_feat );

    editor.Replace( *cleaned_feat );
}

// Implementation methods

void CNewCleanup_imp::SetGeneticCode (
    CBioseq& bs
)
{
    // Only applies when genetic-code syncing was requested, and only to
    // nucleotide sequences.
    if ( ! m_SyncGenCodes  ||  ! bs.IsNa() ) return;

    CConstRef<CSeqdesc> source_desc = bs.GetClosestDescriptor(CSeqdesc::e_Source);
    if ( ! source_desc ) return;

    const CBioSource & bsrc = source_desc->GetSource();

    // "genome" tells us which of the organism's genetic codes applies
    // (plastid, mitochondrial or nuclear)
    const CBioSource::EGenome genome =
        bsrc.IsSetGenome()
            ? static_cast<CBioSource::EGenome>(bsrc.GetGenome())
            : CBioSource::eGenome_unknown;

    int wanted_code = 0;
    switch( genome )
    {
    case CBioSource::eGenome_chloroplast:
    case CBioSource::eGenome_chromoplast:
    case CBioSource::eGenome_plastid:
    case CBioSource::eGenome_cyanelle:
    case CBioSource::eGenome_apicoplast:
    case CBioSource::eGenome_leucoplast:
    case CBioSource::eGenome_proplastid:
    case CBioSource::eGenome_chromatophore:
        // plastid: use the plastid code, defaulting to 11 when it is
        // unset or not positive
        wanted_code = ( FIELD_CHAIN_OF_2_IS_SET(bsrc, Org, Pgcode) ?
                        bsrc.GetOrg().GetPgcode() : 0 );
        if( wanted_code <= 0 ) {
            wanted_code = 11;
        }
        break;

    case CBioSource::eGenome_kinetoplast:
    case CBioSource::eGenome_mitochondrion:
    case CBioSource::eGenome_hydrogenosome:
        // mitochondrial code
        wanted_code = ( FIELD_CHAIN_OF_2_IS_SET(bsrc, Org, Mgcode) ?
                        bsrc.GetOrg().GetMgcode() : 0 );
        break;

    default:
        // usually we want the nuc code
        wanted_code = ( FIELD_CHAIN_OF_2_IS_SET(bsrc, Org, Gcode) ?
                        bsrc.GetOrg().GetGcode() : 0 );
        break;
    }

    // zero means "no code known": leave the coding regions alone
    if ( wanted_code == 0 ) return;

    // Make each Cdregion's genetic code agree with the BioSource's
    // (unless the feature carries the matching exception text).
    CBioseq_Handle bsh = m_Scope->GetBioseqHandle(bs);
    if ( ! bsh ) return;

    SAnnotSelector cds_selector( CSeqFeatData::e_Cdregion );
    for( CFeat_CI cds_it( bsh, cds_selector ) ; cds_it ; ++cds_it ) {
        const CSeq_feat& old_feat = cds_it->GetOriginalFeature();
        const CCdregion& old_cds = old_feat.GetData().GetCdregion();

        const int current_code =
            ( old_cds.IsSetCode() ? old_cds.GetCode().GetId() : 0 );
        if( current_code == wanted_code ) {
            continue;  // already in sync
        }
        if( old_feat.HasExceptionText("genetic code exception") ) {
            continue;  // explicitly exempted
        }

        // build a corrected copy and swap it in through an edit handle
        CRef<CSeq_feat> fixed_feat(new CSeq_feat);
        fixed_feat->Assign(old_feat);
        CCdregion& fixed_cds = fixed_feat->SetData().SetCdregion();
        fixed_cds.ResetCode();
        fixed_cds.SetCode().SetId(wanted_code);

        CSeq_feat_EditHandle editor(*cds_it);
        editor.Replace(*fixed_feat);
        ChangeMade(CCleanupChange::eChangeGeneticCode);
    }
}

// Records change "e" in the change tracker, if one was supplied.
void CNewCleanup_imp::ChangeMade (CCleanupChange::EChanges e)
{
    if ( ! m_Changes ) {
        return;  // nobody is listening
    }
    m_Changes->SetChanged (e);
}

// Called on the way into a Seq-entry.  Every code path pushes exactly one
// SSeqEntryInfo onto m_SeqEntryInfoStack (LeavingEntry pops it again).
void CNewCleanup_imp::EnteringEntry (
    CSeq_entry& se
)
{
    // Inherit the parent's settings by default, then reset the two flags
    // that are decided per entry.
    SSeqEntryInfo entry_info;
    if( ! m_SeqEntryInfoStack.empty() ) {
        entry_info = m_SeqEntryInfoStack.top();
    }
    entry_info.m_IsEmblOrDdbj = false;
    entry_info.m_StripSerial = true;

    // For cleanup of Seq-entry and Seq-submit, set scope and parentize.
    // If the scope already has a handle for this entry, it has been set
    // up before: just repeat the parent's info and stop.
    {{
         CSeq_entry_Handle known_entry =
             m_Scope->GetSeq_entryHandle(se, CScope::eMissing_Null);
         if (known_entry) {
             // all code paths in this function must result
             // in m_SeqEntryInfoStack getting a "push"
             m_SeqEntryInfoStack.push( m_SeqEntryInfoStack.top() );
             return;
         }

         m_Scope->AddTopLevelSeqEntry (se);
         se.Parentize();
     }}

    // A few differences based on the type of the sequence identifiers
    // found on any Bioseq inside the entry.
    VISIT_ALL_BIOSEQS_WITHIN_SEQENTRY (bioseq_it, se) {
        const CBioseq& bioseq = *bioseq_it;
        FOR_EACH_SEQID_ON_BIOSEQ (id_it, bioseq) {
            const CSeq_id& seq_id = **id_it;
            SWITCH_ON_SEQID_CHOICE (seq_id) {
                case NCBI_SEQID(Genbank):
                case NCBI_SEQID(Tpg):
                    {
                        // six-character accessions keep their serials
                        const CTextseq_id& text_id = *GET_FIELD (seq_id, Textseq_Id);
                        if (FIELD_IS_SET (text_id, Accession)  &&
                            GET_FIELD (text_id, Accession).length() == 6) {
                            entry_info.m_StripSerial = false;
                        }
                    }
                    break;

                case NCBI_SEQID(Embl):
                case NCBI_SEQID(Ddbj):
                    entry_info.m_StripSerial = false;
                    entry_info.m_IsEmblOrDdbj = true;
                    break;

                case NCBI_SEQID(Gibbsq):
                case NCBI_SEQID(Gibbmt):
                case NCBI_SEQID(Pir): 
                case NCBI_SEQID(Swissprot):
                case NCBI_SEQID(Patent):
                case NCBI_SEQID(Prf):
                case NCBI_SEQID(Pdb):
                case NCBI_SEQID(Gpipe):
                case NCBI_SEQID(Tpe):
                case NCBI_SEQID(Tpd):
                    entry_info.m_StripSerial = false;
                    break;

                default:
                    // not_set, Local, Other, General and anything else
                    // not listed above: nothing changes
                    break;
            }
        }
    }

    m_SeqEntryInfoStack.push(entry_info);
}

// Called on the way out of a Seq-entry: discards the info record that
// EnteringEntry pushed.  The entry itself is not consulted.
void CNewCleanup_imp::LeavingEntry (
    CSeq_entry& se
)
{
    m_SeqEntryInfoStack.pop();
}

// Compacts whitespace in "str", in place:
//  - of a run of spaces/tabs, only the first character is kept;
//  - spaces/tabs directly after '(' are dropped;
//  - a kept space/tab that ends up directly before ')' or ',' is dropped.
// Records eTrimSpaces if the length of the string changed.
void CNewCleanup_imp::x_StripSpacesMarkChanged(string& str)
{
    if (str.empty()) {
        return;
    }

    const string::size_type original_len = str.length();

    string::size_type read_pos = 0;   // next character to examine
    string::size_type write_pos = 0;  // next slot of the compacted text
    while (read_pos < original_len) {
        const char current = str[read_pos++];
        str[write_pos++] = current;

        const bool starts_skip =
            (current == ' ')  ||  (current == '\t')  ||  (current == '(');
        if ( ! starts_skip ) {
            continue;
        }

        // swallow any spaces/tabs that follow
        while ( read_pos < original_len  &&
                (str[read_pos] == ' '  ||  str[read_pos] == '\t') ) {
            ++read_pos;
        }

        // If a ')' or ',' comes next, take back the space/tab just
        // written.  A '(' is never taken back: this protects against the
        // case "(...bunch of spaces and tabs...)", where the '(' would
        // otherwise be unintentionally erased.
        const bool closer_follows =
            read_pos < original_len  &&
            (str[read_pos] == ')'  ||  str[read_pos] == ',');
        if ( closer_follows  &&  str[write_pos - 1] != '(' ) {
            --write_pos;
        }
    }
    str.erase(write_pos);

    if( str.length() != original_len ) {
        ChangeMade(CCleanupChange::eTrimSpaces);
    }
}

// Runs RemoveSpacesBetweenTildes on "str" and records eTrimSpaces when it
// reports that it changed something.
void CNewCleanup_imp::x_RemoveSpacesBetweenTildesMarkChanged( std::string & str )
{
    const bool was_changed = RemoveSpacesBetweenTildes(str);
    if( was_changed ) {
        ChangeMade(CCleanupChange::eTrimSpaces);
    }
}

// Truncates spaces in "str" in place (via NStr::TruncateSpacesInPlace) and
// records eTrimSpaces when that changed the string's length.
void CNewCleanup_imp::x_TruncateSpacesMarkChanged( std::string & str )
{
    const size_t length_before = str.length();

    NStr::TruncateSpacesInPlace(str);

    if( str.length() == length_before ) {
        return;  // nothing was trimmed
    }
    ChangeMade(CCleanupChange::eTrimSpaces);
}

// Gives a Bioseq-set whose class is unset, not_set or other a concrete
// class (nuc_prot or genbank), and records the change.
void CNewCleanup_imp::SeqsetBC (
    CBioseq_set& bss
)
{
    // sets that already carry a meaningful class are left alone
    const bool class_is_missing =
        ! FIELD_IS_SET(bss, Class) || 
        GET_FIELD(bss, Class) == CBioseq_set::eClass_not_set || 
        GET_FIELD(bss, Class) == CBioseq_set::eClass_other;
    if( ! class_is_missing ) {
        return;
    }

    int nuc_count = 0;
    int prot_count = 0;
    bool force_genbank = false;

    CBioseq_set_Handle set_handle = m_Scope->GetBioseq_setHandle( bss );
    if( set_handle ) {
        // count the main-level nucleotide and protein Bioseqs
        for( CBioseq_CI bioseq_it( set_handle, CSeq_inst::eMol_not_set, CBioseq_CI::eLevel_Mains ) ;
             bioseq_it ; ++bioseq_it )
        {
            if( bioseq_it->IsAa() ) {
                ++prot_count;
            } else if( bioseq_it->IsNa() ) {
                ++nuc_count;
            }
        }

        // Iterate descendent Bioseq_set's.
        // Since there seems to be no such thing as CBioseq_set_CI,
        // we iterate over the Seq-entry's since every Bioseq-set should
        // be guaranteed to be in a Seq-entry.
        for( CSeq_entry_CI entry_it( set_handle ) ; entry_it ; ++entry_it ) {
            if( ! entry_it->IsSet() ) {
                continue;
            }
            CBioseq_set_Handle child_set = entry_it->GetSet();
            const bool child_is_segset_or_parts =
                FIELD_EQUALS(child_set, Class, NCBI_BIOSEQSETCLASS(segset)) ||
                FIELD_EQUALS(child_set, Class, NCBI_BIOSEQSETCLASS(parts));
            if( ! child_is_segset_or_parts ) {
                force_genbank = true;
            }
        }

        // separate check needed for top level due to the somewhat kludgy
        // way we iterate over CBioseq-sets
        // NOTE(review): this branch is only reached when bss has no
        // meaningful class, so if set_handle reflects bss this test looks
        // like it always succeeds - confirm whether that is intended.
        const bool top_is_segset_or_parts =
            FIELD_EQUALS(set_handle, Class, NCBI_BIOSEQSETCLASS(segset)) ||
            FIELD_EQUALS(set_handle, Class, NCBI_BIOSEQSETCLASS(parts));
        if( ! top_is_segset_or_parts ) {
            force_genbank = true;
        }
    }

    // exactly one nucleotide plus at least one protein makes a nuc-prot
    // set, unless something above demanded genbank
    const bool is_nuc_prot =
        (nuc_count == 1) && (prot_count > 0) && ! force_genbank;
    bss.SetClass( is_nuc_prot ? CBioseq_set::eClass_nuc_prot
                              : CBioseq_set::eClass_genbank );
    ChangeMade(CCleanupChange::eChangeBioseqSetClass);
}

// Trims surrounding spaces from the string of a local, string-valued
// Seq-id.  Every other kind of Seq-id is left untouched.
void CNewCleanup_imp::SeqIdBC( CSeq_id &seq_id )
{
    // only "local" ids expose a CObject_id that we look at
    if( ! seq_id.IsLocal() ) {
        return;
    }
    CObject_id & local_id = GET_MUTABLE(seq_id, Local);

    // currently, we only process the Str ones
    if( ! FIELD_IS(local_id, Str) ) {
        return;
    }

    x_TruncateSpacesMarkChanged( GET_MUTABLE(local_id, Str) );
}

// Table mapping OrgMod names to OrgMod subtypes.  The static map built
// from it compares keys case-insensitively (PNocase).
// NOTE(review): CStaticArrayMap presumably needs the entries to stay
// sorted by key under that comparator - keep new entries in order; confirm.
typedef SStaticPair<const char*, TORGMOD_SUBTYPE>  TOrgModElem;
static const TOrgModElem sc_orgmod_map[] = {
    { "Acronym",            NCBI_ORGMOD(acronym) },
    { "Anamorph",           NCBI_ORGMOD(anamorph) },
    { "Authority",          NCBI_ORGMOD(authority) },
    { "Bio-material",       NCBI_ORGMOD(bio_material) },
    { "Biotype",            NCBI_ORGMOD(biotype) },
    { "Biovar",             NCBI_ORGMOD(biovar) },
    { "Breed",              NCBI_ORGMOD(breed) },
    { "Chemovar",           NCBI_ORGMOD(chemovar) },
    { "Common",             NCBI_ORGMOD(common) },
    { "Cultivar",           NCBI_ORGMOD(cultivar) },
    { "Culture-collection", NCBI_ORGMOD(culture_collection)  },
    { "Ecotype",            NCBI_ORGMOD(ecotype) },
    { "Forma",              NCBI_ORGMOD(forma) },
    { "Forma-specialis",    NCBI_ORGMOD(forma_specialis) },
    { "Group",              NCBI_ORGMOD(group) },
    { "Host",               NCBI_ORGMOD(nat_host) },
    { "Isolate",            NCBI_ORGMOD(isolate) },
    { "Metagenome-source",  NCBI_ORGMOD(metagenome_source) },
    { "Pathovar",           NCBI_ORGMOD(pathovar) },
    { "Serogroup",          NCBI_ORGMOD(serogroup) },
    { "Serotype",           NCBI_ORGMOD(serotype) },
    { "Serovar",            NCBI_ORGMOD(serovar) },
    { "Specimen-voucher",   NCBI_ORGMOD(specimen_voucher) },
    { "Strain",             NCBI_ORGMOD(strain) },
    { "Sub-species",        NCBI_ORGMOD(sub_species) },
    { "Subgroup",           NCBI_ORGMOD(subgroup) },
    { "Substrain",          NCBI_ORGMOD(substrain) },
    { "Subtype",            NCBI_ORGMOD(subtype) },
    { "Synonym",            NCBI_ORGMOD(synonym) },
    { "Teleomorph",         NCBI_ORGMOD(teleomorph) },
    { "Type",               NCBI_ORGMOD(type) },
    { "Variety",            NCBI_ORGMOD(variety) }
};
typedef CStaticArrayMap<string, TORGMOD_SUBTYPE, PNocase> TOrgModMap;
DEFINE_STATIC_ARRAY_MAP_WITH_COPY(TOrgModMap, sc_OrgModMap, sc_orgmod_map);

// Alternative spellings that map onto existing OrgMod subtypes
// (looked up case-insensitively, like the main OrgMod table).
// NOTE(review): entries presumably must stay sorted by key - confirm.
static const TOrgModElem sc_orgmodalias_map[] = {
    { "nat-host",      NCBI_ORGMOD(nat_host) },
    { "specific-host", NCBI_ORGMOD(nat_host) },
    { "sub-strain",     NCBI_ORGMOD(substrain) },
    { "subspecies",    NCBI_ORGMOD(sub_species) }
};
DEFINE_STATIC_ARRAY_MAP_WITH_COPY(TOrgModMap, sc_OrgModAliasMap, sc_orgmodalias_map);

// Table mapping SubSource names to SubSource subtypes.  The static map
// built from it compares keys case-insensitively (PNocase).
// The element type uses TSUBSOURCE_SUBTYPE so that it agrees with
// TSubsourceMap and with the NCBI_SUBSOURCE values stored in the table.
// NOTE(review): CStaticArrayMap presumably needs the entries to stay
// sorted by key under that comparator - keep new entries in order; confirm.
typedef SStaticPair<const char*, TSUBSOURCE_SUBTYPE>  TSubsourceElem;
static const TSubsourceElem sc_subsource_map[] = {
    { "Cell-line",             NCBI_SUBSOURCE(cell_line) },
    { "Cell-type",             NCBI_SUBSOURCE(cell_type) },
    { "Chromosome",            NCBI_SUBSOURCE(chromosome) },
    { "Clone",                 NCBI_SUBSOURCE(clone) },
    { "Clone-lib",             NCBI_SUBSOURCE(clone_lib) },
    { "Collected-by",          NCBI_SUBSOURCE(collected_by) },
    { "Collection-date",       NCBI_SUBSOURCE(collection_date) },
    { "Country",               NCBI_SUBSOURCE(country) },
    { "Dev-stage",             NCBI_SUBSOURCE(dev_stage) },
    { "Endogenous-virus-name", NCBI_SUBSOURCE(endogenous_virus_name) },
    { "Environmental-sample",  NCBI_SUBSOURCE(environmental_sample) },
    { "Frequency",             NCBI_SUBSOURCE(frequency) },
    { "Genotype",              NCBI_SUBSOURCE(genotype) },
    { "Germline",              NCBI_SUBSOURCE(germline) },
    { "Haplogroup",            NCBI_SUBSOURCE(haplogroup) },
    { "Haplotype",             NCBI_SUBSOURCE(haplotype) },
    { "Identified-by",         NCBI_SUBSOURCE(identified_by) },
    { "Isolation-source",      NCBI_SUBSOURCE(isolation_source) },
    { "Lab-host",              NCBI_SUBSOURCE(lab_host) },
    { "Lat-Lon",               NCBI_SUBSOURCE(lat_lon) },
    { "Linkage-group",         NCBI_SUBSOURCE(linkage_group) },
    { "Map",                   NCBI_SUBSOURCE(map) },
    { "Mating-type",           NCBI_SUBSOURCE(mating_type) },
    { "Metagenomic",           NCBI_SUBSOURCE(metagenomic) },
    { "Plasmid-name",          NCBI_SUBSOURCE(plasmid_name) },
    { "Pop-variant",           NCBI_SUBSOURCE(pop_variant) },
    { "Rearranged",            NCBI_SUBSOURCE(rearranged) },
    { "Segment",               NCBI_SUBSOURCE(segment) },
    { "Sex",                   NCBI_SUBSOURCE(sex) },
    { "Subclone",              NCBI_SUBSOURCE(subclone) },
    { "Tissue-lib",            NCBI_SUBSOURCE(tissue_lib) },
    { "Tissue-type",           NCBI_SUBSOURCE(tissue_type) },
    { "Transgenic",            NCBI_SUBSOURCE(transgenic) }
};
typedef CStaticArrayMap<string, TSUBSOURCE_SUBTYPE, PNocase> TSubsourceMap;
DEFINE_STATIC_ARRAY_MAP_WITH_COPY(TSubsourceMap, sc_SubsourceMap, sc_subsource_map);

// Additional names that map onto SubSource subtypes (looked up
// case-insensitively, like the main SubSource table).
// NOTE(review): entries presumably must stay sorted by key - confirm.
static const TSubsourceElem sc_subsourcealias_map[] = {
    { "fwd-primer-name",    NCBI_SUBSOURCE(fwd_primer_name) },
    { "fwd-primer-seq",     NCBI_SUBSOURCE(fwd_primer_seq) },
    { "Lat-long",           NCBI_SUBSOURCE(lat_lon) },
    { "Latitude-Longitude", NCBI_SUBSOURCE(lat_lon) },
    { "rev-primer-name",    NCBI_SUBSOURCE(rev_primer_name) },
    { "rev-primer-seq",     NCBI_SUBSOURCE(rev_primer_seq) },
    { "sub-clone",          NCBI_SUBSOURCE(subclone) }
};
DEFINE_STATIC_ARRAY_MAP_WITH_COPY(TSubsourceMap, sc_SubsourceAliasMap, sc_subsourcealias_map);

// Searches "target" for search_pattern and replaces each match with
// "replacement", at most max_replace times (0 means unlimited).
// The result is stored back into "target".
//
// Example: 
//     string foo = "Test:   FOO   BAR    :BAZ."
//     s_RegexpReplace( foo, ":[ ]+", ": " );
// This turns foo into "Test: FOO   BAR    :BAZ."
//
// Returns "true" if at least one replacement was done.

static const int s_RegexpReplace_UnlimitedReplacements = 0;

static
bool s_RegexpReplace( string &target, 
    const char *search_pattern, 
    const char *replacement,
    int max_replace = s_RegexpReplace_UnlimitedReplacements,
    CRegexp::ECompile compile_flags = CRegexp::fCompile_default )
{
    CRegexpUtil regexp_util( target );

    const int replaced_count = regexp_util.Replace(
        search_pattern, replacement,
        compile_flags, CRegexp::fMatch_default, max_replace );

    // hand the result over by swapping, which is faster than assignment
    regexp_util.GetResult().swap( target ); 

    return replaced_count > 0;
}

// Three-way lexicographical comparison of [first1, last1) against
// [first2, last2).  "compare" is called on element pairs and must return
// zero for "equal"; the first non-zero result is returned as is.
// If one range is a prefix of the other, the shorter range sorts first
// (-1 if the first range is shorter, 1 if it is longer, 0 if equal).
//
// This is similar to lexicographical_compare_3way, but we have to
// implement it ourselves because that one is an SGI extension, not in
// the standard.
template <class Iter1, class Iter2, class Compare>
static int ncbi_lexicographical_compare_3way( 
    Iter1 first1, Iter1 last1, 
    Iter2 first2, Iter2 last2, 
    Compare compare )
{
    // walk the common prefix
    while( first1 != last1 && first2 != last2 ) {
        const int element_result = compare( *first1, *first2 );
        if( element_result != 0 ) {
            return element_result;
        }
        ++first1;
        ++first2;
    }

    // at least one range is exhausted at this point
    const bool first_done = ( first1 == last1 );
    const bool second_done = ( first2 == last2 );
    if( first_done && second_done ) {
        return 0; // same length, all elements equal
    }
    return ( first_done ? -1 : 1 ); // the shorter range sorts first
}

// Case-insensitive equality of two characters, based on toupper.
class PNocase_EqualChar
{
public:
    bool operator()( const char ch1, const char ch2 ) const {
        // toupper is only defined for values representable as
        // unsigned char (or EOF), so convert before calling it
        return toupper(static_cast<unsigned char>(ch1)) ==
               toupper(static_cast<unsigned char>(ch2));
    }
};

// Case-insensitive "less than" on two characters, based on toupper.
class PNocase_LessChar
{
public:
    bool operator()( const char ch1, const char ch2 ) const {
        // toupper is only defined for values representable as
        // unsigned char (or EOF), so convert before calling it
        return toupper(static_cast<unsigned char>(ch1)) <
               toupper(static_cast<unsigned char>(ch2));
    }
};

// Case-insensitive three-way comparison of two characters, based on
// toupper: negative, zero or positive as ch1 sorts before, equal to or
// after ch2.
class PNocase_CompareChar
{
public:
    int operator()( const char ch1, const char ch2 ) const {
        // toupper is only defined for values representable as
        // unsigned char (or EOF), so convert before calling it
        const int upper1 = toupper(static_cast<unsigned char>(ch1));
        const int upper2 = toupper(static_cast<unsigned char>(ch2));
        return ( upper1 - upper2 );
    }
};

// C compares using toupper, as opposed to the built-in
// stuff which seems to use tolower, thus producing
// some differences in sorting order in some places.
// Once we've fully moved away from C there's probably
// no harm in replacing all calls to s_CompareNoCaseCStyle with
// normal functions like NStr::CompareNocase()
static
int s_CompareNoCaseCStyle( const string &s1, const string &s2 ) 
{
    return ncbi_lexicographical_compare_3way(
            s1.begin(), s1.end(), 
            s2.begin(), s2.end(), 
            PNocase_CompareChar() );
}

// Returns the name that goes with the BioSource's genome value
// ("apicoplast", "chloroplast", "chromoplast", "kinetoplast",
// "leucoplast", "plastid" or "proplastid"), or kEmptyStr for any other
// genome value.  The returned reference stays valid for the life of the
// program because every name is a function-local static.
static
const string &s_GenomeToPlastidName( const CBioSource& biosrc )
{
    SWITCH_ON_BIOSOURCE_GENOME (biosrc) {
    case NCBI_GENOME(apicoplast): {
        static const string kApicoplast("apicoplast");
        return kApicoplast;
    }
    case NCBI_GENOME(chloroplast): {
        static const string kChloroplast("chloroplast");
        return kChloroplast;
    }
    case NCBI_GENOME(chromoplast): {
        static const string kChromoplast("chromoplast");
        return kChromoplast;
    }
    case NCBI_GENOME(kinetoplast): {
        static const string kKinetoplast("kinetoplast");
        return kKinetoplast;
    }
    case NCBI_GENOME(leucoplast): {
        static const string kLeucoplast("leucoplast");
        return kLeucoplast;
    }
    case NCBI_GENOME(plastid): {
        static const string kPlastid("plastid");
        return kPlastid;
    }
    case NCBI_GENOME(proplastid): {
        static const string kProplastid("proplastid");
        return kProplastid;
    }
    default:
        break;
    }

    // no name for this genome value
    return kEmptyStr;
}

// Strips "prefix" from the front of "str" when str begins with it
// (compared using case_to_use).
// Returns true if the prefix was removed, false if str was left untouched.
static
bool s_RemoveInitial( string &str, const string &prefix, NStr::ECase case_to_use )
{
    if( ! NStr::StartsWith( str, prefix, case_to_use ) ) {
        return false;
    }

    str.erase( 0, prefix.length() );
    return true;
}

// Given the position of the opening paren in a string, this returns
// the position of the closing paren (keeping track of any nested parens
// in the middle).
// It returns NPOS if the paren is not closed.
// This function is not currently smart; it doesn't know about quotes
// or anything
static
SIZE_TYPE s_MatchingParenPos( const string &str, SIZE_TYPE open_paren_pos )
{
    // check that the index is in range before using it to read the string
    _ASSERT( open_paren_pos < str.length() );
    _ASSERT( str[open_paren_pos] == '(' );

    // nesting level. start at 1 since we know there's an open paren
    int level = 1;

    for( SIZE_TYPE pos = open_paren_pos + 1; pos < str.length(); ++pos ) {
        switch( str[pos] ) {
            case '(':
                // nesting deeper
                ++level;
                break;
            case ')':
                // closed a level of nesting
                --level;
                if( 0 == level ) {
                    // reached the top: we're closing the initial paren,
                    // so we return our position
                    return pos;
                }
                break;
            default:
                // ignore other characters.
                // maybe in the future we'll handle ignoring parens in quotes or
                // things like that.
                break;
        }
    }

    // ran off the end of the string without closing the initial paren
    return NPOS;
}

// "Less than" predicate for accession strings: true when str1 orders
// before str2 in a case-insensitive comparison.
static bool s_AccessionCompare (
    const string& str1,
    const string& str2
)

{
    const int order = NStr::CompareNocase( str1, str2 );
    return order < 0;
}

// Equality predicate for accession strings: true when the two strings
// are the same ignoring case.
static bool s_AccessionEqual (
    const string& str1,
    const string& str2
)

{
    return NStr::EqualNocase (str1, str2);
}

// Basic cleanup of a GenBank block: cleans the extra-accession and keyword
// lists and the free-text members (Source, Origin, Date, Div, Taxonomy).
// Changes made directly here are reported through ChangeMade().
void CNewCleanup_imp::GBblockBC (
    CGB_block& gbk
)

{
    CLEAN_STRING_LIST (gbk, Extra_accessions);

    // extra accessions: sort, then drop duplicates (both ignoring case)
    if (! EXTRAACCN_ON_GENBANKBLOCK_IS_SORTED (gbk, s_AccessionCompare)) {
        SORT_EXTRAACCN_ON_GENBANKBLOCK (gbk, s_AccessionCompare);
        ChangeMade (CCleanupChange::eCleanQualifiers);
    }

    if (! EXTRAACCN_ON_GENBANKBLOCK_IS_UNIQUE (gbk, s_AccessionEqual)) {
        UNIQUE_EXTRAACCN_ON_GENBANKBLOCK (gbk, s_AccessionEqual);
        ChangeMade (CCleanupChange::eCleanQualifiers);
    }

    CLEAN_STRING_LIST (gbk, Keywords);

    // keywords that are exactly "tpa:reassembly" or "tpa_reassembly"
    // (any case) lose the "re", leaving "...assembly"
    CCachedRegexp reassembly_regex = regexpCache.Get(
        "^tpa[:_]reassembly$", 
        CRegexp::fCompile_ignore_case );
    // gbk is a CGB_block, so iterate with the GenBank-block macro
    EDIT_EACH_KEYWORD_ON_GENBANKBLOCK(keyword_it, gbk) {
        string & sKeyword = *keyword_it;
        if( reassembly_regex->IsMatch(sKeyword) ) {
            // remove the "re" in "reassembly"
            // (offset 4 is just past the 3-letter "tpa" and the separator)
            sKeyword.erase(4, 2);
            ChangeMade (CCleanupChange::eCleanQualifiers);
        }
    }

    // remove duplicate keywords without reordering them; the comparison is
    // case-sensitive for EMBL/DDBJ records and case-insensitive otherwise
    if( m_SeqEntryInfoStack.top().m_IsEmblOrDdbj ) {
        UNIQUE_WITHOUT_SORT_KEYWORD_ON_GENBANKBLOCK( gbk, PCase );
    } else {
        UNIQUE_WITHOUT_SORT_KEYWORD_ON_GENBANKBLOCK( gbk, PNocase );
    }

    // Source and Origin are removed entirely when all that's left is "."
    CLEAN_STRING_MEMBER_JUNK (gbk, Source);
    if( FIELD_EQUALS(gbk, Source, ".") ) {
        RESET_FIELD(gbk, Source);
        ChangeMade(CCleanupChange::eRemoveQualifier);
    }
    CLEAN_STRING_MEMBER_JUNK (gbk, Origin);
    if( FIELD_EQUALS(gbk, Origin, ".") ) {
        RESET_FIELD(gbk, Origin);
        ChangeMade(CCleanupChange::eRemoveQualifier);
    }

    CLEAN_STRING_MEMBER (gbk, Date);
    CLEAN_STRING_MEMBER (gbk, Div);
    CLEAN_STRING_MEMBER (gbk, Taxonomy);
}

// Basic cleanup of an EMBL block: cleans the extra-accession and keyword
// string lists, keeps extra accessions sorted and unique, and removes
// duplicate keywords.
void CNewCleanup_imp::EMBLblockBC (
    CEMBL_block& emb
)

{
    CLEAN_STRING_LIST (emb, Extra_acc);

    // extra accessions: sort, then drop duplicates (both ignoring case,
    // see s_AccessionCompare / s_AccessionEqual)
    if (! EXTRAACCN_ON_EMBLBLOCK_IS_SORTED (emb, s_AccessionCompare)) {
        SORT_EXTRAACCN_ON_EMBLBLOCK (emb, s_AccessionCompare);
        ChangeMade (CCleanupChange::eCleanQualifiers);
    }

    if (! EXTRAACCN_ON_EMBLBLOCK_IS_UNIQUE (emb, s_AccessionEqual)) {
        UNIQUE_EXTRAACCN_ON_EMBLBLOCK (emb, s_AccessionEqual);
        ChangeMade (CCleanupChange::eCleanQualifiers);
    }

    CLEAN_STRING_LIST (emb, Keywords);

    // duplicate keywords are removed using the PCase predicate
    // NOTE(review): no ChangeMade() call accompanies this step here -
    // confirm whether the macro reports changes itself
    UNIQUE_WITHOUT_SORT_KEYWORD_ON_EMBLBLOCK (emb, PCase);
}


// Give it a map that maps case-insensitive string to some other type, 
// and it will return any matches that are a prefix for str.
// For example, if you have a mapping that includes ("foo" to 7), then passing
// str as "Foo something", will return the ("foo" to 7) mapping.
template< typename TMapType >
typename TMapType::const_iterator s_FindInMapAsPrefix( const string &str_arg, const TMapType &the_map )
{
    // holds the str we're looking at, which might be str_arg, or
    // might be another string constructed from it
    const string *str = &str_arg;

    // use this to delete strings created in this function, if any.
    // we don't read from it directly
    auto_ptr<string> temp_str;

    // chop off characters that can't be in the map, so they don't count
    SIZE_TYPE first_bad_char = 0;
    for( ; first_bad_char < str_arg.length(); ++first_bad_char ) {
        const char ch = str_arg[first_bad_char];
        if( ! isalnum(ch) && ch != '-' && ch != '_' && ch != ' ' ) {
            temp_str.reset( new string(str_arg, 0, first_bad_char) );
            str = temp_str.get();
            break;
        }
    }

    typename TMapType::const_iterator it = the_map.lower_bound( *str );
    if( it != the_map.begin() && ( it == the_map.end() || ! NStr::EqualNocase(*str, it->first) ) ) {
        --it;
    }
    if ( it != the_map.end() && NStr::StartsWith(*str, it->first, NStr::eNocase)) {
        return it;
    }
    return the_map.end();
}

// Same idea as s_FindInMapAsPrefix, but for set-like containers whose
// elements are the strings themselves: returns an iterator to an element
// that is a case-insensitive prefix of str, or the_set.end() if none
// is found.  (Unlike the map version, str is used as given.)
template< typename TSetType >
typename TSetType::const_iterator s_FindInSetAsPrefix( const string &str, const TSetType &the_set )
{
    typedef typename TSetType::const_iterator TIter;

    TIter candidate = the_set.lower_bound( str );

    // if lower_bound didn't land on an exact (case-insensitive) match,
    // the element before it is the one that could be a prefix
    const bool exact_match =
        ( candidate != the_set.end() && NStr::EqualNocase(str, *candidate) );
    if( ! exact_match && candidate != the_set.begin() ) {
        --candidate;
    }

    if( candidate == the_set.end() ) {
        return the_set.end();
    }
    if( ! NStr::StartsWith(str, *candidate, NStr::eNocase) ) {
        return the_set.end();
    }
    return candidate;
}

// copy "str" because we're changing it anyway
// returns true if we found anything
static
bool s_StringHasOrgModPrefix(const string &str, string::size_type &out_val_start_pos, TORGMOD_SUBTYPE &out_subtype)
{
    bool found_something = false;

    TOrgModMap::const_iterator orgmod_it = s_FindInMapAsPrefix<TOrgModMap>( str, sc_OrgModMap );
    if( orgmod_it != sc_OrgModMap.end() && orgmod_it->second != NCBI_ORGMOD(nat_host) ) {
        out_val_start_pos = orgmod_it->first.length();
        out_subtype = orgmod_it->second;
        found_something = true;
    } else {
        TOrgModMap::const_iterator orgmodalias_it = s_FindInMapAsPrefix<TOrgModMap>( str, sc_OrgModAliasMap );
        if( orgmodalias_it != sc_OrgModAliasMap.end() && orgmodalias_it->second != NCBI_ORGMOD(nat_host) ) {
            out_val_start_pos = orgmodalias_it->first.length();
            out_subtype = orgmodalias_it->second;
            found_something = true;
        }
    }

    if( (! found_something) && ( str.find_first_of("-") != string::npos ) ) { 
        string new_str = str;
        NStr::ReplaceInPlace( new_str, "-", "_" );
        return s_StringHasOrgModPrefix( new_str, out_val_start_pos, out_subtype );
    }

    if( found_something ) {
        // move out_val_start_pos to where the val begins, since we're probably on an equal sign or something
        out_val_start_pos = str.find_first_not_of("=: ", out_val_start_pos);
        if( string::npos == out_val_start_pos ) {
            out_val_start_pos = str.length();
        }
    }

    return found_something;
}

// returns true if we found anything
static
bool s_StringHasSubSourcePrefix(const string &str, string::size_type &out_val_start_pos, TSUBSOURCE_SUBTYPE &out_subtype)
{
    bool found_something = false;

    // We check alias before regular because regular contains "Lat-Lon" which is a strict prefix
    // of "Lat-Long" in alias.
    TOrgModMap::const_iterator subsrcalias_it = s_FindInMapAsPrefix<TSubsourceMap>( str, sc_SubsourceAliasMap );
    if( subsrcalias_it != sc_SubsourceAliasMap.end() ) {
        out_val_start_pos = subsrcalias_it->first.length();
        out_subtype = subsrcalias_it->second;
        found_something = true;
    } else {
        TOrgModMap::const_iterator subsrc_it = s_FindInMapAsPrefix<TSubsourceMap>( str, sc_SubsourceMap );
        if( subsrc_it != sc_SubsourceMap.end() ) {
            out_val_start_pos = subsrc_it->first.length();
            out_subtype = subsrc_it->second;
            found_something = true;
        }
    }

    if( (! found_something) && ( str.find_first_of("-") != string::npos ) ) { 
        string new_str = str;
        NStr::ReplaceInPlace( new_str, "-", "_" );
        return s_StringHasSubSourcePrefix( new_str, out_val_start_pos, out_subtype );
    }

    if( found_something ) {
        // move out_val_start_pos to where the val begins, since we're probably on an equal sign or something
        out_val_start_pos = str.find_first_not_of("=: ", out_val_start_pos);
        if( string::npos == out_val_start_pos ) {
            out_val_start_pos = str.length();
        }
    }

    return found_something;
}

// Tries to interpret str as "<subsource name><separator><value>".
// Returns a newly allocated CSubSource holding the subtype and value, or
// NULL if str does not start with a known subsource name or the name is not
// properly separated from the value.  When the name is the whole string,
// the result has an empty name.
static CSubSource* s_StringToSubSource (
    const string& str
)

{
    string::size_type val_start_pos = 0;
    TSUBSOURCE_SUBTYPE subtype = -1;
    if( ! s_StringHasSubSourcePrefix( str, val_start_pos, subtype ) ) {
        return NULL;
    }

    // we should have split on something non-alphanumeric
    // (equals sign, colon, or whatever)
    if( val_start_pos < 1 ) {
        return NULL;
    }
    // (the separator check is skipped when there is no value at all.
    // the char goes through unsigned char because handing a negative
    // value to isalnum is undefined behavior)
    if( val_start_pos < str.length() &&
        isalnum( static_cast<unsigned char>( str[val_start_pos-1] ) ) ) {
        return NULL;
    }

    CSubSource *result = new CSubSource;
    result->SetSubtype( subtype );
    result->SetName( str.substr(val_start_pos) );

    return result;
}

// is st1 < st2

static bool s_SubsourceCompare (
    const CRef<CSubSource>& st1,
    const CRef<CSubSource>& st2
)

{
    const CSubSource& sbs1 = *(st1);
    const CSubSource& sbs2 = *(st2);

    TSUBSOURCE_SUBTYPE chs1 = GET_FIELD (sbs1, Subtype);
    TSUBSOURCE_SUBTYPE chs2 = GET_FIELD (sbs2, Subtype);

    if (chs1 < chs2) return true;
    if (chs1 > chs2) return false;

    if (FIELD_IS_SET (sbs2, Name)) {
        if (! FIELD_IS_SET (sbs1, Name)) return true;
        if (s_CompareNoCaseCStyle(GET_FIELD (sbs1, Name), GET_FIELD (sbs2, Name)) < 0) return true;
    }

    return false;
}

// Two SubSource's are equal and duplicates if:
// they have the same subtype
// and the same name (or don't require a name).

static bool s_SubsourceEqual (
    const CRef<CSubSource>& st1,
    const CRef<CSubSource>& st2
)

{
    const CSubSource& sbs1 = *(st1);
    const CSubSource& sbs2 = *(st2);

    TSUBSOURCE_SUBTYPE chs1 = GET_FIELD (sbs1, Subtype);
    TSUBSOURCE_SUBTYPE chs2 = GET_FIELD (sbs2, Subtype);

    if (chs1 != chs2) return false;
    if (CSubSource::NeedsNoText (chs2)) return true;

    if (FIELD_IS_SET (sbs1, Name) && FIELD_IS_SET (sbs2, Name)) {
        if (NStr::EqualNocase (GET_FIELD (sbs1, Name), GET_FIELD (sbs2, Name))) return true;
    }
    if (! FIELD_IS_SET (sbs1, Name) && ! FIELD_IS_SET (sbs2, Name)) return true;

    return false;
}

// Cleanup for a biosource that lives on a feature:
//  - merges all OrgMods of subtype "other" (that have a Subname) into the
//    first such OrgMod,
//  - merges all SubSources of subtype "other" (that have a Name) into the
//    first such SubSource,
//  - moves the feature's comment, if any, onto that "other" SubSource
//    (creating one if needed) and clears the comment.
// biosrc  - the biosource to clean
// seqfeat - the feature whose comment is transferred
void CNewCleanup_imp::BiosourceFeatBC (
    CBioSource& biosrc,
    CSeq_feat & seqfeat
)
{
    // consolidate all orgmods of subtype "other" into one
    // (pFirstOtherOrgMod stays null until the first one is seen)
    CRef<COrgMod> pFirstOtherOrgMod;
    EDIT_EACH_ORGMOD_ON_BIOSOURCE(orgmod_it, biosrc) {
        COrgMod & orgmod = **orgmod_it;

        // we're only cleaning the ones of type "other"
        if( ! FIELD_EQUALS(orgmod, Subtype, NCBI_ORGMOD(other)) ||
            ! FIELD_IS_SET(orgmod, Subname) ) 
        {
            continue;
        }

        if( pFirstOtherOrgMod ) {
            // a later "other": fold its subname into the first one
            // (with "; " as the delimiter argument) and drop it
            STRING_FIELD_APPEND(*pFirstOtherOrgMod, Subname, "; ", GET_STRING_FLD_OR_BLANK(orgmod, Subname) );
            ChangeMade(CCleanupChange::eChangeOrgmod);
            ERASE_ORGMOD_ON_BIOSOURCE(orgmod_it, biosrc);
            ChangeMade(CCleanupChange::eRemoveOrgmod);
        } else {
            // remember the first "other"; later ones are merged into it
            pFirstOtherOrgMod.Reset( &orgmod );
        }
    }

    // consolidate all subsources of subtype "other" into one
    // (same scheme as for the orgmods above)
    CRef<CSubSource> pFirstOtherSubSource;
    EDIT_EACH_SUBSOURCE_ON_BIOSOURCE( subsrc_iter, biosrc ) {
        CSubSource &subsrc = **subsrc_iter;

        // we're only cleaning the ones of type "other"
        if( ! FIELD_EQUALS(subsrc, Subtype, NCBI_SUBSOURCE(other) ) ||
            ! FIELD_IS_SET(subsrc, Name) ) 
        {
            continue;
        }

        if( pFirstOtherSubSource ) {
            STRING_FIELD_APPEND(*pFirstOtherSubSource, Name, "; ", GET_STRING_FLD_OR_BLANK(subsrc, Name) );
            ChangeMade(CCleanupChange::eChangeSubsource);
            ERASE_SUBSOURCE_ON_BIOSOURCE(subsrc_iter, biosrc);
            ChangeMade(CCleanupChange::eRemoveSubSource);
        } else {
            pFirstOtherSubSource.Reset( &subsrc );
        }
    }

    // transfer feat comment (if any) to the end of the consolidated
    // "other" subsource note
    if( FIELD_IS_SET(seqfeat, Comment) ) {

        if( ! pFirstOtherSubSource ) {
            // create an empty subsource note if none found
            pFirstOtherSubSource.Reset( new CSubSource );
            SET_FIELD(*pFirstOtherSubSource, Subtype, NCBI_SUBSOURCE(other) );
            ADD_SUBSOURCE_TO_BIOSOURCE(biosrc, pFirstOtherSubSource);
        }

        STRING_FIELD_APPEND(*pFirstOtherSubSource, Name, "; ", GET_FIELD(seqfeat, Comment));
        ChangeMade ( CCleanupChange::eChangeSubsource );
        // the comment now lives in the subsource, so remove it from the feature
        RESET_FIELD(seqfeat, Comment);
        ChangeMade ( CCleanupChange::eChangeComment );
    }
}

// Basic cleanup of a biosource:
//  - resets Genome when it is "virion" and Origin when it is "unknown",
//  - cleans each subsource (name/attrib cleanup plus subtype-specific
//    fixes for country, altitude, lat_lon and PCR primer sequences),
//  - removes subsources that are empty, redundant with the genome, or
//    redundant with the previously kept subsource,
//  - sorts the subsources and removes duplicates,
//  - cleans the PCR primer set and removes it if it ends up empty.
void CNewCleanup_imp::BiosourceBC (
    CBioSource& biosrc
)
{
    if( FIELD_EQUALS( biosrc, Genome, CBioSource::eGenome_virion ) ) {
        RESET_FIELD( biosrc, Genome );
        ChangeMade ( CCleanupChange::eChangeBioSourceGenome );
    }

    if( FIELD_EQUALS( biosrc, Origin, NCBI_ORIGIN(unknown) ) ) {
        RESET_FIELD(biosrc, Origin);
        ChangeMade ( CCleanupChange::eChangeBioSourceOrigin );
    }


    // per-subsource cleanup (this includes, among other things, cleaning
    // the fwd_primer_seq and rev_primer_seq values).
    if( FIELD_IS_SET(biosrc, Subtype) ) {
        // "prev" is the most recent subsource that was kept; end() means
        // that no subsource has been kept yet
        SUBSOURCE_ON_BIOSOURCE_Type::iterator prev = 
            SUBSOURCE_ON_BIOSOURCE_Set(biosrc).end();
        EDIT_EACH_SUBSOURCE_ON_BIOSOURCE (it, biosrc) {
            CSubSource& sbs = **it;

            TSUBSOURCE_SUBTYPE chs = GET_FIELD (sbs, Subtype);
            if (CSubSource::NeedsNoText (chs)) {
                // name is required - set it to empty string
                // (done when the name is unset or holds any text)
                if( ! FIELD_IS_SET(sbs, Name) || ! GET_FIELD(sbs, Name).empty() ) {
                    SET_FIELD (sbs, Name, "");
                    ChangeMade(CCleanupChange::eCleanSubsource);
                }
                CLEAN_STRING_MEMBER(sbs, Attrib);
            } else {
                CLEAN_AND_COMPRESS_STRING_MEMBER(sbs, Name);
                if( ! FIELD_IS_SET(sbs, Name) ) {
                    // name must be set
                    SET_FIELD (sbs, Name, "");
                    ChangeMade(CCleanupChange::eCleanSubsource);
                }
                x_RemoveFlankingQuotes( GET_MUTABLE(sbs, Name) );
                CLEAN_STRING_MEMBER(sbs, Attrib);
            }

            // country: normalize the spellings of the United States
            // listed below to "USA"
            if( chs == NCBI_SUBSOURCE(country) ) {
                string &country = GET_MUTABLE(sbs, Name);
                static const string kUSPrefix( "United States:" );
                if( NStr::EqualNocase(country, "United States") || 
                    NStr::EqualNocase(country, "United States of America") || 
                    NStr::EqualNocase(country, "U.S.A.") ) 
                {
                    country = "USA";
                    ChangeMade(CCleanupChange::eCleanSubsource);
                } else if( NStr::StartsWith(country, kUSPrefix, NStr::eNocase) ) {
                    // keep whatever follows the colon
                    country.replace( 0, kUSPrefix.length(), "USA:" );
                    ChangeMade(CCleanupChange::eCleanSubsource);
                }
            }

            if( chs == NCBI_SUBSOURCE(altitude) ) {
                string &altitude = GET_MUTABLE(sbs, Name);

                // normalize units part (that is, the ending) if possible
                // (e.g. "meters", etc. to "m.")
                // Note that we do NOT count a match if it's just a number because
                // we can't be sure that the submitter wasn't thinking "feet" or whatever.
                CCachedRegexp altitude_regex = regexpCache.Get(
                    "^([+-]?[0-9]+(\\.[0-9]+)?) ?(m|meter[s]?|metre[s]?)\\.?$",
                    CRegexp::fCompile_ignore_case );

                if( altitude_regex->IsMatch(altitude) ) {
                    // sub-pattern 1 is the numeric part
                    string new_altitude = altitude_regex->GetSub(altitude, 1); 
                    new_altitude += " m.";
                    // only report a change if the text actually differs
                    if( altitude != new_altitude ) {
                        altitude = new_altitude;
                        ChangeMade(CCleanupChange::eCleanSubsource);
                    }
                }
            }

            if( chs == NCBI_SUBSOURCE(lat_lon) ) {
                string &lat_lon = GET_MUTABLE(sbs, Name);

                // matches values of the form
                // "<number> <char>, <number> <char>"
                CCachedRegexp lat_lon_with_comma = regexpCache.Get(
                    "^[-.0-9]+ ., [-.0-9]+ .$");
                if( lat_lon_with_comma->IsMatch(lat_lon) ) {
                    // remove the comma
                    SIZE_TYPE comma_pos = lat_lon.find(',');
                    _ASSERT(comma_pos != NPOS );
                    lat_lon.erase(comma_pos, 1);
                    ChangeMade(CCleanupChange::eCleanSubsource);
                }
            }

            if ( chs == NCBI_SUBSOURCE(fwd_primer_seq) ||
                chs == NCBI_SUBSOURCE(rev_primer_seq) )
            {
                // compare a copy taken before cleaning with the result
                // to find out whether anything changed
                const string before = GET_FIELD (sbs, Name);
                CPCRPrimerSeq::Clean( GET_MUTABLE(sbs, Name) );
                const string& after = GET_FIELD (sbs, Name);
                if ( before != after ) {
                    ChangeMade (CCleanupChange::eCleanSubsource);
                }
            }

            // determine whether we should remove this subsource:
            if(  (! FIELD_IS_SET(sbs, Name) || GET_FIELD(sbs, Name).empty()) &&
                ! CSubSource::NeedsNoText( chs ) )
            {
                // no name although the subtype needs one
                ERASE_SUBSOURCE_ON_BIOSOURCE(it, biosrc);
                ChangeMade(CCleanupChange::eCleanSubsource);
                continue;
            } else if( chs == NCBI_SUBSOURCE(plastid_name) &&
                STRING_FIELD_MATCH(sbs, Name, s_GenomeToPlastidName(biosrc) ) )
            {
                // plastid name that only repeats the biosource genome
                ERASE_SUBSOURCE_ON_BIOSOURCE(it, biosrc);
                ChangeMade(CCleanupChange::eCleanSubsource);
                continue;
            } else if( prev != SUBSOURCE_ON_BIOSOURCE_Set(biosrc).end() ) {
                // compare against the previously kept subsource
                TSUBSOURCE_SUBTYPE prev_chs = GET_FIELD (**prev, Subtype);
                const string &name = GET_FIELD(sbs, Name);
                const string &prev_name = GET_FIELD(**prev, Name);

                // same subtype and either: no text needed, same name
                // (ignoring case), or an "other" note whose text is already
                // contained in the previous note
                if ( (chs == prev_chs) &&
                    ( CSubSource::NeedsNoText(chs) ||
                    NStr::EqualNocase(prev_name, name) ||
                    (prev_chs == NCBI_SUBSOURCE(other) &&
                    NStr::Find(prev_name, name) != NPOS))) 
                {
                    ERASE_SUBSOURCE_ON_BIOSOURCE(it, biosrc);
                    ChangeMade(CCleanupChange::eCleanSubsource);
                    continue;
                } else if ( (chs == prev_chs) &&
                    prev_chs == NCBI_SUBSOURCE(other) &&
                    NStr::Find (name, prev_name) != NPOS )
                {
                    // the previous "other" note is contained in this one:
                    // keep this one's content in the previous slot and
                    // erase the current element
                    (**prev).Assign( sbs );
                    ERASE_SUBSOURCE_ON_BIOSOURCE(it, biosrc);
                    ChangeMade(CCleanupChange::eCleanSubsource);
                    continue;
                }
            }

            // this subsource was kept
            prev = it;
        }
    }

    // sort and remove duplicates.
    if (! SUBSOURCE_ON_BIOSOURCE_IS_SORTED (biosrc, s_SubsourceCompare)) {
        SORT_SUBSOURCE_ON_BIOSOURCE (biosrc, s_SubsourceCompare);
        ChangeMade (CCleanupChange::eCleanSubsource);
    }

    if (! SUBSOURCE_ON_BIOSOURCE_IS_UNIQUE (biosrc, s_SubsourceEqual)) {
        UNIQUE_SUBSOURCE_ON_BIOSOURCE (biosrc, s_SubsourceEqual);
        ChangeMade (CCleanupChange::eCleanSubsource);
    }

    // PCR Primers
    if( FIELD_IS_SET(biosrc, Pcr_primers) ) {
        PCRReactionSetBC( GET_MUTABLE(biosrc, Pcr_primers) );
        // drop the primer set altogether if cleanup left it empty
        if( GET_FIELD(biosrc, Pcr_primers).Get().empty() ) {
            RESET_FIELD(biosrc, Pcr_primers);
            ChangeMade(CCleanupChange::eChangePCRPrimers);
        }
    }
}

// Second-stage cleanup of a biosource:
//  - resets Genome when it is "unknown",
//  - converts Org-ref "mod" strings that name a subsource into real
//    SubSource objects,
//  - blanks names on subsources that need no text, removes plastid-name
//    subsources that repeat the genome, and sets Genome to "plasmid" when
//    a plasmid-name subsource exists and Genome is unset/unknown/genomic,
//  - removes nameless subsources, then sorts and uniques what is left.
void CNewCleanup_imp::x_PostBiosource( CBioSource& biosrc )
{
    if( FIELD_EQUALS(biosrc, Genome, NCBI_GENOME(unknown) ) ) {
        RESET_FIELD(biosrc, Genome);
        ChangeMade(CCleanupChange::eChangeBioSourceGenome);
    }

    if (BIOSOURCE_HAS_ORGREF (biosrc)) {
        COrg_ref& org = GET_MUTABLE (biosrc, Org);

        // convert COrg_ref.TMod string to SubSource objects
        EDIT_EACH_MOD_ON_ORGREF (it, org) {
            string& str = *it;
            // null if the string doesn't parse as a subsource
            CRef<CSubSource> sbs (s_StringToSubSource (str));
            if (! sbs) continue;
            ADD_SUBSOURCE_TO_BIOSOURCE (biosrc, sbs);
            // the mod string has been converted, so remove it
            ERASE_MOD_ON_ORGREF (it, org);
            ChangeMade (CCleanupChange::eChangeSubsource);
        }

        if( MOD_ON_ORGREF_IS_EMPTY(org) ) {
            RESET_FIELD(org, Mod);
            ChangeMade (CCleanupChange::eRemoveOrgmod);
        }
    }

    if (BIOSOURCE_HAS_SUBSOURCE (biosrc)) {

        // remove plastid-name subsource if the value is the same as the biosource location
        const string &plastid_name = s_GenomeToPlastidName( biosrc );
        
        bool plasmid_subsource_found = false;
        EDIT_EACH_SUBSOURCE_ON_BIOSOURCE (it, biosrc) {
            CSubSource& sbs = **it;
            TSUBSOURCE_SUBTYPE chs = GET_FIELD (sbs, Subtype);
            if (CSubSource::NeedsNoText (chs)) {
                // subtypes that need no text get an empty name
                if (sbs.IsSetName() && !NStr::IsBlank(sbs.GetName())) {
                    RESET_FIELD (sbs, Name);
                    SET_FIELD (sbs, Name, "");
                    ChangeMade (CCleanupChange::eCleanSubsource);
                }
            } else if (chs == NCBI_SUBSOURCE(plastid_name)) {
                // plasTid
                if (NStr::EqualNocase (GET_FIELD (sbs, Name), plastid_name)) {
                    ERASE_SUBSOURCE_ON_BIOSOURCE (it, biosrc);
                    ChangeMade (CCleanupChange::eCleanSubsource);
                }
            } else if ( chs == NCBI_SUBSOURCE(plasmid_name) ) {
                // plasMid
                plasmid_subsource_found = true;
            }
        }

        // set genome to "plasmid" under some conditions
        // (a plasmid-name subsource exists and the genome is unset,
        // unknown or genomic)
        if( plasmid_subsource_found ) {
            if( ! FIELD_IS_SET(biosrc, Genome) || 
                GET_FIELD(biosrc, Genome) == NCBI_GENOME(unknown) || 
                GET_FIELD(biosrc, Genome) == NCBI_GENOME(genomic) ) 
            { 
                biosrc.SetGenome( NCBI_GENOME(plasmid) );
                ChangeMade(CCleanupChange::eChangeBioSourceGenome);
            }
        }

        // remove those with no name unless it has a subtype that doesn't need a name.
        EDIT_EACH_SUBSOURCE_ON_BIOSOURCE (it, biosrc) {
            CSubSource& sbs = **it;
            if (FIELD_IS_SET (sbs, Name) && ! GET_FIELD(sbs, Name).empty() ) continue;
            TSUBSOURCE_SUBTYPE chs = GET_FIELD (sbs, Subtype);
            if (CSubSource::NeedsNoText (chs)) continue;
            ERASE_SUBSOURCE_ON_BIOSOURCE (it, biosrc);
            ChangeMade (CCleanupChange::eCleanSubsource);
        }

        // sort and remove duplicates.
        if (! SUBSOURCE_ON_BIOSOURCE_IS_SORTED (biosrc, s_SubsourceCompare)) {
            SORT_SUBSOURCE_ON_BIOSOURCE (biosrc, s_SubsourceCompare);
            ChangeMade (CCleanupChange::eCleanSubsource);
        }

        if (! SUBSOURCE_ON_BIOSOURCE_IS_UNIQUE (biosrc, s_SubsourceEqual)) {
            UNIQUE_SUBSOURCE_ON_BIOSOURCE (biosrc, s_SubsourceEqual);
            ChangeMade (CCleanupChange::eCleanSubsource);
        }

        // NOTE(review): presumably resets the subsource list if it ended up
        // empty - confirm against the macro definition
        REMOVE_IF_EMPTY_SUBSOURCE_ON_BIOSOURCE(biosrc);
    }
}

// Tries to interpret str as "<orgmod name><separator><value>".
// Returns a newly allocated COrgMod holding the subtype and value, or NULL
// if str does not start with a known orgmod name or the character just
// before the value is alphanumeric (i.e. no separator was found).
static COrgMod* s_StringToOrgMod (
    const string& str
)

{
    string::size_type val_start_pos = 0;
    TORGMOD_SUBTYPE subtype = -1;
    if( ! s_StringHasOrgModPrefix( str, val_start_pos, subtype) ) {
        return NULL;
    }

    // we should have split on something non-alphanumeric
    // (equals sign, colon, or whatever)
    if( val_start_pos < 1 ) {
        return NULL;
    }
    // (the char goes through unsigned char because handing a negative
    // value to isalnum is undefined behavior)
    if( isalnum( static_cast<unsigned char>( str[val_start_pos-1] ) ) ) {
        return NULL;
    }

    COrgMod *result = new COrgMod;
    result->SetSubtype( subtype );
    result->SetSubname( str.substr(val_start_pos) );

    return result;
}

// Decides whether a Dbtag should be thrown away.  A Dbtag is bad if
//  - its db is unset, blank, or one of "PID", "PIDg", "NID" (any case), or
//  - its tag is unset, is an integer id of 0, is a blank string,
//    or is neither an id nor a string.
static bool s_DbtagIsBad (
    CDbtag& dbt
)

{
    // --- the db part ---
    if (! FIELD_IS_SET (dbt, Db)) {
        return true;
    }
    const string& db_name = GET_FIELD(dbt, Db);
    if (NStr::IsBlank (db_name)) {
        return true;
    }
    const bool is_rejected_db =
        NStr::EqualNocase(db_name, "PID") ||
        NStr::EqualNocase(db_name, "PIDg") ||
        NStr::EqualNocase(db_name, "NID");
    if (is_rejected_db) {
        return true;
    }

    // --- the tag part ---
    if (! FIELD_IS_SET( dbt, Tag)) {
        return true;
    }
    const CObject_id& tag = GET_FIELD(dbt, Tag);

    if (FIELD_IS (tag, Id)) {
        return (GET_FIELD (tag, Id) == 0);
    }
    if (FIELD_IS (tag, Str)) {
        return NStr::IsBlank (GET_FIELD (tag, Str));
    }

    // tag is neither an id nor a string
    return true;
}

// Basic cleanup of an Org-ref: cleans its strings, converts "mod" strings
// that name an OrgMod into real COrgMod objects, cleans the Orgname
// (if any), and runs x_SplitDbtag over the db_xrefs.
void CNewCleanup_imp::OrgrefBC (
    COrg_ref& org
)

{
    CLEAN_STRING_MEMBER (org, Taxname);
    CLEAN_STRING_MEMBER (org, Common);
    CLEAN_STRING_LIST (org, Mod);
    CLEAN_STRING_LIST (org, Syn);

    // convert mod strings that parse as "<orgmod name><sep><value>"
    // into OrgMod objects, removing the string that was converted
    EDIT_EACH_MOD_ON_ORGREF (it, org) {
        string& str = *it;
        // null if the string doesn't parse as an orgmod
        CRef<COrgMod> omd (s_StringToOrgMod (str));
        if (! omd) continue;
        ADD_ORGMOD_TO_ORGREF (org, omd);
        ERASE_MOD_ON_ORGREF (it, org);
        ChangeMade (CCleanupChange::eChangeOrgmod);
    }
    if ( RAW_FIELD_IS_EMPTY(org, Mod) ) {
        RESET_FIELD (org, Mod);
        ChangeMade (CCleanupChange::eChangeOrgmod);
    }

    if (FIELD_IS_SET (org, Orgname)) {
        COrgName& onm = GET_MUTABLE (org, Orgname);
        OrgnameBC (onm, org);
    }


    if (ORGREF_HAS_DBXREF (org)) {
        
        // new Dbtags produced by x_SplitDbtag are collected here and
        // appended after the loop, so the list isn't grown while
        // it is being traversed
        vector< CRef< CDbtag > > new_dbtags;
        EDIT_EACH_DBXREF_ON_ORGREF (it, org) {
            CDbtag& dbt = **it;
            x_SplitDbtag(dbt, new_dbtags );
        }
        if( ! new_dbtags.empty() ) {
            copy( new_dbtags.begin(), new_dbtags.end(), back_inserter( org.SetDb() ) );
            ChangeMade (CCleanupChange::eChangeDbxrefs);
        }
    }
}

// Second-stage cleanup of an Org-ref: removes bad db_xrefs, then makes
// both the db_xref list and the synonym list sorted and duplicate-free.
void CNewCleanup_imp::x_PostOrgRef( COrg_ref& org )
{
    // remove db_xrefs rejected by s_DbtagIsBad
    EDIT_EACH_DBXREF_ON_ORGREF (it, org) {
        CDbtag& dbt = **it;
        if (s_DbtagIsBad (dbt)) {
            ERASE_DBXREF_ON_ORGREF (it, org);
            ChangeMade (CCleanupChange::eCleanDbxrefs);
        }
    }

    // sort/unique db_xrefs
    if (! DBXREF_ON_ORGREF_IS_SORTED (org, s_DbtagCompare)) {
        SORT_DBXREF_ON_ORGREF (org, s_DbtagCompare);
        ChangeMade (CCleanupChange::eCleanDbxrefs);
    }
    if (! DBXREF_ON_ORGREF_IS_UNIQUE (org, s_DbtagEqual)) {
        UNIQUE_DBXREF_ON_ORGREF (org, s_DbtagEqual);
        ChangeMade (CCleanupChange::eCleanDbxrefs);
    }

    // sort/unique syns
    if (! SYN_ON_ORGREF_IS_SORTED (org, s_OrgrefSynCompare)) {
        SORT_SYN_ON_ORGREF (org, s_OrgrefSynCompare);
        ChangeMade (CCleanupChange::eCleanOrgref);
    }
    if (! SYN_ON_ORGREF_IS_UNIQUE (org, s_OrgrefSynEqual)) {
        UNIQUE_SYN_ON_ORGREF (org, s_OrgrefSynEqual);
        ChangeMade (CCleanupChange::eCleanOrgref);
    }
}

// is om1 < om2
// to sort subtypes together.

static bool s_OrgModCompare (
    const CRef<COrgMod>& om1,
    const CRef<COrgMod>& om2
)

{
    const COrgMod& omd1 = *(om1);
    const COrgMod& omd2 = *(om2);

    // subtype comparison
    TORGMOD_SUBTYPE subtype1 = GET_FIELD (omd1, Subtype);
    TORGMOD_SUBTYPE subtype2 = GET_FIELD (omd2, Subtype);
    if (subtype1 < subtype2) return true;
    if (subtype1 > subtype2) return false;

    // subname comparison
    const string& subname1 = GET_FIELD (omd1, Subname);
    const string& subname2 = GET_FIELD (omd2, Subname);
    const int subname_comparison = NStr::CompareNocase( subname1, subname2 );
    if( subname_comparison < 0 ) {
        return true;
    } else if( subname_comparison > 0 ) {
        return false;
    }

    // attrib comparison (realistically, we don't expect to fall back to this)
    const string& attrib1 = ( FIELD_IS_SET(omd1, Attrib) ? GET_FIELD (omd1, Attrib) : kEmptyStr );
    const string& attrib2 = ( FIELD_IS_SET(omd2, Attrib) ? GET_FIELD (omd2, Attrib) : kEmptyStr );

    return NStr::CompareNocase( attrib1, attrib2 ) < 0;
}

// Two OrgMod's are equal and duplicates if:
// they have the same subname and same subtype
// or one has subtype 'other'.

static bool s_OrgModEqual (
    const CRef<COrgMod>& om1,
    const CRef<COrgMod>& om2
)

{
    const COrgMod& omd1 = *(om1);
    const COrgMod& omd2 = *(om2);

    const string& subname1 = GET_FIELD (omd1, Subname);
    const string& subname2 = GET_FIELD (omd2, Subname);
    if (! NStr::EqualNocase (subname1, subname2)) return false;

    const string& attrib1 = ( FIELD_IS_SET(omd1, Attrib) ? GET_FIELD (omd1, Attrib) : kEmptyStr );
    const string& attrib2 = ( FIELD_IS_SET(omd2, Attrib) ? GET_FIELD (omd2, Attrib) : kEmptyStr );
    if (! NStr::EqualNocase (attrib1, attrib2)) return false;

    TORGMOD_SUBTYPE chs1 = GET_FIELD (omd1, Subtype);
    TORGMOD_SUBTYPE chs2 = GET_FIELD (omd2, Subtype);
    if (chs1 == chs2) return true;
    if ( chs1 == NCBI_ORGMOD(other) || chs2 == NCBI_ORGMOD(other)) return true;

    return false;
}

// Basic cleanup of an OrgName and its OrgMod list.
// onm     - the OrgName to clean
// org_ref - the Org-ref that owns onm; its Common name (or blank) is
//           passed on to x_OrgnameModBC
void CNewCleanup_imp::OrgnameBC (
    COrgName& onm, COrg_ref &org_ref
)

{
    CLEAN_STRING_MEMBER (onm, Attrib);
    CLEAN_STRING_MEMBER (onm, Lineage);
    CLEAN_STRING_MEMBER_JUNK (onm, Div);

    // clean each orgmod, and remove those left without a subname
    EDIT_EACH_ORGMOD_ON_ORGNAME (it, onm) {
        COrgMod& omd = **it;
        OrgmodBC (omd);
        if (! FIELD_IS_SET (omd, Subname) || NStr::IsBlank (GET_FIELD (omd, Subname))) {
            ERASE_ORGMOD_ON_ORGNAME (it, onm);
            ChangeMade (CCleanupChange::eRemoveOrgmod);
        }
    }

    // special value fixes
    // (structured vouchers; the second argument differs per subtype)
    EDIT_EACH_ORGMOD_ON_ORGNAME (it, onm) {
        COrgMod& omd = **it;
        switch (omd.GetSubtype()) {
            case COrgMod::eSubtype_bio_material:
                if (COrgMod::FixStructuredVoucher(omd.SetSubname(), "b")) {
                    ChangeMade (CCleanupChange::eChangeOrgmod);
                }
                break;
            case COrgMod::eSubtype_culture_collection:
                if (COrgMod::FixStructuredVoucher(omd.SetSubname(), "c")) {
                    ChangeMade (CCleanupChange::eChangeOrgmod);
                }
                break;
            case COrgMod::eSubtype_specimen_voucher:
                if (COrgMod::FixStructuredVoucher(omd.SetSubname(), "s")) {
                    ChangeMade (CCleanupChange::eChangeOrgmod);
                }
                break;
            default:
                break;
        }
    }

    // erase structured notes that already match value
    // (Note: This is O(N^2).  Maybe worth converting to a faster algo?)
    EDIT_EACH_ORGMOD_ON_ORGNAME (it, onm) {
        COrgMod& omd = **it;
        if (omd.GetSubtype() == NCBI_ORGMOD(other)) {
            bool do_erase = false;
            // split the note into "<name>" and "<value>" at the first
            // space, '=' or ':'
            string val_name, otherval;
            NStr::SplitInTwo( omd.GetSubname(), " =:", val_name, otherval );
            try {
                COrgMod::TSubtype subtype = COrgMod::GetSubtypeValue(val_name);
                NStr::TruncateSpacesInPlace(otherval);                
                // look for an orgmod of that subtype with exactly
                // (case-sensitively) the same value
                FOR_EACH_ORGMOD_ON_ORGNAME (match_it, onm) {
                    if ((*match_it)->GetSubtype() == subtype
                        && NStr::EqualCase((*match_it)->GetSubname(), otherval)) {
                        do_erase = true;
                        break;
                    }
                }
            } catch (CSerialException& ) {
                // presumably thrown by GetSubtypeValue when val_name is not
                // a subtype name; the note is then left alone
            }

            if (do_erase) {
                ERASE_ORGMOD_ON_ORGNAME (it, onm);
                ChangeMade (CCleanupChange::eCleanOrgmod);
            }
        }
    }

    if (! ORGMOD_ON_ORGNAME_IS_SORTED (onm, s_OrgModCompare)) {
        SORT_ORGMOD_ON_ORGNAME (onm, s_OrgModCompare);
        ChangeMade (CCleanupChange::eCleanOrgmod);
    }

    // clean Orgmod list
    x_OrgnameModBC( onm, GET_STRING_FLD_OR_BLANK(org_ref, Common) );

    // sort again after x_OrgnameModBC, then remove duplicates
    if (! ORGMOD_ON_ORGNAME_IS_SORTED (onm, s_OrgModCompare)) {
        SORT_ORGMOD_ON_ORGNAME (onm, s_OrgModCompare);
        ChangeMade (CCleanupChange::eCleanOrgmod);
    }

    if (! ORGMOD_ON_ORGNAME_IS_UNIQUE (onm, s_OrgModEqual)) {
        UNIQUE_ORGMOD_ON_ORGNAME (onm, s_OrgModEqual);
        ChangeMade (CCleanupChange::eCleanOrgmod);
    }

    // NOTE(review): presumably resets the orgmod list if it ended up
    // empty - confirm against the macro definition
    REMOVE_IF_EMPTY_ORGMOD_ON_ORGNAME(onm);
}

// Rewrites every colon, together with any run of space characters
// directly before and/or after it, as a bare ":" in str (in place).
// The return value is whatever s_RegexpReplace reports for the call.
static bool RemoveSpaceBeforeAndAfterColon (
    string& str
)

{
    // If this regex ever shows up as a hot spot, a hand-written
    // scanner could take its place.
    const bool regex_result = s_RegexpReplace( str, "[ ]*:[ ]*", ":");
    return regex_result;
}

// Basic cleanup of a single OrgMod:
//   - cleans/compresses the Subname and Attrib strings,
//   - strips flanking quotes from Subname,
//   - for specimen_voucher, culture_collection and bio_material,
//     removes spaces around colons and collapses the first "::" to ":".
void CNewCleanup_imp::OrgmodBC (
    COrgMod& omd
)
{
    CLEAN_AND_COMPRESS_STRING_MEMBER (omd, Subname);
    if (FIELD_IS_SET (omd, Subname)) {
        x_RemoveFlankingQuotes( GET_MUTABLE(omd, Subname) );
    }

    CLEAN_AND_COMPRESS_STRING_MEMBER (omd, Attrib);

    const TORGMOD_SUBTYPE mod_type = GET_FIELD (omd, Subtype);
    const bool is_voucher_like =
        ( mod_type == NCBI_ORGMOD(specimen_voucher) ||
          mod_type == NCBI_ORGMOD(culture_collection) ||
          mod_type == NCBI_ORGMOD(bio_material) );

    // nothing more to do for other subtypes, or when there is no value
    if ( ! is_voucher_like ) {
        return;
    }
    if ( ! FIELD_IS_SET (omd, Subname) ) {
        return;
    }

    string &voucher = GET_MUTABLE (omd, Subname);
    const string::size_type len_before = voucher.length();

    RemoveSpaceBeforeAndAfterColon (voucher);
    // at most one "::" (the first one) is collapsed
    NStr::ReplaceInPlace( voucher, "::", ":", 0, 1 );

    // both edits can only shorten the string, so a length difference
    // is used as the "something changed" signal
    if( voucher.length() != len_before ) {
        ChangeMade (CCleanupChange::eTrimSpaces);
    }
}

// Basic cleanup of a Dbtag (database cross-reference).
//   1. Returns immediately unless both Db and Tag are set and Db is
//      not blank.
//   2. Cleans the db string and maps a fixed list of alternate database
//      names onto a replacement spelling.
//   3. If the tag is an integer Id and the db is HGNC or MGI, the tag is
//      rewritten as the string "<db>:<number>"; integer tags are not
//      processed any further.
//   4. If the tag is a string, it is cleaned, some db-specific prefixes
//      are stripped, and a tag made up only of digits and whitespace may
//      be converted into an integer Id.
void CNewCleanup_imp::DbtagBC (
    CDbtag& dbtag
)

{
    if (! FIELD_IS_SET (dbtag, Db)) return;
    if (! FIELD_IS_SET (dbtag, Tag)) return;

    string& db = GET_MUTABLE (dbtag, Db);
    if (NStr::IsBlank (db)) return;

    x_CleanupStringMarkChanged( db );

    // Rename known alternate database names.  The first matching test
    // wins, and a change is recorded for every match.
    if (NStr::EqualNocase(db, "Swiss-Prot")
        || NStr::EqualNocase (db, "SWISSPROT")) {
        db = "UniProtKB/Swiss-Prot";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::EqualNocase(db, "SPTREMBL")  ||
               NStr::EqualNocase(db, "TrEMBL") ) {
        db = "UniProtKB/TrEMBL";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::EqualNocase(db, "SUBTILIS")) {
        db = "SubtiList";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::EqualNocase(db, "LocusID")) {
        db = "GeneID";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::EqualNocase(db, "MaizeDB")) {
        db = "MaizeGDB";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::EqualNocase(db, "GeneW")) {
        db = "HGNC";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::EqualNocase(db, "MGD")) {
        db = "MGI";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::EqualNocase(db, "IFO")) {
        db = "NBRC";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::EqualNocase(db, "BHB") ||
        NStr::EqualNocase(db, "BioHealthBase")) {
        db = "IRD";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    // The next three tests use NStr::Equal rather than EqualNocase
    // (presumably an exact, case-sensitive match, since they only fix
    // capitalization -- confirm against NStr).
    } else if (NStr::Equal(db, "GENEDB")) {
        db = "GeneDB";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::Equal(db, "cdd")) {
        db = "CDD";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::Equal(db, "FlyBase")) {
        db = "FLYBASE";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::EqualNocase(db, "GreengenesID")) {
        db = "Greengenes";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::EqualNocase(db, "HMPID")) {
        db = "HMP";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    }

    CObject_id& oid = GET_MUTABLE (dbtag, Tag);

    // Integer tag: only HGNC and MGI are touched; their number is turned
    // into a "<db>:<number>" string tag.
    if (FIELD_IS (oid, Id)) {
        // note: this local shadows the outer 'db' declared above
        const string& db = dbtag.GetDb();
        if (NStr::EqualNocase (db, "HGNC") || NStr::EqualNocase (db, "MGI") ) {
            int val = dbtag.GetTag().GetId();
            string str = db + ":" + NStr::IntToString(val);
            dbtag.SetTag().SetStr(str);
            ChangeMade(CCleanupChange::eChangeDbxrefs);
        }
        return;
    }

    if (! FIELD_IS (oid, Str)) return;

    string& str = GET_MUTABLE(oid, Str);
    if (NStr::IsBlank (str)) return;
    x_CleanupStringMarkChanged( str );

    // Strip redundant db-specific prefixes from the tag text.
    if (NStr::EqualNocase(dbtag.GetDb(), "HPRD") && NStr::StartsWith (dbtag.GetTag().GetStr(), "HPRD_")) {
        // drop the 5-character "HPRD_" prefix
        dbtag.SetTag().SetStr (dbtag.GetTag().GetStr().substr (5));
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::EqualNocase (dbtag.GetDb(), "MGI") ) {
        if(NStr::StartsWith (dbtag.GetTag().GetStr(), "MGI:") || NStr::StartsWith (dbtag.GetTag().GetStr(), "MGD:")) {
            // prefix stripping is disabled here: "MGI:"/"MGD:" tags are
            // left exactly as they are
            /*
            dbtag.SetTag().SetStr (dbtag.GetTag().GetStr().substr (4));
            ChangeMade(CCleanupChange::eChangeDbxrefs);
            */
        } else if( NStr::StartsWith( dbtag.GetTag().GetStr(), "J:", NStr::eNocase ) ) {
            // a tag starting with "J:" (any case) is replaced by the
            // literal string "J"
            dbtag.SetTag().SetStr("J");
            ChangeMade(CCleanupChange::eChangeDbxrefs);
        }
    } else if (NStr::EqualNocase (dbtag.GetDb(), "HGNC") ) {
        if(NStr::StartsWith (dbtag.GetTag().GetStr(), "HGNC:")) {
            // prefix stripping is disabled here as well
            /*
            dbtag.SetTag().SetStr (dbtag.GetTag().GetStr().substr (5));
            ChangeMade(CCleanupChange::eChangeDbxrefs);
            */
        }
    } else if (NStr::EqualNocase (dbtag.GetDb(), "RGD") ) {
        if(NStr::StartsWith (dbtag.GetTag().GetStr(), "RGD:")) {
            // drop the 4-character "RGD:" prefix
            dbtag.SetTag().SetStr (dbtag.GetTag().GetStr().substr (4));
            ChangeMade(CCleanupChange::eChangeDbxrefs);
        }
    }

    // Scan the tag: give up (leaving it a string) as soon as a character
    // that is neither a digit nor whitespace is seen, and remember
    // whether every digit encountered was '0'.
    bool all_zero = true;
    FOR_EACH_CHAR_IN_STRING (it, str) {
        const char& ch = *it;
        if (isdigit((unsigned char)(ch))) {
            if (ch != '0') {
                all_zero = false;
            }
        } else if (!isspace((unsigned char)(ch))) {
            return;
        }
    }
    
    // A tag with a leading '0' is only converted when all of its digits
    // are zero (presumably so that leading zeros are not lost by the
    // conversion to an integer).
    if (str[0] != '0'  ||  all_zero) {
        try {
            // extract the part before the first space for conversion
            string::size_type pos_of_first_space = 0;
            while( pos_of_first_space < str.length() && ! isspace(str[pos_of_first_space]) ) {
                ++pos_of_first_space;
            }
            CTempString sStrOfNum(str, 0, pos_of_first_space);

            // only convert str to int if it fits into the non-negative side
            // of an int.
            // (StringToInt is asked not to throw; the StringToUInt call
            //  below can still throw, which the catch handles.)
            int value = NStr::StringToInt(sStrOfNum, NStr::fConvErr_NoThrow);
            if( value > 0 || (value == 0 && all_zero) ) {
                SET_FIELD ( oid, Id, NStr::StringToUInt(sStrOfNum) );
                ChangeMade (CCleanupChange::eChangeDbxrefs);
            }
        } catch (CStringException&) {
            // just leave things as are
        }
    }
}

// Basic cleanup of a Pubdesc: converts double quotes in the comment,
// cleans the comment string, then cleans the contained Pub-equiv.
void CNewCleanup_imp::PubdescBC (
    CPubdesc& pubdesc
)
{
    if ( FIELD_IS_SET(pubdesc, Comment)) {
        string& remark = GET_MUTABLE(pubdesc, Comment);
        x_ConvertDoubleQuotesMarkChanged( remark );
    }

    // runs after the quote conversion above
    CLEAN_STRING_MEMBER(pubdesc, Comment);

    if ( ! FIELD_IS_SET(pubdesc, Pub) ) {
        return;
    }
    PubEquivBC( GET_MUTABLE(pubdesc, Pub) );
}

// Decides whether author initials should be fixed for this Pub-equiv.
// Returns false only when the equiv holds BOTH an article and a
// positive PMID/MUID; returns true in every other case.
static bool s_ShouldWeFixInitials(const CPub_equiv& equiv)
{
    bool found_id = false;
    bool found_article = false;

    FOR_EACH_PUB_ON_PUBEQUIV(pub_iter, equiv) {
        const bool is_positive_id =
            ( (*pub_iter)->IsPmid() && (*pub_iter)->GetPmid() > 0 ) ||
            ( (*pub_iter)->IsMuid() && (*pub_iter)->GetMuid() > 0 );

        if ( is_positive_id ) {
            found_id = true;
        } else if ( (*pub_iter)->IsArticle() ) {
            found_article = true;
        }
    }

    return ( ! found_article  ||  ! found_id );
}

// Basic cleanup of a Pub-equiv.
//   - flattens the equiv (x_FlattenPubEquiv),
//   - runs PubBC on every pub, erasing those for which PubBC asks for
//     erasure,
//   - makes the PMID pub and the PubMed article-id agree: if only one of
//     them is present, the other is created from it.
void CNewCleanup_imp::PubEquivBC (CPub_equiv& pub_equiv)
{
    x_FlattenPubEquiv(pub_equiv);

    // we keep the last of these because we might transfer one
    // to the other as necessary to fill in gaps.
    int last_pmid = 0;
    int last_article_pubmed_id = 0; // the last from a journal
    CRef<CCit_art> last_article;
    
    // decided once, up front, for the whole equiv
    bool fix_initials = s_ShouldWeFixInitials(pub_equiv);
    EDIT_EACH_PUB_ON_PUBEQUIV(it, pub_equiv) {
        CPub &pub = **it;

        if( PubBC(pub, fix_initials) == eAction_Erase ) {
            ERASE_PUB_ON_PUBEQUIV(it, pub_equiv);
            ChangeMade(CCleanupChange::eRemoveEmptyPub);
            continue;
        }

        // storing these so at the end we'll know the last values
        if( pub.IsPmid() ) {
            last_pmid = pub.GetPmid().Get();
        }
        if( pub.IsArticle() ) {
            last_article.Reset( &pub.SetArticle());
            // the PubMed id is only picked up from articles whose From
            // is a journal and that have Ids set
            if( FIELD_IS_SET_AND_IS(*last_article, From, Journal) && 
                FIELD_IS_SET(*last_article, Ids) ) 
            {
                FOR_EACH_ARTICLEID_ON_CITART( id_iter, *last_article ) {
                    const CArticleId &article_id = **id_iter;
                    if( article_id.IsPubmed() ) {
                        last_article_pubmed_id = article_id.GetPubmed().Get();
                    }
                }
            }
        }
    }

    // Now, we might have to transfer data to fill in missing information
    if (last_pmid == 0 && last_article_pubmed_id > 0) {
        // article has a PubMed id but there is no PMID pub: append one
        CRef<CPub> new_pub( new CPub );
        new_pub->SetPmid().Set( last_article_pubmed_id );
        pub_equiv.Set().push_back( new_pub );
        ChangeMade(CCleanupChange::eChangePublication);
    } else if (last_pmid > 0 && last_article_pubmed_id == 0 && last_article ) {
        // PMID pub exists but the last article carries no PubMed id:
        // add it to that article's id list
        CRef<CArticleId> new_article_id( new CArticleId );
        new_article_id->SetPubmed().Set( last_pmid );
        last_article->SetIds().Set().push_back( new_article_id );
        ChangeMade(CCleanupChange::eChangePublication);
    }

    // Disabled code: would have erased a PubMed article-id that repeats
    // the PubMed id immediately preceding it.
    /*
    last_article_pubmed_id = 0;
    int article_pubmed_id = 0;
    EDIT_EACH_ARTICLEID_ON_CITART( id_iter, *last_article ) {
        const CArticleId &article_id = **id_iter;
        if( article_id.IsPubmed() ) {
            article_pubmed_id = article_id.GetPubmed().Get();
            if (last_article_pubmed_id > 0 && last_article_pubmed_id == article_pubmed_id) {
                ERASE_ARTICLEID_ON_CITART( id_iter, *last_article );
            }
            last_article_pubmed_id = article_pubmed_id;
        }
    }
    */
}

// Dispatches basic cleanup to the handler matching the Pub's variant,
// then records the pub's (post-cleanup) content label in
// m_PubToNewPubLabelMap.  Returns the action requested by the handler,
// or eAction_Nothing for variants that have no handler.
CNewCleanup_imp::EAction CNewCleanup_imp::PubBC(CPub& pub, bool fix_initials)
{
    EAction requested = eAction_Nothing;

    switch (pub.Which()) {
    case NCBI_PUB(Gen):
        requested = CitGenBC( GET_MUTABLE(pub, Gen), fix_initials);
        break;
    case NCBI_PUB(Sub):
        requested = CitSubBC( GET_MUTABLE(pub, Sub), fix_initials);
        break;
    case NCBI_PUB(Article):
        requested = CitArtBC( GET_MUTABLE(pub, Article), fix_initials);
        break;
    case NCBI_PUB(Book):
        requested = CitBookBC( GET_MUTABLE(pub, Book), fix_initials);
        break;
    case NCBI_PUB(Patent):
        requested = CitPatBC( GET_MUTABLE(pub, Patent), fix_initials);
        break;
    case NCBI_PUB(Man):
        requested = CitLetBC( GET_MUTABLE(pub, Man), fix_initials);
        break;
    case NCBI_PUB(Medline):
        requested = MedlineEntryBC( GET_MUTABLE(pub, Medline), fix_initials);
        break;
    default:
        // no handler for this variant; 'requested' stays eAction_Nothing
        break;
    }

    // the label is computed after cleanup so it reflects the cleaned pub
    string content_label;
    pub.GetLabel( &content_label, CPub::eContent, true);
    m_PubToNewPubLabelMap[CRef<CPub>(&pub)] = content_label;

    return requested;
}

// True when the affiliation carries no information:
//   Str variant  -> the string is blank,
//   Std variant  -> none of the tested member fields is set,
//   anything else -> considered empty.
static bool s_IsEmpty(const CAuth_list::TAffil& affil)
{
    if ( FIELD_IS(affil, Str) ) {
        return NStr::IsBlank( GET_FIELD(affil, Str) );
    }

    if ( ! FIELD_IS(affil, Std) ) {
        return true;
    }

    const CAuth_list::TAffil::TStd& parts = GET_FIELD(affil, Std);
    return ! parts.IsSetAffil()    &&  ! parts.IsSetDiv()     &&
           ! parts.IsSetCity()     &&  ! parts.IsSetSub()     &&
           ! parts.IsSetCountry()  &&  ! parts.IsSetStreet()  &&
           ! parts.IsSetEmail()    &&  ! parts.IsSetFax()     &&
           ! parts.IsSetPhone()    &&  ! parts.IsSetPostal_code();
}

// True when the Cit-gen has no meaningful content: every string field
// is unset or empty, every numeric id is unset or non-positive, and
// Authors, Journal and Date are all unset.
static
bool s_IsEmpty( const CCit_gen &cg )
{
    if ( FIELD_IS_SET(cg, Cit)  &&  ! GET_FIELD(cg, Cit).empty() ) {
        return false;
    }
    if ( FIELD_IS_SET(cg, Authors) ) {
        return false;
    }
    if ( FIELD_IS_SET(cg, Muid)  &&  ! ( GET_FIELD(cg, Muid) <= 0 ) ) {
        return false;
    }
    if ( FIELD_IS_SET(cg, Journal) ) {
        return false;
    }
    if ( FIELD_IS_SET(cg, Volume)  &&  ! GET_FIELD(cg, Volume).empty() ) {
        return false;
    }
    if ( FIELD_IS_SET(cg, Issue)  &&  ! GET_FIELD(cg, Issue).empty() ) {
        return false;
    }
    if ( FIELD_IS_SET(cg, Pages)  &&  ! GET_FIELD(cg, Pages).empty() ) {
        return false;
    }
    if ( FIELD_IS_SET(cg, Date) ) {
        return false;
    }
    if ( FIELD_IS_SET(cg, Serial_number)  &&  ! ( GET_FIELD(cg, Serial_number) <= 0 ) ) {
        return false;
    }
    if ( FIELD_IS_SET(cg, Title)  &&  ! GET_FIELD(cg, Title).empty() ) {
        return false;
    }
    if ( FIELD_IS_SET(cg, Pmid)  &&  ! ( GET_FIELD(cg, Pmid) <= 0 ) ) {
        return false;
    }
    return true;
}

// Basic cleanup of a Cit-gen.
//   cg            the citation to clean (modified in place)
//   fix_initials  forwarded to AuthListBC for the author list
// Returns eAction_Erase when the Cit-gen is empty (per s_IsEmpty) after
// cleanup, otherwise eAction_Nothing.
CNewCleanup_imp::EAction CNewCleanup_imp::CitGenBC(CCit_gen& cg, bool fix_initials)
{
    if( FIELD_IS_SET(cg, Authors) ) {
        AuthListBC( GET_MUTABLE(cg, Authors), fix_initials );
    }
    if ( FIELD_IS_SET(cg, Cit) ) {
        CCit_gen::TCit& cit = GET_MUTABLE( cg, Cit );
        // normalize a leading "unpublished" (any case) to a capital 'U'
        if (NStr::StartsWith(cit, "unpublished", NStr::eNocase) && cit[0] != 'U' ) {
            cit[0] = 'U';
            ChangeMade(CCleanupChange::eChangePublication);
        }
        // without a journal, volume/pages/issue are dropped
        if (! FIELD_IS_SET(cg, Journal) 
            && ( FIELD_IS_SET(cg, Volume) || FIELD_IS_SET(cg, Pages) || FIELD_IS_SET(cg, Issue))) 
        {
            RESET_FIELD(cg, Volume);
            RESET_FIELD(cg, Pages);
            RESET_FIELD(cg, Issue);
            ChangeMade(CCleanupChange::eChangePublication);
        }
        // trimming can only shorten the string, so a size difference
        // tells us whether anything was removed
        const size_t old_cit_size = cit.size();
        NStr::TruncateSpacesInPlace(cit);
        if (old_cit_size != cit.size()) {
            ChangeMade(CCleanupChange::eChangePublication);
        }
    }
    if ( FIELD_IS_SET(cg, Pages) ) {
        if (RemoveSpaces( GET_MUTABLE(cg, Pages) ) ) {
            ChangeMade(CCleanupChange::eChangePublication);
        }
    }

    // title strstripspaces (see 8728 in sqnutil1.c, Mar 11, 2011)
    if( FIELD_IS_SET(cg, Title) ) {
        x_StripSpacesMarkChanged( GET_MUTABLE(cg, Title) );
    }

    // Strip the serial number when requested.  A change is only
    // reported if there actually was a serial number to remove, so an
    // untouched Cit-gen is not flagged as modified.
    if( m_SeqEntryInfoStack.top().m_StripSerial  &&
        FIELD_IS_SET(cg, Serial_number) )
    {
        RESET_FIELD( cg, Serial_number );
        ChangeMade(CCleanupChange::eStripSerial);
    }

    // erase if the Cit-gen is now entirely blank
    return ( s_IsEmpty(cg) ? eAction_Erase : eAction_Nothing );
}

// Basic cleanup of a Cit-sub.
//   - cleans the author list,
//   - moves Imp.Pub into Authors.Affil when the authors have no
//     affiliation,
//   - copies Imp.Date into Date (and drops Imp) when Date is unset,
//   - strips a leading "to the ... databases" phrase from a string
//     affiliation.
// Always returns eAction_Nothing.
CNewCleanup_imp::EAction CNewCleanup_imp::CitSubBC(CCit_sub& citsub, bool fix_initials)
{
    CRef<CCit_sub::TAuthors> auth_list;
    if ( FIELD_IS_SET(citsub, Authors) ) {
        auth_list.Reset(& GET_MUTABLE(citsub, Authors) );
        AuthListBC( *auth_list, fix_initials);
    }

    if ( FIELD_IS_SET(citsub, Imp) ) {
        CCit_sub::TImp& imprint = GET_MUTABLE(citsub, Imp);

        // imprint publisher -> author affiliation
        if (auth_list  &&  ! FIELD_IS_SET(*auth_list, Affil)  &&  FIELD_IS_SET(imprint, Pub) ) {
            SET_FIELD(*auth_list, Affil, GET_MUTABLE(imprint, Pub) );
            RESET_FIELD(imprint, Pub);
            ChangeMade(CCleanupChange::eChangePublication);
        }

        // imprint date -> submission date; the imprint is then dropped
        if (! FIELD_IS_SET(citsub, Date)  &&  FIELD_IS_SET(imprint, Date) ) {
            GET_MUTABLE(citsub, Date).Assign( GET_FIELD(imprint, Date) );
            RESET_FIELD(citsub, Imp);
            ChangeMade(CCleanupChange::eChangePublication);
        }
    }

    if (auth_list  &&  FIELD_IS_SET(*auth_list, Affil) ) {
        CCit_sub::TAuthors::TAffil& affil = GET_MUTABLE(*auth_list, Affil);
        if ( FIELD_IS(affil, Str) ) {
            // The phrase looked for is "to the " followed, at offset 24,
            // by " databases"; together that is the first 34 characters.
            const string::size_type kMiddleOffset = 24;
            const string::size_type kPhraseLength = 34;

            // work on a copy; it is written back only if the phrase matched
            string text = GET_MUTABLE(affil, Str);
            const bool has_phrase =
                NStr::StartsWith(text, "to the ", NStr::eNocase) &&
                text.size() >= kPhraseLength &&
                NStr::StartsWith(text.substr(kMiddleOffset), " databases", NStr::eNocase);

            if ( has_phrase ) {
                // also swallow a '.' that directly follows the phrase
                const bool dot_follows =
                    ( text.size() > kPhraseLength  &&  text[kPhraseLength] == '.' );
                const string::size_type cut_at =
                    ( dot_follows ? kPhraseLength + 1 : kPhraseLength );
                text = text.substr(cut_at);

                SET_FIELD(affil, Str, text);
                ChangeMade(CCleanupChange::eChangePublication);
                AffilBC(affil);
                if ( s_IsEmpty(affil) ) {
                    RESET_FIELD(*auth_list, Affil);
                    ChangeMade(CCleanupChange::eChangePublication);
                }
            }
        }
    }

    return eAction_Nothing;
}

// Basic cleanup of a Cit-art: cleans the author list, then whichever
// source (book, proceedings or journal) the article comes from.
// Always returns eAction_Nothing.
CNewCleanup_imp::EAction CNewCleanup_imp::CitArtBC(CCit_art& citart, bool fix_initials)
{
    if ( FIELD_IS_SET(citart, Authors) ) {
        AuthListBC( GET_MUTABLE(citart, Authors), fix_initials);
    }

    if ( ! FIELD_IS_SET(citart, From) ) {
        return eAction_Nothing;
    }

    CCit_art::TFrom& source = GET_MUTABLE(citart, From);
    if ( FIELD_IS(source, Book) ) {
        CitBookBC( GET_MUTABLE(source, Book), fix_initials);
    } else if ( FIELD_IS(source, Proc) ) {
        CitProcBC( GET_MUTABLE(source, Proc), fix_initials);
    } else if ( FIELD_IS(source, Journal) ) {
        CitJourBC( GET_MUTABLE(source, Journal), fix_initials);
    }

    return eAction_Nothing;
}

// Basic cleanup of a Cit-book: author list first, then the imprint
// (with pub-status changes forbidden).  Always returns eAction_Nothing.
CNewCleanup_imp::EAction CNewCleanup_imp::CitBookBC(CCit_book& citbook, bool fix_initials)
{
    const EAction kResult = eAction_Nothing;

    if ( FIELD_IS_SET(citbook, Authors) ) {
        AuthListBC( GET_MUTABLE(citbook, Authors), fix_initials);
    }

    if ( ! FIELD_IS_SET(citbook, Imp) ) {
        return kResult;
    }
    ImprintBC( GET_MUTABLE(citbook, Imp), eImprintBC_ForbidStatusChange );

    return kResult;
}

// Basic cleanup of a Cit-pat: each of the three author-style lists
// (Authors, Applicants, Assignees) is cleaned, in that order, when set.
// Always returns eAction_Nothing.
CNewCleanup_imp::EAction CNewCleanup_imp::CitPatBC(CCit_pat& citpat, bool fix_initials)
{
    // inventors
    if ( FIELD_IS_SET(citpat, Authors) ) {
        CAuth_list& inventors = GET_MUTABLE(citpat, Authors);
        AuthListBC( inventors, fix_initials);
    }

    // applicants
    if ( FIELD_IS_SET(citpat, Applicants) ) {
        CAuth_list& applicants = GET_MUTABLE(citpat, Applicants);
        AuthListBC( applicants, fix_initials);
    }

    // assignees
    if ( FIELD_IS_SET(citpat, Assignees) ) {
        CAuth_list& assignees = GET_MUTABLE(citpat, Assignees);
        AuthListBC( assignees, fix_initials);
    }

    return eAction_Nothing;
}

// Basic cleanup of a Cit-let: only a thesis that has its Cit set is
// processed (as a book).  Always returns eAction_Nothing.
CNewCleanup_imp::EAction CNewCleanup_imp::CitLetBC(CCit_let& citlet, bool fix_initials)
{
    const bool is_thesis_with_cit =
        FIELD_IS_SET(citlet, Cit) && FIELD_EQUALS( citlet, Type, CCit_let::eType_thesis );

    if ( ! is_thesis_with_cit ) {
        return eAction_Nothing;
    }

    CitBookBC( GET_MUTABLE(citlet, Cit), fix_initials);
    return eAction_Nothing;
}

// Basic cleanup of a Cit-proc: delegates to CitBookBC for the contained
// book, if there is one.  Always returns eAction_Nothing.
CNewCleanup_imp::EAction CNewCleanup_imp::CitProcBC(CCit_proc& citproc, bool fix_initials)
{
    if ( ! FIELD_IS_SET(citproc, Book) ) {
        return eAction_Nothing;
    }

    CitBookBC( GET_MUTABLE(citproc, Book), fix_initials);
    return eAction_Nothing;
}

// Basic cleanup of a Cit-jour: cleans the imprint (pub-status changes
// allowed) if there is one.  fix_initials is not used here.
// Always returns eAction_Nothing.
CNewCleanup_imp::EAction CNewCleanup_imp::CitJourBC(CCit_jour &citjour, bool fix_initials)
{
    if ( ! FIELD_IS_SET(citjour, Imp) ) {
        return eAction_Nothing;
    }

    ImprintBC( GET_MUTABLE(citjour, Imp), eImprintBC_AllowStatusChange );
    return eAction_Nothing;
}

// Basic cleanup of a Medline-entry: cleans the author list of the
// contained citation when both the citation and its authors are set.
// Always returns eAction_Nothing.
CNewCleanup_imp::EAction CNewCleanup_imp::MedlineEntryBC(CMedline_entry& medline, bool fix_initials)
{
    if ( FIELD_IS_SET(medline, Cit)  &&  FIELD_IS_SET(medline.GetCit(), Authors) ) {
        AuthListBC( GET_MUTABLE(medline.SetCit(), Authors), fix_initials );
    }

    return eAction_Nothing;
}

// True when the author carries no usable name:
//   - Name is not set, or its variant is not set,
//   - a structured name (Name-std) has an unset or blank last name,
//   - an Ml / Str / Consortium name is blank.
// Any other variant is treated as non-empty.
static bool s_IsEmpty(const CAuthor& auth)
{
    if (! FIELD_IS_SET(auth, Name)) {
        return true;
    }
    
    const CAuthor::TName& name = GET_FIELD(auth, Name);
    
    // set only for the plain-string variants; checked after the switch
    const string* str = NULL;
    switch (name.Which()) {
        case CAuthor::TName::e_not_set:
            return true;
            
        case CAuthor::TName::e_Name:
        {{
            const CName_std& nstd = name.GetName();
            // last name is required; a name without one is empty no
            // matter what the other fields hold (this also covers the
            // "every field is blank" situation)
            if( (!nstd.IsSetLast()      ||  NStr::IsBlank(nstd.GetLast())) ) {
                return true;
            }
            break;
        }}
            
        case CAuthor::TName::e_Ml:
            str = &GET_FIELD(name, Ml);
            break;
        case CAuthor::TName::e_Str:
            str = &GET_FIELD(name, Str);
            break;
        case CAuthor::TName::e_Consortium:
            str = &GET_FIELD(name, Consortium);
            break;
            
        default:
            break;
    }

    return (str != NULL  &&  NStr::IsBlank(*str));
}

// Resetting the author names outright would leave invalid ASN.1, so the
// names are replaced by a string list holding the single placeholder "?".
static
void s_ResetAuthorNames (CAuth_list::TNames& names) 
{
    names.Reset();
    // SetStr() selects the string-list variant; make it hold exactly "?"
    names.SetStr().assign( 1u, string("?") );
}

// Basic cleanup of an Auth-list.
//   al            the author list to clean (modified in place)
//   fix_initials  forwarded to x_AuthorBC for each structured author
// Cleans the affiliation (dropping it if it ends up empty), cleans the
// names according to their variant, and guarantees that Names is set
// afterwards by inserting the placeholder "?" where needed.
void CNewCleanup_imp::AuthListBC( CAuth_list& al, bool fix_initials )
{
    if ( FIELD_IS_SET(al, Affil) ) {
        AffilBC( GET_MUTABLE(al, Affil) );
        if (s_IsEmpty( GET_FIELD(al, Affil) )) {
            RESET_FIELD(al, Affil);
            ChangeMade(CCleanupChange::eChangePublication);
        }
    }
    if ( FIELD_IS_SET(al, Names) ) {
        typedef CAuth_list::TNames TNames;
        switch ( GET_MUTABLE(al, Names).Which() ) {
            case TNames::e_Ml:
            {{
                if (ConvertAuthorContainerMlToStd(al)) {
                    ChangeMade(CCleanupChange::eChangePublication);
                }
            }}
            // !!!!!!!!!!!!!!!!!!!!!!
            // !!!!!FALL-THROUGH!!!!!
            // !!!!!!!!!!!!!!!!!!!!!!
            // ( since we just converted the ml to an std, we need to do the
            //   std clean-up step )
            case TNames::e_Std:
            {{
                // The "names" variable is not above the switch() because
                // the case fall-through means it may have been invalidated.
                TNames& names = GET_MUTABLE(al, Names);
                // call BasicCleanup for each CAuthor
                // (authors that are empty after cleanup are erased)
                EDIT_EACH_AUTHOR_ON_AUTHLIST( it, al ) {
                    x_AuthorBC(**it, fix_initials);
                    if( s_IsEmpty(**it) ) {
                        ERASE_AUTHOR_ON_AUTHLIST( it, al );
                        ChangeMade(CCleanupChange::eChangePublication);
                    }
                }
                // nothing left: fall back to the "?" placeholder
                if ( AUTHOR_ON_AUTHLIST_IS_EMPTY(al) ) {
                    s_ResetAuthorNames (names);
                    ChangeMade(CCleanupChange::eChangePublication);
                }
                break;
            }}
            case TNames::e_Str:
            {{
                TNames& names = GET_MUTABLE(al, Names);
                // compress spaces in each name, then clean the container
                EDIT_EACH_STRING_IN_LIST( str_iter, GET_MUTABLE(names, Str) ) {
                    x_CompressStringSpacesMarkChanged(*str_iter);
                }
                if (CleanVisStringContainer( GET_MUTABLE(names, Str) )) {
                    ChangeMade(CCleanupChange::eChangePublication);
                }
                // nothing left: fall back to the "?" placeholder
                if (names.GetStr().empty()) {
                    s_ResetAuthorNames (names);
                    ChangeMade(CCleanupChange::eChangePublication);
                }
                break;
            }}
            default:
                break;
        }
    }
    // if no remaining authors, put in default author for legal ASN.1
    if (! FIELD_IS_SET(al, Names) ) {
        al.SetNames().SetStr().push_back("?");
        ChangeMade(CCleanupChange::eChangePublication);
    }
}

// Basic cleanup of an affiliation.
//   Str variant: compresses spaces in, then cleans, the free-text string.
//   Std variant: cleans each member string and rewrites the country
//                "U.S.A." (any case) as "USA".
//   Any other variant is left untouched.
void CNewCleanup_imp::AffilBC( CAffil& af )
{
    if ( FIELD_IS(af, Str) ) {
        string& free_text = GET_MUTABLE(af, Str);
        x_CompressStringSpacesMarkChanged( free_text );
        x_CleanupStringMarkChanged( free_text );
        return;
    }

    if ( ! FIELD_IS(af, Std) ) {
        return;
    }

    CAffil::TStd& fields = GET_MUTABLE(af, Std);

    CLEAN_AND_COMPRESS_STRING_MEMBER_JUNK(fields, Affil);
    CLEAN_AND_COMPRESS_STRING_MEMBER_JUNK(fields, Div);
    CLEAN_AND_COMPRESS_STRING_MEMBER_JUNK(fields, City);
    CLEAN_AND_COMPRESS_STRING_MEMBER_JUNK(fields, Sub);
    CLEAN_AND_COMPRESS_STRING_MEMBER_JUNK(fields, Country);
    CLEAN_AND_COMPRESS_STRING_MEMBER_JUNK(fields, Street);
    CLEAN_AND_COMPRESS_STRING_MEMBER_JUNK(fields, Email);
    CLEAN_AND_COMPRESS_STRING_MEMBER_JUNK(fields, Fax);
    CLEAN_AND_COMPRESS_STRING_MEMBER_JUNK(fields, Phone);
    CLEAN_AND_COMPRESS_STRING_MEMBER_JUNK(fields, Postal_code);

    // normalize the dotted abbreviation of the country name
    if ( fields.CanGetCountry()  &&
         NStr::EqualNocase(fields.GetCountry(), "U.S.A.") )
    {
        SET_FIELD( fields, Country, "USA");
        ChangeMade (CCleanupChange::eChangePublication);
    }
}

// Basic cleanup of an Imprint.
// When status changes are allowed, Prepub is brought in line with
// Pubstatus:
//   ahead-of-print, volume or pages missing/blank -> Prepub = in-press
//   ahead-of-print, volume and pages both present -> in-press is removed
//   epublish                                      -> in-press is removed
// Afterwards the string members are cleaned and compressed.
void CNewCleanup_imp::ImprintBC( CImprint& imprint, EImprintBC is_status_change_allowed )
{
    if( is_status_change_allowed == eImprintBC_AllowStatusChange ) {

        // Pubstatus itself is never modified below, so the two
        // Pubstatus cases cannot both apply to one imprint.
        if ( FIELD_EQUALS(imprint, Pubstatus, ePubStatus_aheadofprint) ) {
            const bool has_volume_and_pages =
                imprint.IsSetVolume() && !NStr::IsBlank (imprint.GetVolume()) &&
                imprint.IsSetPages()  && !NStr::IsBlank (imprint.GetPages());
            const bool marked_in_press =
                FIELD_EQUALS(imprint, Prepub, CImprint::ePrepub_in_press);

            if ( ! marked_in_press  &&  ! has_volume_and_pages ) {
                SET_FIELD(imprint, Prepub, CImprint::ePrepub_in_press);
                ChangeMade (CCleanupChange::eChangePublication);
            } else if ( marked_in_press  &&  has_volume_and_pages ) {
                RESET_FIELD(imprint, Prepub);
                ChangeMade (CCleanupChange::eChangePublication);
            }
        }

        if ( FIELD_EQUALS(imprint, Pubstatus, ePubStatus_epublish) &&
             FIELD_EQUALS(imprint, Prepub, CImprint::ePrepub_in_press) )
        {
            RESET_FIELD(imprint, Prepub);
            ChangeMade (CCleanupChange::eChangePublication);
        }
    }

    // string members
    CLEAN_AND_COMPRESS_STRING_MEMBER(imprint, Volume);
    CLEAN_AND_COMPRESS_STRING_MEMBER(imprint, Issue);
    CLEAN_AND_COMPRESS_STRING_MEMBER(imprint, Pages);
    CLEAN_AND_COMPRESS_STRING_MEMBER(imprint, Section);
    CLEAN_AND_COMPRESS_STRING_MEMBER(imprint, Part_sup);
    CLEAN_AND_COMPRESS_STRING_MEMBER(imprint, Language);
    CLEAN_AND_COMPRESS_STRING_MEMBER(imprint, Part_supi);
}

// A publication paired with the label it is sorted by.
typedef pair<string, CRef<CPub> >   TCit;

// Ordering for TCit, usable as the comparator of a std::set.
// Entries are ordered by label (case-insensitively first, then
// case-sensitively); entries with identical labels are further ordered
// by Cit-gen title when both pubs are Cit-gens.
struct TSortCit {
    // Returns true when c1 sorts strictly before c2.
    bool operator ()(const TCit& c1, const TCit& c2) const {

        // First, try to compare case-insensitively
        // (We compare as if it were all-caps to match C's behavior )
        const int label_compare_no_case =  s_CompareNoCaseCStyle(c1.first, c2.first);
        if( label_compare_no_case != 0 ) {
            return (label_compare_no_case < 0);
        }

        // if they're the same, try to compare case-sensitively
        const int label_compare_case = NStr::CompareCase( c1.first, c2.first );
        if( label_compare_case != 0 ) {
            return (label_compare_case < 0);
        }

        // if they're still the same, fall back on cit-gen titles, if possible
        return CitGenTitlesLess(*c1.second, *c2.second);
    }

    // Tie-breaker on Cit-gen titles.  Returns true when p1 sorts strictly
    // before p2; returns false (i.e. "equivalent") unless both are Cit-gens.
    bool CitGenTitlesLess(const CPub& p1, const CPub& p2) const {
        if ( ! p1.IsGen()  || ! p2.IsGen() ) {
            return false;
        }
        const CCit_gen& g1 = p1.GetGen();
        const CCit_gen& g2 = p2.GetGen();

        const bool has_title1 = g1.IsSetTitle();
        const bool has_title2 = g2.IsSetTitle();
        if ( has_title1 != has_title2 ) {
            // Exactly one of the two has a title: the one WITHOUT a title
            // sorts first.  The result must differ depending on argument
            // order; returning true both ways would violate the strict
            // weak ordering that std::set relies on.
            return ! has_title1;
        }
        if ( ! has_title1 ) {
            // neither has a title: equivalent
            return false;
        }
        return g1.GetTitle() < g2.GetTitle();
    }
};

static
bool cmpSortedvsOld(const TCit& e1, const CRef<CPub>& e2) {
    return e1.second == e2;
}

// Sorts the pubs of a Pub-set by content label and removes duplicates.
// Only the "pub" variant is handled.  The list is rewritten (and a
// change recorded) only if sorting/deduplication actually alters it.
void CNewCleanup_imp::PubSetBC( CPub_set &pub_set )
{
    // The Pub-set should always be pub. Ignore if not.
    if( ! FIELD_IS( pub_set, Pub ) ) {
        return;
    }

    // Collect every pub into a set keyed by its label; the set both
    // orders the entries and rejects duplicates.
    typedef set<TCit, TSortCit> TSortedCits;
    TSortedCits sorted_cits;
    ITERATE (CPub_set::TPub, pub_it, pub_set.GetPub()) {
        string content_label;
        (*pub_it)->GetLabel(&content_label, CPub::eContent, false);
        // insert() may decline an entry equivalent to one already held;
        // that is intended, it is how duplicates are dropped
        sorted_cits.insert( TCit(content_label, *pub_it) );
    }

    CPub_set::TPub& pubs = pub_set.SetPub();

    // Same number of entries, in the same order?  Then nothing to do.
    const bool unchanged =
        sorted_cits.size() == pubs.size() &&
        equal(sorted_cits.begin(), sorted_cits.end(), pubs.begin(), cmpSortedvsOld);
    if ( unchanged ) {
        return;
    }

    // Rebuild the list from the sorted, de-duplicated entries.
    pubs.clear();
    ITERATE (TSortedCits, sorted_it, sorted_cits) {
        pubs.push_back(sorted_it->second);
    }
    ChangeMade(CCleanupChange::eCleanCitonFeat);
}

// Basic cleanup of an import feature (does nothing unless feat.Data is
// set and is Imp).
//   - cleans the Key, Loc and Descr strings,
//   - renames a fixed set of alternate keys,
//   - for records that are not EMBL/DDBJ: turns "satellite" into a
//     "repeat_region" with a satellite qualifier, adds a satellite
//     qualifier extracted from the comment of a "repeat_region", and
//     converts a "CDS" import feature into a Cdregion feature,
//   - if Loc contains "replace", hands it to x_AddReplaceQual and
//     resets Loc,
//   - otherwise, when Loc is not set, converts RNA keys into an RNA-ref
//     feature and protein keys into a Prot-ref feature.
void CNewCleanup_imp::ImpFeatBC( CSeq_feat& feat )
{
    if( ! FIELD_IS_SET_AND_IS(feat, Data, Imp) ) {
        return;
    }

    CImp_feat &imf = GET_MUTABLE( feat.SetData(), Imp );

    CLEAN_STRING_MEMBER_JUNK(imf, Key);
    CLEAN_STRING_MEMBER(imf, Loc);
    CLEAN_STRING_MEMBER(imf, Descr);
    
    if ( FIELD_IS_SET(imf, Key) ) {
        // NOTE(review): 'key' is a reference taken before the SET_FIELD
        // calls below.  The later "repeat_region" check appears to rely
        // on it reflecting the updated key (e.g. after repeat_unit or
        // satellite was renamed) -- confirm that SET_FIELD assigns to
        // the same string object.
        const CImp_feat::TKey& key = GET_FIELD(imf, Key);
        if (key == "allele"  ||  key == "mutation") {
            SET_FIELD(imf, Key, "variation");
            ChangeMade(CCleanupChange::eChangeKeywords);
        } else if ( key == "Import" || key == "virion" ) {
            SET_FIELD(imf, Key, "misc_feature");
            ChangeMade(CCleanupChange::eChangeKeywords);
        } else if ( key == "repeat_unit" ) {
            SET_FIELD(imf, Key, "repeat_region");
            ChangeMade(CCleanupChange::eChangeKeywords);
        } else if ( key == "misc_bind" ) {
            SET_FIELD(imf, Key, "misc_binding");
            ChangeMade(CCleanupChange::eChangeKeywords);
        } else if ( key == "satellite" && ! m_SeqEntryInfoStack.top().m_IsEmblOrDdbj ) {
            SET_FIELD(imf, Key, "repeat_region");
            ChangeMade(CCleanupChange::eChangeKeywords);

            // add a satellite qualifier whose value comes from the
            // comment if possible, else the literal "satellite"
            CRef<CGb_qual> satellite_qual( new CGb_qual );
            satellite_qual->SetQual("satellite");
            string val;
            if( FIELD_IS_SET(feat, Comment) ) {
                val = x_ExtractSatelliteFromComment( GET_MUTABLE(feat, Comment) );
            }
            if( val.empty() ) {
                val = "satellite";
            }
            satellite_qual->SetVal( val );

            feat.SetQual().push_back( satellite_qual );
        }

        // for a repeat_region, a satellite qualifier is added only when
        // a value could be extracted from the comment
        if( key == "repeat_region" && ! m_SeqEntryInfoStack.top().m_IsEmblOrDdbj ) {
            string val;
            if( FIELD_IS_SET(feat, Comment) ) {
                val = x_ExtractSatelliteFromComment( GET_MUTABLE(feat, Comment) );
            }
            if( ! val.empty() ) {
                CRef<CGb_qual> satellite_qual( new CGb_qual );
                satellite_qual->SetQual("satellite");
                satellite_qual->SetVal( val );

                feat.SetQual().push_back( satellite_qual );
                ChangeMade(CCleanupChange::eChangeKeywords);
            }
        }

        // an import feature with key "CDS" becomes a real Cdregion
        if( key == "CDS" ) {
            if( ! m_SeqEntryInfoStack.top().m_IsEmblOrDdbj ) {
                CRef<CCdregion> new_cdregion( new CCdregion );
                // get frame from location
                if( ! FIELD_EQUALS( feat, Pseudo, true ) && FIELD_IS_SET(feat, Location) ) {
                    x_SetFrameFromLoc( *new_cdregion, GET_FIELD(feat, Location) );
                }
                // this replaces the feature's data; 'imf' and 'key' are
                // not used again before the return below
                feat.SetData().SetCdregion( *new_cdregion );
                ChangeMade(CCleanupChange::eChangeKeywords);

                CdregionFeatBC( *new_cdregion, feat );
                return;
            }
        }
    }

    if( FIELD_IS_SET(imf, Loc) ) {
        // a Loc string mentioning "replace" is turned into a qualifier
        if ( NStr::Find(imf.GetLoc(), "replace") != NPOS ) {
            x_AddReplaceQual(feat, imf.GetLoc());
            RESET_FIELD(imf, Loc);
            ChangeMade(CCleanupChange::eChangeQualifiers);
        }
    } else if( FIELD_IS_SET(imf, Key) ) {
        // reached only when Loc is not set
        const string &key = GET_FIELD(imf, Key);

        // map RNA-like keys onto an RNA-ref type
        TRNAREF_TYPE rna_ref_type = NCBI_RNAREF(unknown);
        if ( key == "precursor_RNA" ) {
            rna_ref_type = NCBI_RNAREF(premsg);
        } else if ( key == "mRNA" ) {
            rna_ref_type = NCBI_RNAREF(mRNA);
        } else if ( key == "tRNA" ) {
            rna_ref_type = NCBI_RNAREF(tRNA);
        } else if ( key == "rRNA" ) {
            rna_ref_type = NCBI_RNAREF(rRNA);
        } else if ( key == "snRNA" ) {
            rna_ref_type = NCBI_RNAREF(snRNA);
        } else if ( key == "scRNA" ) {
            rna_ref_type = NCBI_RNAREF(scRNA);
        } else if ( key == "snoRNA" ) {
            rna_ref_type = NCBI_RNAREF(snoRNA);
        } else if ( key == "misc_RNA" ) {
            rna_ref_type = NCBI_RNAREF(other);
        }
        if (rna_ref_type != NCBI_RNAREF(unknown) ) {
            CRef<CRNA_ref> new_rna_ref( new CRNA_ref );
            new_rna_ref->SetType( rna_ref_type );
            feat.SetData().SetRna( *new_rna_ref );
            ChangeMade(CCleanupChange::eAddRNAref);

            // autogenerated code won't traverse this.
            // Also we create a NEW CAutogeneratedCleanup because
            // CAutogeneratedCleanup  is stateful and we don't
            // want to interfere with its state.
            CAutogeneratedCleanup auto_cleanup( *m_Scope, *this );
            auto_cleanup.BasicCleanupSeqFeat( feat );
        } else {
            // map protein-like keys onto a Prot-ref "processed" value
            TPROTREF_PROCESSED processed = NCBI_PROTREF(not_set);
            if ( key == "proprotein" ||  key == "preprotein" ) {
                processed = NCBI_PROTREF(preprotein);
            } else if ( key == "mat_peptide" ) {
                processed = NCBI_PROTREF(mature);
            } else if ( key == "sig_peptide" ) {
                processed = NCBI_PROTREF(signal_peptide);
            } else if ( key == "transit_peptide" ) {
                processed = NCBI_PROTREF(transit_peptide);
            }
            // key "Protein" is converted too, with processed left not_set
            if (processed != NCBI_PROTREF(not_set) || key == "Protein" ) {
                // the conversion is only done when the feature's location
                // resolves to an amino-acid bioseq in the scope
                const CSeq_id* location_seq_id = ( feat.IsSetLocation() ? feat.GetLocation().GetId() : NULL );
                if( location_seq_id ) {
                    CBioseq_Handle bioseq_handle = m_Scope->GetBioseqHandle(*location_seq_id);
                    if ( bioseq_handle && bioseq_handle.IsAa() ) {
                        CRef<CProt_ref> new_prot_ref( new CProt_ref );
                        new_prot_ref->SetProcessed( processed );
                        feat.SetData().SetProt( *new_prot_ref );
                        ChangeMade(CCleanupChange::eAddProtFeat);

                        // autogenerated code won't traverse this.
                        // Also we create a NEW CAutogeneratedCleanup because
                        // CAutogeneratedCleanup  is stateful and we don't
                        // want to interfere with its state.
                        CAutogeneratedCleanup auto_cleanup( *m_Scope, *this );
                        auto_cleanup.BasicCleanupSeqFeat( feat );
                    }
                }
            }
        }
    }
}


// Table mapping the textual name of a site (as it may appear at the start
// of a feature comment) to the corresponding CSeqFeatData site value.
// It is consulted by SiteFeatBC through s_FindInMapAsPrefix.
//
// Most multi-word names are listed twice: once with spaces and once with
// hyphens.  "modifi" is deliberately a word stem, since lookups are done
// by prefix (presumably so that both "modified" and "modification" match
// -- confirm).
//
// NOTE(review): the entries appear in case-insensitive alphabetical order;
// CStaticArrayMap presumably depends on that ordering, so keep it when
// adding new entries -- confirm.
typedef SStaticPair<const char*, CSeqFeatData::TSite>  TSiteElem;
static const TSiteElem sc_site_map[] = {
    { "acetylation", CSeqFeatData::eSite_acetylation },
    { "active", CSeqFeatData::eSite_active },
    { "amidation", CSeqFeatData::eSite_amidation },
    { "binding", CSeqFeatData::eSite_binding },
    { "blocked", CSeqFeatData::eSite_blocked },
    { "cleavage", CSeqFeatData::eSite_cleavage },
    { "dna binding", CSeqFeatData::eSite_dna_binding },
    { "dna-binding", CSeqFeatData::eSite_dna_binding },
    { "gamma carboxyglutamic acid", CSeqFeatData::eSite_gamma_carboxyglutamic_acid },
    { "gamma-carboxyglutamic-acid", CSeqFeatData::eSite_gamma_carboxyglutamic_acid },
    { "glycosylation", CSeqFeatData::eSite_glycosylation },
    { "hydroxylation", CSeqFeatData::eSite_hydroxylation },
    { "inhibit", CSeqFeatData::eSite_inhibit },
    { "lipid binding", CSeqFeatData::eSite_lipid_binding },
    { "lipid-binding", CSeqFeatData::eSite_lipid_binding },
    { "metal binding", CSeqFeatData::eSite_metal_binding },
    { "metal-binding", CSeqFeatData::eSite_metal_binding },
    { "methylation", CSeqFeatData::eSite_methylation },
    { "modifi", CSeqFeatData::eSite_modified },
    { "mutagenized", CSeqFeatData::eSite_mutagenized },
    { "myristoylation", CSeqFeatData::eSite_myristoylation },
    { "nitrosylation", CSeqFeatData::eSite_nitrosylation },
    { "np binding", CSeqFeatData::eSite_np_binding },
    { "np-binding", CSeqFeatData::eSite_np_binding },
    { "oxidative deamination", CSeqFeatData::eSite_oxidative_deamination },
    { "oxidative-deamination", CSeqFeatData::eSite_oxidative_deamination },
    { "phosphorylation", CSeqFeatData::eSite_phosphorylation },
    { "pyrrolidone carboxylic acid", CSeqFeatData::eSite_pyrrolidone_carboxylic_acid },
    { "pyrrolidone-carboxylic-acid", CSeqFeatData::eSite_pyrrolidone_carboxylic_acid },
    { "signal peptide", CSeqFeatData::eSite_signal_peptide },
    { "signal-peptide", CSeqFeatData::eSite_signal_peptide },
    { "sulfatation", CSeqFeatData::eSite_sulfatation },
    { "transit peptide", CSeqFeatData::eSite_transit_peptide },
    { "transit-peptide", CSeqFeatData::eSite_transit_peptide },
    { "transmembrane region", CSeqFeatData::eSite_transmembrane_region },
    { "transmembrane-region", CSeqFeatData::eSite_transmembrane_region }
};
// Map type over the table above; PNocase makes key comparison
// case-insensitive.
typedef CStaticArrayMap<string, CSeqFeatData::TSite, PNocase> TSiteMap;
DEFINE_STATIC_ARRAY_MAP_WITH_COPY(TSiteMap, sc_SiteMap, sc_site_map);

// Basic cleanup of a Site feature.
//
// When the site is unset (0) or "other" and the feature has a comment, the
// start of the comment is matched against sc_SiteMap.  On a match the
// feature's site is set to the mapped value, and the comment is dropped if
// nothing remains after the matched name other than blanks or the word
// " site".
//
// site - the current site value of the feature
// feat - the feature being cleaned; its data and comment may be modified
void CNewCleanup_imp::SiteFeatBC( CSeqFeatData::ESite &site, CSeq_feat& feat )
{
    // nothing to mine without a comment
    if ( ! FIELD_IS_SET(feat, Comment) ) {
        return;
    }

    // a specific site is already recorded: leave it alone
    if ( site != CSeqFeatData::TSite(0)  &&  site != CSeqFeatData::eSite_other ) {
        return;
    }

    // does the comment begin with one of the known site names?
    const string& comment_text = GET_FIELD(feat, Comment);
    TSiteMap::const_iterator match =
        s_FindInMapAsPrefix<TSiteMap>( comment_text, sc_SiteMap );
    if ( match == sc_SiteMap.end() ) {
        return;
    }

    feat.SetData().SetSite(match->second);
    ChangeMade(CCleanupChange::eChangeSite);

    // the comment is redundant if all it said was the site name,
    // optionally followed by the word "site"
    const string::size_type name_len = match->first.length();
    if ( NStr::IsBlank(comment_text, name_len)  ||
         NStr::EqualNocase(comment_text, name_len, NPOS, " site") ) {
        feat.ResetComment();
        ChangeMade(CCleanupChange::eChangeComment);
    }
}

// Basic cleanup of a single Seq-loc, by choice:
//   Int        - delegate to x_SeqIntervalBC
//   Packed-int - clean every interval; a list of one becomes a plain Int
//   Pnt        - normalize strand (both -> plus, both-rev -> minus,
//                unknown -> unset) and turn "tl" fuzz into "tr" fuzz on
//                the preceding position
//   Mix        - strip NULL pieces from both ends; an empty mix becomes
//                NULL and a one-piece mix is replaced by that piece
// Afterwards, strand is removed if the location is on a protein.
void CNewCleanup_imp::SeqLocBC( CSeq_loc &loc )
{
    switch (loc.Which()) {

    case CSeq_loc::e_Int :
        x_SeqIntervalBC( GET_MUTABLE(loc, Int) );
        break;

    case CSeq_loc::e_Packed_int :
        {
            typedef CSeq_loc::TPacked_int::Tdata TIntervalList;
            TIntervalList& intervals = loc.SetPacked_int().Set();

            for (TIntervalList::iterator ival_it = intervals.begin();
                 ival_it != intervals.end();  ++ival_it) {
                x_SeqIntervalBC(**ival_it);
            }

            // a packed-int with a single member is just an interval
            if (intervals.size() == 1) {
                // hold our own reference: SetInt discards the packed-int
                CRef<CSeq_interval> sole_interval = intervals.front();
                loc.SetInt(*sole_interval);
                ChangeMade(CCleanupChange::eChangeSeqloc);
            }
        }
        break;

    case CSeq_loc::e_Pnt :
        {
            CSeq_loc::TPnt& point = loc.SetPnt();

            // change both and both-rev to plus and minus, respectively,
            // and drop an explicit "unknown"
            if (point.CanGetStrand()) {
                switch (point.GetStrand()) {
                case eNa_strand_both:
                    point.SetStrand(eNa_strand_plus);
                    ChangeMade(CCleanupChange::eChangeStrand);
                    break;
                case eNa_strand_both_rev:
                    point.SetStrand(eNa_strand_minus);
                    ChangeMade(CCleanupChange::eChangeStrand);
                    break;
                case eNa_strand_unknown:
                    point.ResetStrand();
                    ChangeMade(CCleanupChange::eChangeStrand);
                    break;
                default:
                    break;
                }
            }

            // normalize Seq-point fuzz tl to tr and decrement position
            // (not possible at position 0)
            const bool fuzz_is_tl =
                point.IsSetFuzz()  &&  point.GetFuzz().IsLim()  &&
                point.GetFuzz().GetLim() == CInt_fuzz::eLim_tl;
            if (fuzz_is_tl) {
                const TSeqPos old_pos = point.GetPoint();
                if (old_pos > 0) {
                    point.SetFuzz().SetLim(CInt_fuzz::eLim_tr);
                    point.SetPoint(old_pos - 1);
                    ChangeMade(CCleanupChange::eChangeSeqloc);
                }
            }
        }
        break;

    case CSeq_loc::e_Mix :
        {
            typedef CSeq_loc::TMix::Tdata TMixList;
            TMixList& pieces = loc.SetMix().Set();

            // strip NULL pieces off the front, one at a time
            while ( ! pieces.empty()  &&  pieces.front()->IsNull() ) {
                pieces.pop_front();
                ChangeMade(CCleanupChange::eChangeSeqloc);
            }

            // strip the run of NULL pieces off the back, all at once
            if ( ! pieces.empty() ) {
                // walk backwards to the last non-NULL piece
                TMixList::iterator tail_it = pieces.end();
                do {
                    --tail_it;
                } while ( tail_it != pieces.begin()  &&  (*tail_it)->IsNull() );

                // everything after it is trailing NULLs
                ++tail_it;
                if (tail_it != pieces.end()) {
                    pieces.erase(tail_it, pieces.end());
                    ChangeMade(CCleanupChange::eChangeSeqloc);
                }
            }

            // collapse degenerate mixes
            switch (pieces.size()) {
            case 0:
                loc.SetNull();
                ChangeMade(CCleanupChange::eChangeSeqloc);
                break;
            case 1:
                {
                    // keep the piece alive while loc is overwritten
                    CRef<CSeq_loc> sole_piece = pieces.front();
                    loc.Assign(*sole_piece);
                    ChangeMade(CCleanupChange::eChangeSeqloc);
                }
                break;
            default:
                break;
            }
        }
        break;

    default:
        break;
    }

    // don't allow strandedness on protein sequences
    {
        // use the first id in the location that resolves in our scope
        CBioseq_Handle resolved_seq;
        if (m_Scope) {
            ITERATE( CSeq_loc, part_ci, loc ) {
                resolved_seq = m_Scope->GetBioseqHandle(part_ci.GetSeq_id());
                if( resolved_seq ) {
                    break;
                }
            }
        }
        if ( resolved_seq && resolved_seq.IsProtein() && FIELD_IS_SET(loc, Strand) ) {
            RESET_FIELD(loc, Strand);
            ChangeMade(CCleanupChange::eChangeStrand);
        }
    }
}

// Replace a Seq-loc of type "whole" by an interval [0, length-1] on the
// same Seq-id.
//
// Nothing is done when the location is not "whole", when there is no scope,
// when the sequence cannot be found in the scope, or when the sequence has
// length zero (no valid interval exists in that case).
void CNewCleanup_imp::ConvertSeqLocWholeToInt( CSeq_loc &loc )
{
    if ( ! loc.IsWhole()  ||  ! m_Scope ) {
        return;
    }

    // Take a private copy of the id: it is needed after the location
    // stops being "whole".
    CRef<CSeq_id> id(new CSeq_id());
    id->Assign(loc.GetWhole());

    CBioseq_Handle bsh = m_Scope->GetBioseqHandle(*id);
    if ( ! bsh ) {
        return;
    }

    const TSeqPos bs_len = bsh.GetBioseqLength();
    if ( bs_len == 0 ) {
        // TSeqPos is unsigned, so "bs_len - 1" would wrap around and
        // produce a bogus end position.  Leave the location as "whole".
        return;
    }

    // change the Seq-loc/whole to a Seq-loc/interval which covers the
    // whole sequence.
    CSeq_interval& whole_interval = loc.SetInt();
    whole_interval.SetId(*id);
    whole_interval.SetFrom(0);
    whole_interval.SetTo(bs_len - 1);
    ChangeMade(CCleanupChange::eChangeWholeLocation);
}

// Recursively flatten mix_pieces into new_mix_pieces.
//
// NULL pieces are dropped and nested mixes are expanded in place.  When
// any_nulls_seen is true, a fresh NULL piece is inserted between every two
// consecutive pieces appended to new_mix_pieces.
//
// new_mix_pieces - destination list, appended to
// mix_pieces     - source list (may contain NULLs and nested mixes)
// any_nulls_seen - whether NULL separators should be emitted
static void 
s_AddSeqLocMix( CSeq_loc_mix::Tdata & new_mix_pieces, 
               CSeq_loc_mix::Tdata & mix_pieces, 
               bool any_nulls_seen )
{
    for( CSeq_loc_mix::Tdata::iterator src_it = mix_pieces.begin();
         src_it != mix_pieces.end();  ++src_it )
    {
        CRef<CSeq_loc> piece( *src_it );

        // NULLs from the source are never copied; separators are
        // regenerated below
        if( piece->IsNull() ) {
            continue;
        }

        // nested mix: splice its contents in at this position
        if( piece->IsMix() ) {
            s_AddSeqLocMix( new_mix_pieces, piece->SetMix(), 
                any_nulls_seen );
            continue;
        }

        // ordinary piece, preceded by a NULL separator unless it is the
        // very first piece of the output
        const bool need_separator = ( any_nulls_seen && ! new_mix_pieces.empty() );
        if( need_separator ) {
            CRef<CSeq_loc> separator( new CSeq_loc );
            separator->SetNull();
            new_mix_pieces.push_back( separator );
        }
        new_mix_pieces.push_back( piece );
    }
}

// Basic cleanup of a Seq-loc-mix.
//
// The mix is rebuilt (via s_AddSeqLocMix) when either
//   - it directly contains another mix, or
//   - it contains NULL pieces that do not strictly alternate with
//     non-NULL pieces in the pattern notnull-null-...-null-notnull.
// The rebuilt mix is flat and, if any NULL was present, has exactly one
// NULL between every two consecutive pieces.  A rebuild is recorded as
// CCleanupChange::eChangeSeqloc.
void CNewCleanup_imp::SeqLocMixBC( CSeq_loc_mix & loc_mix )
{
    if( ! loc_mix.IsSet() || loc_mix.Set().empty() ) {
        return;
    }

    // This function does two things simultaneously:
    // It checks for mix-inside-mix and also checks if 
    // we need to do "NULL-normalization"
    bool have_seen_inner_mix = false;
    bool any_nulls_seen = false;
    bool alternates_not_null_then_null = true;

    CSeq_loc_mix::Tdata & mix_pieces = loc_mix.Set();
    if( (mix_pieces.size() % 2) == 0 ) {
        // can't do notnull-null-notnull-null-notnull-....-null-notnull
        // if we have an even number of items
        alternates_not_null_then_null = false;
    }

    // Starting with "true" makes a leading NULL count as a break of the
    // alternation, since the pattern has to begin with a non-NULL piece.
    bool last_piece_was_null = true;
    ITERATE( CSeq_loc_mix::Tdata, outer_mix_iter, mix_pieces ) {
        const CSeq_loc &this_piece = **outer_mix_iter;
        const bool this_piece_is_null = this_piece.IsNull();

        // see if we've found any NULLs in this loc
        if( this_piece_is_null ) {
            any_nulls_seen = true;
        }

        // see if we break alternation of notnull and null
        if( alternates_not_null_then_null ) {
            if( this_piece_is_null == last_piece_was_null ) {
                // two of the same kind in a row: does not alternate
                alternates_not_null_then_null = false;
            }
        }

        // see if there's a nested mix in here
        if( this_piece.IsMix() ) {
            have_seen_inner_mix = true;
            alternates_not_null_then_null = false; // mix breaks alternation
            // We have to check if the inner-mix contains any NULLs
            if( ! any_nulls_seen ) {
                CSeq_loc_CI inner_ci( this_piece, CSeq_loc_CI::eEmpty_Allow );
                for( ; inner_ci; ++inner_ci ) {
                    if( inner_ci.IsEmpty() ) {
                        any_nulls_seen = true;
                        break; // one is enough
                    }
                }
            }
        }

        // for next iteration
        last_piece_was_null = this_piece_is_null;
    }

    // we've examined the location, so if there are any problems, we have
    // to rebuild it.
    if( have_seen_inner_mix || 
        (any_nulls_seen && ! alternates_not_null_then_null) ) 
    {
        CSeq_loc_mix new_mix;
        CSeq_loc_mix::Tdata & new_mix_pieces = new_mix.Set();

        // has to be in a separate function because it's recursive
        s_AddSeqLocMix( new_mix_pieces, mix_pieces, any_nulls_seen );

        // swap is faster than assignment
        loc_mix.Set().swap( new_mix_pieces );

        // The rebuilt list always differs from the original (it is flat and
        // strictly alternating, which the original was not), so the change
        // has to be reported like every other location edit.
        ChangeMade(CCleanupChange::eChangeSeqloc);
    }
}

// Return true if str holds nothing but quote characters (" and '),
// blanks and control characters.  The empty string qualifies as well.
//
// Characters are compared as unsigned values so that bytes >= 0x80
// (e.g. parts of a UTF-8 sequence) always count as real content, no matter
// whether plain char is signed on the platform.
static bool s_IsJustQuotes (const string& str)

{
    for (string::const_iterator str_itr = str.begin();
         str_itr != str.end();  ++str_itr) {
        const unsigned char ch = static_cast<unsigned char>(*str_itr);
        if (ch > ' ' && ch != '"' && ch != '\'') return false;
    }
    return true;
}

// Basic cleanup of a single GenBank qualifier, independent of the feature
// it belongs to:
//   - clean the Qual and Val strings and make sure both are set
//   - blank out a Val consisting only of quotes
//   - tidy rpt_unit* values and rename legacy "rpt_unit" to
//     "rpt_unit_range" or "rpt_unit_seq" depending on its value
//   - rename legacy "transposon" / "insertion_seq" / "mobile_element"
//     qualifiers to "mobile_element_type"
void CNewCleanup_imp::GBQualBC (
    CGb_qual& gbq
)

{
    // --- qualifier name: clean, and guarantee it is set
    CLEAN_STRING_MEMBER (gbq, Qual);
    if (! FIELD_IS_SET (gbq, Qual)) {
        SET_FIELD (gbq, Qual, kEmptyStr);
        ChangeMade(CCleanupChange::eChangeQualifiers);
    }

    // --- qualifier value: clean, guarantee it is set, and drop a value
    // that is nothing but quotes
    CLEAN_STRING_MEMBER (gbq, Val);
    if (! FIELD_IS_SET (gbq, Val)) {
        SET_FIELD (gbq, Val, kEmptyStr);
        ChangeMade(CCleanupChange::eChangeQualifiers);
    } else if (s_IsJustQuotes (GET_FIELD (gbq, Val))) {
        SET_FIELD (gbq, Val, kEmptyStr);
        ChangeMade (CCleanupChange::eCleanDoubleQuotes);
    }

    _ASSERT (FIELD_IS_SET (gbq, Qual) && FIELD_IS_SET (gbq, Val));

    // --- rpt_unit family
    const bool is_legacy_rpt_unit =
        NStr::EqualNocase(GET_FIELD(gbq, Qual), "rpt_unit");
    if (is_legacy_rpt_unit  ||
        NStr::EqualNocase(GET_FIELD(gbq, Qual), "rpt_unit_range")  ||
        NStr::EqualNocase(GET_FIELD(gbq, Qual), "rpt_unit_seq"))
    {
        // tidies the value and tells us whether it looks like a range
        const bool value_is_range = x_CleanupRptUnit(gbq);

        // the plain "rpt_unit" name is replaced by the specific one
        if (is_legacy_rpt_unit) {
            if (value_is_range) {
                SET_FIELD( gbq, Qual, "rpt_unit_range" );
            } else {
                SET_FIELD( gbq, Qual, "rpt_unit_seq" );
            }
            ChangeMade(CCleanupChange::eChangeQualifiers);
        }

        // sequences are written with "t" instead of "u"
        if( NStr::EqualNocase(GET_FIELD(gbq, Qual), "rpt_unit_seq") ) {
            const string& current_val = GET_FIELD(gbq, Val);
            const string dna_val = NStr::Replace( current_val, "u", "t");
            if (dna_val != current_val) {
                SET_FIELD( gbq, Val, dna_val );
                ChangeMade(CCleanupChange::eChangeQualifiers);
            }
        }
    }

    // --- legacy mobile element qualifiers; the two helpers rename to
    // "mobile_element", which is then renamed once more below
    x_ChangeTransposonToMobileElement(gbq);
    x_ChangeInsertionSeqToMobileElement(gbq);

    if (NStr::EqualNocase(GET_FIELD(gbq, Qual), "mobile_element")) {
        SET_FIELD( gbq, Qual, "mobile_element_type" );
        ChangeMade(CCleanupChange::eChangeQualifiers);
    }
}

// Return the key of an Imp-feat if it is one of the recognized import
// feature keys (exact, case-sensitive match), or "???" otherwise.
//
// On success the returned pointer refers to the key string stored in imp.
static 
const char *s_FindImpFeatType( const CImp_feat &imp )
{
    static const char *kFeatBad = "???";

    // Looked up with binary_search below, so this table has to stay
    // sorted in ASCII-betical order.
    static const char *allowed_types[] = { 
        "-10_signal",     "-35_signal",   "3'UTR",          "3'clip",       "5'UTR",          
        "5'clip",         "CAAT_signal",  "CDS",            "C_region",     "D-loop",         
        "D_segment",      "GC_signal",    "Import",         "J_segment",    "LTR",            
        "N_region",       "RBS",          "STS",            "S_region",     "Site-ref",       
        "TATA_signal",    "V_region",     "V_segment",      "allele",       "attenuator",     
        "centromere",     "conflict",     "enhancer",       "exon",         "gap",            
        "iDNA",           "intron",       "mat_peptide",    "misc_RNA",     "misc_binding",   
        "misc_difference","misc_feature", "misc_recomb",    "misc_signal",  "misc_structure", 
        "mobile_element", "modified_base","mutation",       "old_sequence", "operon",         
        "oriT",           "polyA_signal", "polyA_site",     "precursor_RNA","prim_transcript",
        "primer_bind",    "promoter",     "protein_bind",   "rep_origin",   "repeat_region",  
        "repeat_unit",    "satellite",    "sig_peptide",    "source",       "stem_loop",      
        "telomere",       "terminator",   "transit_peptide","unsure",       "variation",      
        "virion"
    };
    static const int kAllowedTypesNumElems = ( sizeof(allowed_types) / sizeof(allowed_types[0]));

    // no key at all: cannot be a known type
    if( RAW_FIELD_IS_EMPTY_OR_UNSET(imp, Key) ) {
        return kFeatBad;
    }

    // the C logic is more complex than this
    const char *imp_key = GET_FIELD(imp, Key).c_str();
    const bool is_known_key =
        binary_search( allowed_types, allowed_types + kAllowedTypesNumElems,
                       imp_key, PCase_CStr() );

    return ( is_known_key ? imp_key : kFeatBad );
}

// Return the feature key name that corresponds to the data choice of feat
// (for Prot, RNA and Imp features the subtype is taken into account), or
// "???" if no key can be determined.
static 
const char *s_FindKeyFromFeatDefType( const CSeq_feat &feat )
{
    static const char *kFeatBad = "???";

    SWITCH_ON_SEQFEAT_CHOICE(feat) {

        // --- choices whose key depends on a subtype

        case NCBI_SEQFEAT(Prot):
            // without a "processed" value this is a plain protein
            if( ! feat.GetData().GetProt().CanGetProcessed() ) {
                return "Protein";
            }
            switch( feat.GetData().GetProt().GetProcessed() ) {
            case NCBI_PROTREF(not_set):
                return "Protein";
            case NCBI_PROTREF(preprotein):
                return "proprotein";
            case NCBI_PROTREF(mature):
                return "mat_peptide";
            case NCBI_PROTREF(signal_peptide):
                return "sig_peptide";
            case NCBI_PROTREF(transit_peptide):
                return "transit_peptide";
            default:
                return kFeatBad;
            }

        case NCBI_SEQFEAT(Rna):
            // an RNA without a type has no usable key
            if( ! feat.GetData().GetRna().IsSetType() ) {
                return kFeatBad;
            }
            switch ( feat.GetData().GetRna().GetType() )
            {
            case NCBI_RNAREF(unknown):  // unknownrna mapped to otherrna
            case NCBI_RNAREF(miscRNA):
                return "misc_RNA";
            case NCBI_RNAREF(premsg):
                return "precursor_RNA";
            case NCBI_RNAREF(mRNA):
                return "mRNA";
            case NCBI_RNAREF(tRNA):
                return "tRNA";
            case NCBI_RNAREF(rRNA):
                return "rRNA";
            case NCBI_RNAREF(snRNA):
                return "snRNA";
            case NCBI_RNAREF(scRNA):
                return "scRNA";
            case NCBI_RNAREF(snoRNA):
                return "snoRNA";
            case NCBI_RNAREF(ncRNA):
                return "ncRNA";
            case NCBI_RNAREF(tmRNA):
                return "tmRNA";
            case NCBI_RNAREF(other):
                // "other" RNAs may carry their real type in the ext name;
                // anything unrecognized (and "misc_RNA" itself) is misc_RNA
                if ( FIELD_IS_SET_AND_IS(feat.GetData().GetRna(), Ext, Name) ) { 
                    const string &ext_name = feat.GetData().GetRna().GetExt().GetName();
                    if ( NStr::EqualNocase(ext_name, "ncRNA") ) {
                        return "ncRNA";
                    }
                    if ( NStr::EqualNocase(ext_name, "tmRNA") ) {
                        return "tmRNA";
                    }
                }
                return "misc_RNA";
            default:
                return kFeatBad;
            }

        case NCBI_SEQFEAT(Imp):
            return s_FindImpFeatType( feat.GetData().GetImp() );

        // --- choices with a fixed key

        case NCBI_SEQFEAT(Gene):            return "Gene";
        case NCBI_SEQFEAT(Org):             return "Org";
        case NCBI_SEQFEAT(Cdregion):        return "CDS";
        case NCBI_SEQFEAT(Pub):             return "Cit";
        case NCBI_SEQFEAT(Seq):             return "Xref";
        case NCBI_SEQFEAT(Region):          return "Region";
        case NCBI_SEQFEAT(Comment):         return "Comment";
        case NCBI_SEQFEAT(Bond):            return "Bond";
        case NCBI_SEQFEAT(Site):            return "Site";
        case NCBI_SEQFEAT(Rsite):           return "Rsite";
        case NCBI_SEQFEAT(User):            return "User";
        case NCBI_SEQFEAT(Txinit):          return "TxInit";
        case NCBI_SEQFEAT(Num):             return "Num";
        case NCBI_SEQFEAT(Psec_str):        return "SecStr";
        case NCBI_SEQFEAT(Non_std_residue): return "NonStdRes";
        case NCBI_SEQFEAT(Het):             return "Het";
        case NCBI_SEQFEAT(Biosrc):          return "Src";
        case NCBI_SEQFEAT(Clone):           return "CloneRef";
        case NCBI_SEQFEAT(Variation):       return "VariationRef";

        default:
            return kFeatBad;
    }
    return kFeatBad;
}

static
bool s_IsAllDigits( const string &str )
{
    CCachedRegexp all_digits_regex = regexpCache.Get("^[0-9]+$");
    return all_digits_regex->IsMatch(str);
}

// Basic cleanup of one GenBank qualifier in the context of the feature that
// carries it.
//
// Depending on the qualifier name, its information is moved into the proper
// field of the feature (partial, pseudo, exception, comment, dbxref, gene
// xref, ...), the value is normalized, or the qualifier is recognized as
// obsolete.  A few feature-level fixes that do not depend on the qualifier
// name are applied as well (explicit "false" Pseudo/Partial are reset;
// "conflict" import features become "misc_difference").
//
// gb_qual - the qualifier to examine; its name and value may be modified
// feat    - the feature owning the qualifier; may be modified
//
// Returns eAction_Erase if the caller should delete the qualifier,
// eAction_Nothing if it should be kept.
CNewCleanup_imp::EAction CNewCleanup_imp::GBQualSeqFeatBC(CGb_qual& gb_qual, CSeq_feat& feat)
{
    if( ! FIELD_IS_SET(feat, Data) ) {
        return eAction_Nothing;
    }
    CSeqFeatData &data = GET_MUTABLE(feat, Data);

    string& qual = GET_MUTABLE(gb_qual, Qual);
    string& val  = GET_MUTABLE(gb_qual, Val);

    // an explicit "false" carries no information: unset the flag instead
    if( FIELD_EQUALS(feat, Pseudo, false) ) {
        RESET_FIELD(feat, Pseudo);
        ChangeMade (CCleanupChange::eChangeQualifiers);
    }

    if( FIELD_EQUALS(feat, Partial, false) ) {
        RESET_FIELD(feat, Partial);
        ChangeMade (CCleanupChange::eChangeQualifiers);
    }

    // Dispatch on the qualifier name.  The order of the branches matters:
    // the generic names are tested first, then the data-type specific
    // handlers get a chance (they are called from within the conditions),
    // and only then the remaining generic names.
    if (NStr::EqualNocase(qual, "cons_splice")) {
        return eAction_Erase;
    } else if (NStr::EqualNocase(qual, "replace")) {
        if ( FIELD_IS(data, Imp)  &&
             STRING_FIELD_MATCH_BUT_ONLY_CASE_INSENSITIVE( data.GetImp(), Key, "variation") )
        {
                NStr::ToLower(val);
                ChangeMade(CCleanupChange::eChangeQualifiers);
        }
        // a pure nucleotide value is written in lower case, with t for u
        if ( ! NStr::IsBlank(val) &&  val.find_first_not_of("ACGTUacgtu") == NPOS) {
            const string original_val = val;
            NStr::ToLower(val);
            NStr::ReplaceInPlace(val, "u", "t");
            if (original_val != val) {
                ChangeMade(CCleanupChange::eChangeQualifiers);
            }
        }
    } else if (NStr::EqualNocase(qual, "partial")) {
        feat.SetPartial(true);
        ChangeMade(CCleanupChange::eChangeQualifiers);
        return eAction_Erase;  // mark qual for deletion
    } else if (NStr::EqualNocase(qual, "evidence")) {
        return eAction_Erase;  // mark qual for deletion
    } else if (NStr::EqualNocase(qual, "exception")) {
        if( ! FIELD_EQUALS(feat, Except, true ) ) {
            SET_FIELD(feat, Except, true);
            ChangeMade(CCleanupChange::eChangeQualifiers);
        }
        // a meaningful value becomes the exception text, unless the
        // feature already has one
        if (!NStr::IsBlank(val)  &&  !NStr::EqualNocase(val, "true")) {
            if (!feat.IsSetExcept_text()) {
                feat.SetExcept_text(val);
                ChangeMade(CCleanupChange::eChangeQualifiers);
            }
        }
        return eAction_Erase;  // mark qual for deletion
    } else if (NStr::EqualNocase(qual, "experiment")) {
        if (NStr::EqualNocase(val, "experimental evidence, no additional details recorded")) {
            ChangeMade(CCleanupChange::eChangeQualifiers);
            return eAction_Erase;  // mark qual for deletion
        }
    } else if (NStr::EqualNocase(qual, "inference")) {
        if (NStr::EqualNocase(val, "non-experimental evidence, no additional details recorded")) {
            ChangeMade(CCleanupChange::eChangeQualifiers);
            return eAction_Erase;  // mark qual for deletion
        } else {
            x_CleanupAndRepairInference(val);
        }
    } else if (NStr::EqualNocase(qual, "note")  ||
               NStr::EqualNocase(qual, "notes")  ||
               NStr::EqualNocase(qual, "comment")) {
        // the value is moved into (or appended to) the feature comment
        if (!feat.IsSetComment()) {
            feat.SetComment(val);
        } else {
            (feat.SetComment() += "; ") += val;
        }
        ChangeMade(CCleanupChange::eChangeComment);
        ChangeMade(CCleanupChange::eChangeQualifiers);
        return eAction_Erase;  // mark qual for deletion
    } else if( NStr::EqualNocase(qual, "label") ) {
        if ( NStr::EqualNocase(val, s_FindKeyFromFeatDefType(feat)) ) {
            // skip label that is simply the feature key
        } else if ( ! FIELD_IS_SET(feat, Comment) || NStr::FindNoCase(GET_FIELD(feat, Comment), "label") == NPOS) {
            // if label is not already in comment, append
            if( GET_STRING_FLD_OR_BLANK(feat, Comment).empty() ) {
                SET_FIELD(feat, Comment, "label: " + val );
            } else {
                GET_MUTABLE(feat, Comment) += "; label: " + val;
            }
            ChangeMade(CCleanupChange::eChangeComment);
        }
        return eAction_Erase;
    } else if (NStr::EqualNocase(qual, "db_xref")) {
        // "db:tag" becomes a real dbxref; a value without ':' is left
        // alone
        string tag, db;
        if (NStr::SplitInTwo(val, ":", db, tag)) {
            CRef<CDbtag> dbp(new CDbtag);
            dbp->SetDb(db);
            dbp->SetTag().SetStr(tag);
            feat.SetDbxref().push_back(dbp);
            ChangeMade(CCleanupChange::eChangeDbxrefs);
            return eAction_Erase;  // mark qual for deletion
        }
    } else if (NStr::EqualNocase(qual, "gdb_xref")) {
        CRef<CDbtag> dbp(new CDbtag);
        dbp->SetDb("GDB");
        dbp->SetTag().SetStr(val);
        feat.SetDbxref().push_back(dbp);
        ChangeMade(CCleanupChange::eChangeDbxrefs);
        return eAction_Erase;  // mark qual for deletion
    } else if ( NStr::EqualNocase(qual, "pseudo") ) {
        feat.SetPseudo(true);
        ChangeMade(CCleanupChange::eChangeQualifiers);
        return eAction_Erase;  // mark qual for deletion
    } else if ( NStr::EqualNocase(qual, "pseudogene") )
    {
        // unlike "pseudo", this qualifier is kept (no erase here)
        if( ! FIELD_EQUALS(feat, Pseudo, true) ) {
            feat.SetPseudo(true);
            ChangeMade(CCleanupChange::eChangeQualifiers);
        }

        // lowercase pseudogene qual
        string new_val = val;
        NStr::ToLower(new_val);
        if( new_val != val ) {
            val = new_val;
            ChangeMade(CCleanupChange::eChangeQualifiers);
        }
    // Data-type specific handlers.  Each one runs as part of its condition;
    // when it does not ask for erasure, the chain continues with the next
    // branch.
    } else if ( FIELD_IS(data, Gene)  &&  x_GeneGBQualBC( GET_MUTABLE(data, Gene), gb_qual) == eAction_Erase) {
        return eAction_Erase;  // mark qual for deletion
    } else if ( FIELD_IS(data, Cdregion)  &&  x_SeqFeatCDSGBQualBC(feat, GET_MUTABLE(data, Cdregion), gb_qual) == eAction_Erase ) {
        return eAction_Erase;  // mark qual for deletion
    } else if (data.IsRna()  &&  x_SeqFeatRnaGBQualBC(feat, data.SetRna(), gb_qual) == eAction_Erase) {
        return eAction_Erase;  // mark qual for deletion
    } else if (data.IsProt()  &&  x_ProtGBQualBC(data.SetProt(), gb_qual, eGBQualOpt_normal) == eAction_Erase) {
        return eAction_Erase;  // mark qual for deletion
    } else if (NStr::EqualNocase(qual, "gene")) {
        // a non-blank gene qualifier becomes a gene xref
        if (!NStr::IsBlank(val)) {
            CRef<CSeqFeatXref> xref(new CSeqFeatXref);
            xref->SetData().SetGene().SetLocus(val);
            feat.SetXref().push_back(xref);
            ChangeMade(CCleanupChange::eCopyGeneXref);
            return eAction_Erase;  // mark qual for deletion
        }
    } else if (NStr::EqualNocase(qual, "codon_start")) {
        if (!data.IsCdregion()) {
            // not legal on anything but CDS, so remove it
            return eAction_Erase;  // mark qual for deletion
        }
    } else if ( NStr::EqualNocase(qual, "EC_number") ) {
        x_CleanupECNumber(val);
    } else if( qual == "satellite" ) {
        // NOTE(review): this is the only name in the chain that is compared
        // case-sensitively -- confirm that this is intended
        x_MendSatelliteQualifier( val );
    }

    // The checks below are applied to every qualifier that was not erased
    // above.

    if( NStr::EqualNocase( qual, "mobile_element_type" ) ) {
        // trim spaces around first colon but only if there are no colons
        // with spaces before and after
        if( NPOS != NStr::Find(val, " :") || NPOS != NStr::Find(val, ": ") ) {
            if( s_RegexpReplace( val, "[ ]*:[ ]*", ":", 1 ) ) {
                ChangeMade(CCleanupChange::eCleanQualifiers);
            }
        }

        // a repeat_region carrying a non-empty mobile_element_type is
        // turned into a mobile_element feature
        if( data.IsImp() && STRING_FIELD_MATCH( data.GetImp(), Key, "repeat_region" ) && ! val.empty() ) {
            qual = "mobile_element_type";
            data.SetImp().SetKey( "mobile_element" );
            ChangeMade(CCleanupChange::eCleanQualifiers);
        }
    }

    // estimated_length must be a number or "unknown"
    if( NStr::EqualNocase( qual, "estimated_length" ) ) {
        if( ! s_IsAllDigits(val) && ! NStr::EqualNocase(val, "unknown") ) {
            val = "unknown";
            ChangeMade(CCleanupChange::eCleanQualifiers);
        }
    }

    // conflict is obsolete.  Make it misc_difference, but add a note 
    // to the feature comment as to what it used to be.
    if( data.IsImp() && STRING_FIELD_MATCH( data.GetImp(), Key, "conflict" ) ) {
        data.SetImp().SetKey( "misc_difference");
        if( feat.IsSetComment() ) {
            GET_MUTABLE(feat, Comment) = "conflict; " + GET_FIELD(feat, Comment);
        } else {
            SET_FIELD(feat, Comment, "conflict");
        }
        ChangeMade(CCleanupChange::eCleanQualifiers);
    }

    // a qualifier with neither name nor value is useless
    if( qual.empty() && val.empty() ) {
        return eAction_Erase;
    }

    return eAction_Nothing;
}

// Normalize the value of a cons_splice qualifier by putting a space after
// the comma in "(5'site:...,3'site:...)".  Values that do not start with
// "(5'site:" are left untouched.
//
// Currently unused, because cons_splice qualifiers are erased outright;
// kept in case that decision is ever reversed.
void CNewCleanup_imp::x_CleanupConsSplice(CGb_qual& gbq)

{
    string& splice_val = GET_MUTABLE(gbq, Val);

    if (NStr::StartsWith(splice_val, "(5'site:")) {
        const size_t comma_pos = splice_val.find(",3'site:");
        if (comma_pos != NPOS) {
            // the blank goes right behind the comma
            splice_val.insert(comma_pos + 1, " ");
            ChangeMade(CCleanupChange::eChangeQualifiers);
        }
    }
}

// Return true if val contains at least one upper-case character.
//
// Each character is converted to unsigned char before being handed to
// isupper(): passing a negative plain char (any byte >= 0x80 where char is
// signed) is undefined behavior.
static
bool s_HasUpper (const string &val)
{
    for( string::const_iterator str_itr = val.begin();
         str_itr != val.end();  ++str_itr ) {
        if( isupper(static_cast<unsigned char>(*str_itr)) ) {
            return true;
        }
    }
    return false;
}

// Clean up the value of an rpt_unit / rpt_unit_range / rpt_unit_seq
// qualifier.
//
// A value made up only of digits, range separators ('.' or '-'), blanks,
// parentheses and commas is rewritten in compact form: blanks are removed
// and every run of '.'/'-' becomes "..".  Any other value is lower-cased
// if it contains upper-case letters and is otherwise left alone.
//
// Returns true if the val indicates this is a range qualifier
// "[0-9]+..[0-9]+", false otherwise.

bool CNewCleanup_imp::x_CleanupRptUnit(CGb_qual& gbq)
{
    CGb_qual::TVal& val = GET_MUTABLE(gbq, Val);
    
    if (NStr::IsBlank(val)) {
        return false;
    }
    // Quick exit: contains something that is neither a nucleotide letter,
    // a digit nor a parenthesis, and has upper case to fix.
    if( string::npos != val.find_first_not_of("ACGTUNacgtun0123456789()") ) {
        if (s_HasUpper(val)) {
            val = NStr::ToLower(val);
            ChangeMade(CCleanupChange::eChangeQualifiers);
            return false;
        }
    } 
    // Scan the value as a sequence of "<digits> <separator> <digits>"
    // groups, building the compact form in cleaned_val as we go.
    // The three flags record whether each part has been seen at least once
    // anywhere in the value.
    bool digits1 = false, sep = false, digits2 = false;
    string cleaned_val;
    string::const_iterator it = val.begin();
    string::const_iterator end = val.end();
    while (it != end) {
        // grouping punctuation is copied through
        while (it != end  &&  (*it == '('  ||  *it == ')'  ||  *it == ',')) {
            cleaned_val += *it++;
        }
        // blanks are dropped
        while (it != end  &&  isspace((unsigned char)(*it))) {
            ++it;
        }
        // first number
        while (it != end  &&  isdigit((unsigned char)(*it))) {
            cleaned_val += *it++;
            digits1 = true;
        }
        // any run of '.' and '-' is normalized to ".."
        if (it != end  &&  (*it == '.'  ||  *it == '-')) {
            while (it != end  &&  (*it == '.'  ||  *it == '-')) {
                ++it;
            }
            cleaned_val += "..";
            sep = true;
        }
        while (it != end  &&  isspace((unsigned char)(*it))) {
            ++it;
        }
        // second number
        while (it != end  &&  isdigit((unsigned char)(*it))) {
            cleaned_val += *it++;
            digits2 = true;
        }
        while (it != end  &&  isspace((unsigned char)(*it))) {
            ++it;
        }
        // Whatever comes next must be something the next round of the loop
        // can consume; if not, this is not a range-style value: give up,
        // discarding cleaned_val (only the case of val is fixed).
        if (it != end) {
            char c = *it;
            if (c != '('  &&  c != ')'  &&  c != ','  &&  c != '.'  &&
                !isspace((unsigned char) c)  &&  !isdigit((unsigned char) c)) {
                if (s_HasUpper(val)) {
                    val = NStr::ToLower(val);
                    ChangeMade(CCleanupChange::eChangeQualifiers);
                }
                return false;
            }
        }
    }
    // the whole value was consumed: store the compact form
    if (val != cleaned_val) {
        val = cleaned_val;
        ChangeMade(CCleanupChange::eChangeQualifiers);
    }
    
    return  (digits1 && sep && digits2);
}

void CNewCleanup_imp::x_ChangeTransposonToMobileElement(CGb_qual& gbq)
//
//  "transposon" has not been a legal qualifier since Dec 2006; it was
//  superseded by "mobile_element".  This renames such a qualifier and
//  rewrites its value so that it states the kind of element:
//    - one of the known integron class names  ->  "integron: class XXX"
//    - anything else                          ->  "transposon: <old value>"
//  Qualifiers with any other name are not touched.
//
{
    // the values (exact match) that denote an integron
    static const string integronValues[] = {
        "class I integron",
        "class II integron",
        "class III integron",
        "class 1 integron",
        "class 2 integron",
        "class 3 integron"
    };
    static const size_t kNumIntegronValues
        = sizeof(integronValues)/sizeof(integronValues[0]);

    if ( ! NStr::EqualNocase( GET_FIELD(gbq, Qual), "transposon") ) {
        return;
    }

    SET_FIELD( gbq, Qual, "mobile_element");

    // is the current value one of the integron names?
    const string& old_val = GET_FIELD(gbq, Val);
    const string* integron_match = NULL;
    for ( size_t idx = 0;  idx < kNumIntegronValues;  ++idx ) {
        if ( integronValues[idx] == old_val ) {
            integron_match = &integronValues[idx];
            break;
        }
    }

    if ( integron_match != NULL ) {
        // keep only the "class XXX" part in front of " integron"
        const string::size_type class_len = integron_match->find( " integron" );
        _ASSERT( class_len != string::npos ); // typo in IntegronValues?
        SET_FIELD( gbq, Val, string("integron: ") + integron_match->substr(0, class_len) );
    } else {
        // everything else is a plain transposon
        SET_FIELD( gbq, Val, string("transposon: ") + old_val );
    }

    ChangeMade(CCleanupChange::eChangeQualifiers);
}

void CNewCleanup_imp::x_ChangeInsertionSeqToMobileElement(CGb_qual& gbq)
//
//  As of Dec 2006, "insertion_seq" is no longer legal as a qualifier; its
//  replacement is "mobile_element".
//
//  If the qualifier name is "insertion_seq" (compared case-insensitively) it
//  is renamed and its value is prefixed with "insertion sequence:".
//  Qualifiers with any other name are left alone and no change is recorded.
//
//  NOTE(review): unlike the "transposon: " prefix used for transposons, this
//  prefix has no space after the colon -- confirm that is intended.
//
{
    if ( ! NStr::EqualNocase( GET_FIELD(gbq, Qual), "insertion_seq") ) {
        return;
    }

    SET_FIELD( gbq, Qual, "mobile_element" );

    const string prefixed_value = "insertion sequence:" + GET_FIELD(gbq, Val);
    SET_FIELD( gbq, Val, prefixed_value );

    ChangeMade(CCleanupChange::eChangeQualifiers);
}

static bool s_IsCompoundRptTypeValue( 
    const std::string& value )
//
//  Format of compound rpt_type values: (value[,value]*)
//
//  These are internal to sequin and are in theory cleaned up before the material
//  is released. However, some compound values have escaped into the wild and have 
//  not been retro-fixed yet (as of 2006-03-17).
//
//  Returns true when `value` is at least three characters long and is wrapped
//  either in parentheses "(...)" or in curly braces "{...}".  Mixed brackets
//  such as "(...}" do not count.
//
{
    // need an opening bracket, a closing bracket and at least one character
    // in between
    if( value.length() < 3 ) {
        return false;
    }

    const char first = value[0];
    const char last  = value[value.length() - 1];

    return ( first == '(' && last == ')' ) ||
           ( first == '{' && last == '}' );
}

static
void s_ExpandThisQual( 
    CSeq_feat::TQual& quals,        // the list of CGb_qual's (not used here).
    CSeq_feat::TQual::iterator& it, // points to the one qual we might expand.
    CSeq_feat::TQual& new_quals )    // receives the quals that must be added
//
//  Splits a qualifier whose value is a bracketed, comma-separated list
//  (see s_IsCompoundRptTypeValue) into one qualifier per list element.
//
//  Such list values do not conform to the ASN.1 and need to be cleaned up;
//  every occurrence has to be kept.  The qualifier at `it` keeps the first
//  element as its value.  For every further element a new qualifier with the
//  same name is appended to `new_quals`; the caller is responsible for adding
//  those to the feature.
//
//  If the value is not a compound value, nothing is modified.
//
{
    CGb_qual& first_qual = **it;
    const string qual_name = first_qual.GetQual();
    string& compound_value = first_qual.SetVal();

    if ( ! s_IsCompoundRptTypeValue( compound_value ) ) {
        return;
    }

    // strip the enclosing brackets and split on commas
    const string inner = compound_value.substr(1, compound_value.length() - 2);
    vector< string > pieces;
    NStr::Tokenize(inner, ",", pieces);

    // the original qualifier keeps the first piece ...
    first_qual.SetVal( pieces[0] );

    // ... and each remaining piece gets a qualifier of its own
    for ( vector< string >::const_iterator piece = pieces.begin() + 1;
          piece != pieces.end();  ++piece ) 
    {
        CRef< CGb_qual > extra_qual( new CGb_qual() );
        extra_qual->SetQual( qual_name );
        extra_qual->SetVal( *piece );
        new_quals.push_back( extra_qual );
    }
}

// Normalizes bracketed qualifier values on a feature:
//   1. any value of the form "{...}" has its braces turned into parentheses;
//   2. for the qualifier names listed below (matched case-insensitively), a
//      compound value "(a,b,c)" is split into separate qualifiers, one per
//      element.
// The extra qualifiers are collected first and appended after the loop, so
// the list is not modified while it is being iterated.
void CNewCleanup_imp::x_ExpandCombinedQuals(CSeq_feat::TQual& quals)
{
    // qualifiers whose compound values are expanded
    static const char* const kExpandableQuals[] = {
        "rpt_type",
        "rpt_unit",
        "rpt_unit_range",
        "rpt_unit_seq",
        "usedin",
        "old_locus_tag",
        "compare",
        "replace"
    };
    static const size_t kNumExpandableQuals =
        sizeof(kExpandableQuals) / sizeof(kExpandableQuals[0]);

    CSeq_feat::TQual extra_quals;
    NON_CONST_ITERATE (CSeq_feat::TQual, qual_it, quals) {
        CGb_qual& gbq = **qual_it;

        string& name  = GET_MUTABLE(gbq, Qual);
        string& value = GET_MUTABLE(gbq, Val);

        // convert curly braces to parens
        if( value.length() > 1  &&
            *value.begin() == '{'  &&  *value.rbegin() == '}' ) 
        {
            *value.begin()  = '(';
            *value.rbegin() = ')';
            ChangeMade(CCleanupChange::eCleanQualifiers);
        }

        for( size_t idx = 0; idx < kNumExpandableQuals; ++idx ) {
            if( NStr::EqualNocase(name, kExpandableQuals[idx]) ) {
                s_ExpandThisQual( quals, qual_it, extra_quals );
                break;
            }
        }
    }

    if ( extra_quals.empty() ) {
        return;
    }
    quals.insert(quals.end(), extra_quals.begin(), extra_quals.end());
    ChangeMade(CCleanupChange::eChangeQualifiers);
}

// Tries to move the information of one GenBank qualifier into a Gene-ref.
//
// Qualifier names are matched case-insensitively:
//   map          -> Maploc, only if Maploc is not set yet
//   allele       -> Allele if unset; if already set, the qualifier is merely
//                   reported as erasable when it repeats the existing allele
//                   (case-insensitive) and no change is recorded
//   locus_tag    -> Locus_tag, only if not set yet
//   gene_synonym -> always appended to Syn
//   gene         -> Locus if unset; otherwise added to Syn unless it equals
//                   the locus or is already a synonym
//
// Blank values are ignored.  Returns eAction_Erase when the qualifier has
// been absorbed (or is redundant), eAction_Nothing otherwise.
CNewCleanup_imp::EAction 
CNewCleanup_imp::x_GeneGBQualBC( CGene_ref& gene, const CGb_qual& gb_qual )
{
    const string& name  = GET_FIELD(gb_qual, Qual);
    const string& value = GET_FIELD(gb_qual, Val);

    if( NStr::IsBlank(value) ) {
        return eAction_Nothing;
    }

    bool absorbed = false;

    if (NStr::EqualNocase(name, "map")) {
        if ( ! gene.IsSetMaploc() ) {
            gene.SetMaploc(value);
            absorbed = true;
        }
    } else if (NStr::EqualNocase(name, "allele")) {
        if ( gene.IsSetAllele() ) {
            // existing allele wins; a duplicate qualifier can simply go
            if ( NStr::EqualNocase(value, gene.GetAllele()) ) {
                return eAction_Erase;
            }
            return eAction_Nothing;
        }
        gene.SetAllele(value);
        absorbed = true;
    } else if (NStr::EqualNocase(name, "locus_tag")) {
        if ( ! gene.IsSetLocus_tag() ) {
            gene.SetLocus_tag(value);
            absorbed = true;
        }
    } else if (NStr::EqualNocase(name, "gene_synonym")) {
        gene.SetSyn().push_back(value);
        absorbed = true;
    } else if (NStr::EqualNocase(name, "gene") ) {
        // the qualifier is always consumed, even if it adds nothing new
        absorbed = true;
        if ( ! gene.IsSetLocus() ) {
            gene.SetLocus(value);
        } else if (gene.GetLocus() != value) {
            const CGene_ref::TSyn& synonyms = gene.GetSyn();
            const bool already_known =
                ( find(synonyms.begin(), synonyms.end(), value) != synonyms.end() );
            if ( ! already_known ) {
                gene.SetSyn().push_back(value);
            }
        }
    }

    if ( ! absorbed ) {
        return eAction_Nothing;
    }

    ChangeMade(CCleanupChange::eChangeQualifiers);
    return eAction_Erase;
}

// Handles one GenBank qualifier found on a coding-region feature.
//
// Depending on the qualifier name, the value is
//   - parsed into a code-break              (transl_except),
//   - folded into the Cdregion frame        (codon_start),
//   - folded into / compared with the
//     Cdregion genetic code                 (transl_table),
//   - forwarded to the product's protein
//     feature or to a protein xref          (product, function, EC_number,
//                                            prot_note),
//   - or simply reported as erasable        (translation).
//
// Returns eAction_Erase when the qualifier has been absorbed or is redundant
// (presumably telling the caller to drop it), eAction_Nothing otherwise.
CNewCleanup_imp::EAction
CNewCleanup_imp::x_SeqFeatCDSGBQualBC(CSeq_feat& feat, CCdregion& cds, const CGb_qual& gb_qual)
{
    const string& qual = gb_qual.GetQual();
    const string& val  = gb_qual.GetVal();
    
    // transl_except qual -> Cdregion.code_break
    if (NStr::EqualNocase(qual, "transl_except")) {
        return x_ParseCodeBreak(feat, cds, val);
    }

    // codon_start qual -> Cdregion.frame
    if (NStr::EqualNocase(qual, "codon_start")) {
        CCdregion::TFrame frame = GET_FIELD(cds, Frame);
        CCdregion::TFrame new_frame = CCdregion::TFrame(NStr::StringToNonNegativeInt(val));
        // only a value that names frame one, two or three is acted upon
        if (new_frame == CCdregion::eFrame_one  ||
            new_frame == CCdregion::eFrame_two  ||
            new_frame == CCdregion::eFrame_three) {
            // the existing frame is overwritten only when it is not set, or
            // when the feature is pseudo and has no product
            if (frame == CCdregion::eFrame_not_set  ||
                ( FIELD_EQUALS( feat, Pseudo, true ) && ! FIELD_IS_SET(feat, Product) )) {
                cds.SetFrame(new_frame);
                ChangeMade(CCleanupChange::eChangeQualifiers);
            }
            // the qualifier goes away even if the frame was left as it was
            return eAction_Erase;
        }
    }

    // transl_table qual -> Cdregion.code
    if (NStr::EqualNocase(qual, "transl_table")) {
        if ( FIELD_IS_SET(cds, Code) ) {
            // a genetic code is already present: take its first non-zero id
            // (falling back to 1) and erase the qualifier only if it agrees
            const CCdregion::TCode& code = GET_FIELD(cds, Code);
            int transl_table = 1;
            ITERATE (CCdregion::TCode::Tdata, it, code.Get()) {
                if ( FIELD_IS(**it, Id)  &&  GET_FIELD(**it, Id) != 0) {
                    transl_table = GET_FIELD(**it, Id);
                    break;
                }
            }
            
            if (NStr::EqualNocase(NStr::UIntToString(transl_table), val)) {
                return eAction_Erase;
            }
        } else {
            // no genetic code yet: create one from the qualifier value
            int new_val = NStr::StringToNonNegativeInt(val);
            if (new_val > 0) {
                CRef<CGenetic_code::C_E> gc(new CGenetic_code::C_E);
                SET_FIELD(*gc, Id, new_val);
                cds.SetCode().Set().push_back(gc);
                
                // we don't have to check except-text because we're 
                // setting an unset genetic_code, not changing an existing one
                // (the except-text would be: "genetic code exception")
                ChangeMade(CCleanupChange::eChangeGeneticCode);
                return eAction_Erase;
            }
        }
    }

    // look for qualifiers that should be applied to protein feature
    // note - this should be moved to the "indexed" portion of basic cleanup,
    // because it needs to locate another sequence and feature
    // (unlike the checks above, these names are compared case-sensitively)
    if (NStr::Equal(qual, "product") || NStr::Equal (qual, "function") || NStr::Equal (qual, "EC_number")
        || NStr::Equal (qual, "prot_note"))  
    {
        // get protein sequence for product
        CRef<CSeq_feat> prot_feat;
        CRef<CProt_ref> prot_ref;
        CFeat_CI feat_ci;
        // try to get existing prot_feat
        CBioseq_Handle prot_handle;
        if ( FIELD_IS_SET(feat, Product) ) {
            const CSeq_id *prod_seq_id = feat.GetProduct().GetId();
            if( prod_seq_id != NULL ) {
                prot_handle = m_Scope->GetBioseqHandle(*prod_seq_id);
            }
        }
        if( prot_handle ) {
            // find main protein feature
            SAnnotSelector sel(CSeqFeatData::eSubtype_prot);
            feat_ci = CFeat_CI(prot_handle, sel);
            if( feat_ci ) {
                // work on a copy; it replaces the original feature further down
                prot_feat.Reset( new CSeq_feat );
                prot_feat->Assign(feat_ci->GetOriginalFeature());
                prot_ref.Reset( &prot_feat->SetData().SetProt() );
            }
        }
        bool push_back_xref_on_success = false;
        CRef<CSeqFeatXref> xref;
        if ( ! prot_ref ) {
            // otherwise make cross reference
            prot_ref.Reset( new CProt_ref );

            // see if this seq-feat already has a prot xref
            // (there is no break, so the last protein xref found is used)
            EDIT_EACH_SEQFEATXREF_ON_SEQFEAT( xref_iter, feat ) {
                if( (*xref_iter)->IsSetData() && (*xref_iter)->GetData().IsProt() ) {
                    xref = *xref_iter;
                }
            }
            // seq-feat has no prot xref. We make our own.
            if ( ! xref ) {
                xref.Reset( new CSeqFeatXref );
                xref->SetData().SetProt( *prot_ref );
                // we will push the xref onto the feat if the add was successful
                push_back_xref_on_success = true;
            }
            // from here on prot_ref refers to the Prot-ref held by the xref
            prot_ref.Reset( &xref->SetData().SetProt() );
        }

        // replacement prot feature
        EAction action = eAction_Nothing;

        if (NStr::Equal(qual, "prot_note") ) {
            // prot_note is only applied when a real protein feature exists:
            // the value becomes, or is appended to, that feature's comment
            if( prot_feat ) {
                if (!prot_feat->IsSetComment() || NStr::IsBlank (prot_feat->GetComment())) {
                    SET_FIELD( *prot_feat, Comment, val);
                } else {
                    SET_FIELD( *prot_feat, Comment, (prot_feat->GetComment() + "; " + val) );
                }
                ChangeMade (CCleanupChange::eChangeComment);
                action = eAction_Erase;
            }
        } else {
            action = x_ProtGBQualBC( *prot_ref, gb_qual, eGBQualOpt_CDSMode );
        }

        // write the (possibly modified) copy back over the original protein feature
        if( feat_ci && prot_feat ) {
            CSeq_feat_EditHandle edit_feature_handle(feat_ci->GetSeq_feat_Handle());
            edit_feature_handle.Replace(*prot_feat);
            ChangeMade(CCleanupChange::eCleanSeqFeatXrefs);
        }
        // NOTE(review): despite the variable's name, the new xref is pushed
        // whatever `action` turned out to be (e.g. also for a prot_note that
        // could not be applied) -- confirm this is intended.
        if( push_back_xref_on_success ) {
            feat.SetXref().push_back( xref );
            ChangeMade(CCleanupChange::eCleanSeqFeatXrefs);
        }

        return action;
    }

    // a translation qualifier is always reported as erasable
    if (NStr::EqualNocase(qual, "translation")) {
        return eAction_Erase;
    }

    return eAction_Nothing;
}

// One entry of the amino acid name table: name -> one-letter code (as int).
typedef SStaticPair<const char *, int> TTrnaKey;

// Amino acid names accepted when parsing tRNA text: three-letter
// abbreviations, full names and a few special spellings ("fMet", "OTHER",
// "TERM", "Asp or Asn", ...), each mapped to its one-letter code.
// '*' stands for termination, 'X' for other/unknown.
//
// NOTE(review): the entries are listed in case-insensitive alphabetical
// order, which the static array map built from this table presumably
// requires -- keep new entries in order.
static const TTrnaKey trna_key_to_subtype [] = {
    {  "Ala",            'A'  },
    {  "Alanine",        'A'  },
    {  "Arg",            'R'  },
    {  "Arginine",       'R'  },
    {  "Asn",            'N'  },
    {  "Asp",            'D'  },
    {  "Asp or Asn",     'B'  },
    {  "Asparagine",     'N'  },
    {  "Aspartate",      'D'  },
    {  "Aspartic Acid",  'D'  },
    {  "Asx",            'B'  },
    {  "Cys",            'C'  },
    {  "Cysteine",       'C'  },
    {  "fMet",           'M'  },
    {  "Gln",            'Q'  },
    {  "Glu",            'E'  },
    {  "Glu or Gln",     'Z'  },
    {  "Glutamate",      'E'  },
    {  "Glutamic Acid",  'E'  },
    {  "Glutamine",      'Q'  },
    {  "Glx",            'Z'  },
    {  "Gly",            'G'  },
    {  "Glycine",        'G'  },
    {  "His",            'H'  },
    {  "Histidine",      'H'  },
    {  "Ile",            'I'  },
    {  "Isoleucine",     'I'  },
    {  "Leu",            'L'  },
    {  "Leu or Ile",     'J'  },
    {  "Leucine",        'L'  },
    {  "Lys",            'K'  },
    {  "Lysine",         'K'  },
    {  "Met",            'M'  },
    {  "Methionine",     'M'  },
    {  "OTHER",          'X'  },
    {  "Phe",            'F'  },
    {  "Phenylalanine",  'F'  },
    {  "Pro",            'P'  },
    {  "Proline",        'P'  },
    {  "Pyl",            'O'  },
    {  "Pyrrolysine",    'O'  },
    {  "Sec",            'U'  },
    {  "Selenocysteine", 'U'  },
    {  "Ser",            'S'  },
    {  "Serine",         'S'  },
    {  "Ter",            '*'  },
    {  "TERM",           '*'  },
    {  "Termination",    '*'  },
    {  "Thr",            'T'  },
    {  "Threonine",      'T'  },
    {  "Trp",            'W'  },
    {  "Tryptophan",     'W'  },
    {  "Tyr",            'Y'  },
    {  "Tyrosine",       'Y'  },
    {  "Val",            'V'  },
    {  "Valine",         'V'  },
    {  "Xle",            'J'  },
    {  "Xxx",            'X'  }
};

// Lookup map over the table above, keyed by amino acid name using the
// PNocase_CStr comparator (case-insensitive, judging by its name).
typedef CStaticPairArrayMap <const char*, int, PNocase_CStr> TTrnaMap;
DEFINE_STATIC_ARRAY_MAP(TTrnaMap, sm_TrnaKeys, trna_key_to_subtype);

// Reverse lookup for sm_TrnaKeys: one-letter amino acid code -> name.
// Several names share a code, hence a multimap.  Codes are ordered with the
// PNocase_LessChar comparator.
class CAminoAcidCharToSymbol : public multimap<char, const char*, PNocase_LessChar> 
{
public:
    // Inserts (code, name) for each of the first num_keys entries of keys.
    CAminoAcidCharToSymbol( const TTrnaKey keys[], int num_keys )
    {
        for( int idx = 0; idx < num_keys; ++idx ) {
            const TTrnaKey& entry = keys[idx];
            insert(value_type( entry.second, entry.first ));
        }
    }
};

// the inverse of sm_TrnaKeys, built from the same table
const static CAminoAcidCharToSymbol sm_TrnaInverseKeys
    ( trna_key_to_subtype, 
      (sizeof(trna_key_to_subtype) / sizeof(trna_key_to_subtype[0])) );

// Parses an anticodon qualifier value of the form "(pos:<location>[,aa:<name>])".
//
// Returns a null reference when the string is blank, does not start with
// "(pos:", or has no matching closing parenthesis.  Otherwise returns a new
// CTrna_ext which
//   - is empty if an "aa:" part is present but its amino acid name is unknown;
//   - has Aa (iupacaa) and Anticodon set when both could be parsed;
//   - has only Anticodon set when there is no "aa:" part;
//   - is empty (Aa reset) when the location could not be read.
// The anticodon location is always put on the plus strand.
static CRef<CTrna_ext> s_ParseTRnaFromAnticodonString (const string &str, const CSeq_feat& feat, CScope *scope)
{
    CRef<CTrna_ext> trna;

    if (NStr::IsBlank (str)  ||  ! NStr::StartsWith (str, "(pos:")) {
        return trna;
    }

    // find position of the paren closing the one at the start
    const string::size_type close_paren = s_MatchingParenPos( str, 0 );
    if (close_paren == string::npos) {
        return trna;
    }

    trna.Reset( new CTrna_ext );

    // everything between "(pos:" and the closing paren
    string loc_text = str.substr (5, close_paren - 5);

    const string::size_type aa_tag = NStr::FindNoCase (loc_text, "aa:");
    if (aa_tag != string::npos) {
        const string abbrev = loc_text.substr (aa_tag + 3);
        TTrnaMap::const_iterator known_aa = sm_TrnaKeys.find (abbrev.c_str ());
        if (known_aa == sm_TrnaKeys.end ()) {
            // unable to parse
            return trna;
        }
        CRef<CTrna_ext::TAa> aa(new CTrna_ext::TAa);
        aa->SetIupacaa (known_aa->second);
        trna->SetAa(*aa);

        // keep only the location part, without trailing blanks or comma
        loc_text.erase (aa_tag);
        NStr::TruncateSpacesInPlace (loc_text);
        if (NStr::EndsWith (loc_text, ",")) {
            loc_text.erase (loc_text.length() - 1);
        }
    }

    CRef<CSeq_loc> anticodon = ReadLocFromText (loc_text, feat.GetLocation().GetId(), scope);
    if( anticodon ) {
        anticodon->SetStrand(eNa_strand_plus); // anticodon is always on plus strand
        trna->SetAnticodon(*anticodon);
    } else {
        // without a location the amino acid is not kept either
        trna->ResetAa();
    }

    return trna;        
}

// Translates an amino acid designation into its one-letter code.
//
// Surrounding whitespace is ignored.  A single character is accepted if,
// once upper-cased, it is one of the codes in sm_TrnaInverseKeys; longer
// text is looked up in sm_TrnaKeys (three-letter abbreviations, full names).
// Returns '\0' when the text is empty or not recognized.
static
char s_FindTrnaAA( const string &str )
{
    if ( str.empty() ) return '\0';
    string tmp = str;
    NStr::TruncateSpacesInPlace(tmp);
    
    if( tmp.length() == 1 ) {
        // if the string is a valid one-letter code, just return that.
        // toupper() must not be handed a negative value, so go through
        // unsigned char (plain char may be signed).
        const char aminoAcidLetter =
            static_cast<char>( toupper( static_cast<unsigned char>(tmp[0]) ) );
        if( sm_TrnaInverseKeys.find(aminoAcidLetter) != sm_TrnaInverseKeys.end() ) {
            return aminoAcidLetter;
        }
    } else {
        // translate 3-letter codes and full-names to one-letter codes
        TTrnaMap::const_iterator trna_iter = sm_TrnaKeys.find (tmp.c_str ());
        if( trna_iter != sm_TrnaKeys.end() ) {
            return trna_iter->second;
        }
    }

    return '\0';
}

// Unary predicate for use with standard algorithms (e.g. replace_if):
// returns true for every character that occurs in the string given at
// construction time.
class CCharInSet {
public:
    // list_of_characters: the characters to be recognized; duplicates are
    // harmless and an empty string yields a predicate that is never true.
    CCharInSet( const std::string &list_of_characters )
        : char_set( list_of_characters.begin(), list_of_characters.end() )
    {
    }

    // const, so that the predicate can also be invoked through a const
    // object or a const reference
    bool operator()( const char ch ) const {
        return ( char_set.find(ch) != char_set.end() );
    }

private:
    std::set<char> char_set;
};

static
void s_TokenizeTRnaString (const string &tRNA_string, list<string> &out_string_list )
{
    out_string_list.clear();
    if ( tRNA_string.empty() ) return;

    // SGD Tx(NNN)c or Tx(NNN)c#, where x is the amino acid, c is the chromosome (A-P, Q for mito),
    // and optional # is presumably for individual tRNAs with different anticodons and the same
    // amino acid.
    CCachedRegexp valid_sgd_regex = regexpCache.Get(
        "^[Tt][A-Za-z]\\(...\\)[A-Za-z]\\d?\\d?$");
    if ( valid_sgd_regex->IsMatch(tRNA_string) ) {
        // parse SGD tRNA anticodon
        out_string_list.push_back(kEmptyStr);
        string &new_SGD_tRNA_anticodon = out_string_list.back();
        string raw_codon_part = tRNA_string.substr(3,3);
        NStr::ToUpper( raw_codon_part );
        string reverse_complement;
        CSeqManip::ReverseComplement( raw_codon_part, CSeqUtil::e_Iupacna, 0, 3, reverse_complement );
        new_SGD_tRNA_anticodon = string("(") + reverse_complement + ')';

        // parse SGD tRNA amino acid
        out_string_list.push_back(tRNA_string.substr(1,1));
        return;
    }

    string tRNA_string_copy = tRNA_string;
    // Note that we do NOT remove "*", since it might be a terminator tRNA symbol
    replace_if( tRNA_string_copy.begin(), tRNA_string_copy.end(), 
        CCharInSet("-,;:()=\'_~"), ' ' );

    vector<string> tRNA_tokens;
    // " \t\n\v\f\r" are the standard whitespace chars
    // ( source: http://www.cplusplus.com/reference/clibrary/cctype/isspace/ )
    NStr::Tokenize( tRNA_string_copy, " \t\n\v\f\r", tRNA_tokens, NStr::eMergeDelims );

    EDIT_EACH_STRING_IN_VECTOR( tRNA_token_iter, tRNA_tokens ) {
        string &tRNA_token = *tRNA_token_iter;
        // remove initial "tRNA", if any
        if ( NStr::StartsWith(tRNA_token, "tRNA", NStr::eNocase) ) {
            tRNA_token = tRNA_token.substr(4);
        }
        CCachedRegexp threeLettersPlusDigits = regexpCache.Get(
            "^[A-Za-z][A-Za-z][A-Za-z]\\d*$");
        if (! tRNA_token.empty() ) {
            if ( threeLettersPlusDigits->IsMatch(tRNA_token) ) {
                tRNA_token = tRNA_token.substr(0, 3);
            }
            out_string_list.push_back(tRNA_token);
        }
    }
}

// For each position i of the string "?ACMGRSVTWYHKDBN" used below, the
// nucleotides that the letter at that position stands for.
static const char *codonLetterExpand [] =
{
  "?", "A", "C", "AC",
  "G", "AG", "CG", "ACG",
  "T", "AT", "CT", "ACT",
  "GT", "AGT", "CGT", "ACGT",
  NULL
};

// Expands a codon whose third letter may be a degenerate nucleotide code
// into the list of concrete codons and stores their indices in tRNA.
//
// Returns false, leaving both arguments untouched, when the codon is shorter
// than three letters, when either of its first two letters is not one of
// A, C, G, T, or when the third letter is not one of "?ACMGRSVTWYHKDBN".
// Otherwise the codon is cut to three letters, the codon list of tRNA is
// replaced by one index per expansion of the third letter, and true is
// returned; `codon` is left holding the last expansion.
static
bool s_ParseDegenerateCodon( CTrna_ext & tRNA, string & codon )
{
  const static string intToChr = "?ACMGRSVTWYHKDBN";

  if( codon.length() < 3 ) {
      return false;
  }

  // the first two have to be real nucleotides
  // (string::npos is never less than 2, so "no bad char" passes this test)
  if( codon.find_first_not_of("ACGT") < 2 ) {
      return false;
  }

  const string::size_type letter_idx = intToChr.find( codon [2] );
  if( letter_idx == string::npos ) {
      return false;
  }

  codon.erase(3);
  tRNA.SetCodon().clear();

  // one codon per nucleotide the third letter stands for (never more than 6)
  int num_added = 0;
  for( const char *nuc = codonLetterExpand [letter_idx];
       *nuc != '\0' && num_added < 6;
       ++nuc, ++num_added )
  {
    codon [2] = *nuc;
    tRNA.SetCodon().push_back( CGen_code_table::CodonToIndex(codon) ); // TODO: make sure Seq_code_iupacna
  }

  return true;
}

// based on C's ParseTRnaString
static 
char s_ParseSeqFeatTRnaString( const string &comment, bool *out_justTrnaText, string &tRNA_codon, bool noSingleLetter )
{
    if (out_justTrnaText != NULL) {
        *out_justTrnaText = false;
    }
    tRNA_codon.clear();

    if ( comment.empty() ) return '\0';

    CRef<CTrna_ext> tr( new CTrna_ext );

    char aa = '\0';
    list<string> head;
    s_TokenizeTRnaString (comment, head);
    bool justt = true;
    list<string>::const_iterator head_iter = head.begin();
    bool is_A = false;
    bool is_ambig = false;
    for( ; head_iter != head.end(); ++head_iter ) {
        const string &str = *head_iter;
        if( str.empty() ) continue;
        char curraa = '\0';
        if (noSingleLetter && str.length() == 1) {
            curraa = '\0';
        } else {
            curraa = s_FindTrnaAA (str);
        }
        if( curraa == 'A' && str.length() == 1 ) {
            is_A = true;
            curraa = 0;
        } else if (curraa != '\0') {
            if (aa == '\0') {
                aa = curraa;
            } else if( curraa != aa) {
                is_ambig = true;
            }
        } else if ( ! NStr::EqualNocase ("tRNA", str) &&
            ! NStr::EqualNocase ("transfer", str) &&
            ! NStr::EqualNocase ("RNA", str) &&
            ! NStr::EqualNocase ("product", str) ) 
        {
            if ( str.length() == 3) {
                tRNA_codon = str;
                NStr::ReplaceInPlace( tRNA_codon, "U", "T" );
                if (s_ParseDegenerateCodon ( *tr, tRNA_codon)) {
                    tRNA_codon.clear();
                    copy( tr->GetCodon().begin(), tr->GetCodon().end(), back_inserter(tRNA_codon) );
                    justt = false;
                } else {
                    justt = false;
                }
            } else {
                justt = false;
            }
        }
    }
    if( is_A && aa == 0 ) {
        aa = 'A';
    }
    if( is_ambig ) {
        aa = 0;
    }

    if (justt) {
        if( comment.find_first_of("0123456789") != string::npos ) {
            justt = false;
        }
    }
    if (out_justTrnaText != NULL) {
        *out_justTrnaText = justt;
    }
    return aa;
}


// homologous to C's HandledGBQualOnRNA.
// That func was copy-pasted, then translated into C++.
// Later we can go back and actually refactor the code
// to make it more efficient or cleaner.
//
// Handles one GenBank qualifier found on an RNA feature.
//
// Two groups of qualifiers are processed (names matched case-insensitively):
//   - "product" (and "standard_name", unless the entry is flagged as
//     EMBL/DDBJ): the value is turned into tRNA data, an RNA-gen product,
//     the RNA name, or appended to the feature comment, depending on the RNA
//     type and on what the RNA-ref already holds;
//   - "anticodon": the value is parsed and, if it does not conflict with
//     existing tRNA data, its amino acid and anticodon location are applied.
//
// Returns eAction_Erase when the qualifier has been absorbed or is redundant
// (presumably telling the caller to drop it), eAction_Nothing otherwise.
// A qualifier without a value is ignored.
CNewCleanup_imp::EAction 
CNewCleanup_imp::x_SeqFeatRnaGBQualBC(CSeq_feat& feat, CRNA_ref& rna, CGb_qual& gb_qual)
{
    if( ! gb_qual.IsSetVal() ) {
        return eAction_Nothing;
    }
    const string &gb_qual_qual = gb_qual.GetQual();
    string &gb_qual_val = gb_qual.SetVal();
    TRNAREF_TYPE& rna_type = rna.SetType();
    const bool is_std_name = NStr::EqualNocase( gb_qual_qual, "standard_name" );
    if (NStr::EqualNocase( gb_qual_qual, "product" ) ||
        (is_std_name && (! m_SeqEntryInfoStack.top().m_IsEmblOrDdbj) )) 
    {
        // an unknown RNA type is promoted to "other"
        if (rna_type == NCBI_RNAREF(unknown)) {
            rna_type = NCBI_RNAREF(other);
            ChangeMade(CCleanupChange::eChangeRNAref);
        }
        if (rna_type == NCBI_RNAREF(other) && is_std_name) return eAction_Nothing;
        // drop an ext that carries no information: an empty name ...
        if ( rna.IsSetExt() && rna.GetExt().IsName() ) {
            const string &name = rna.SetExt().SetName();
            if ( name.empty() ) {
                rna.ResetExt();
                ChangeMade(CCleanupChange::eChangeRNAref);
            }
        }
        // ... or a tRNA ext without aa, anticodon and codons
        if ( rna.IsSetExt() && rna.GetExt().IsTRNA() ) {
            CRNA_ref_Base::C_Ext::TTRNA& trp = rna.SetExt().SetTRNA();
            if ( ! trp.IsSetAa() && ! trp.IsSetAnticodon() ) {
                if( ! trp.IsSetCodon() || trp.GetCodon().empty() ) {
                    rna.ResetExt();
                    ChangeMade(CCleanupChange::eChangeRNAref);
                }
            }
        }
        // tRNA whose ext is still a plain name: try to turn the name into
        // structured tRNA data
        if (rna_type == NCBI_RNAREF(tRNA) && rna.IsSetExt() && rna.GetExt().IsName() ) {
            const string &name = rna.SetExt().SetName();
            bool justTrnaText = false;
            string codon;
            char aa = s_ParseSeqFeatTRnaString( name, &justTrnaText, codon, false );
            if (aa != '\0') {
                // must be evaluated before SetTRNA() below replaces the
                // name variant that `name` refers to
                const bool is_fMet = ( NStr::Find(name, "fMet") != NPOS );
                CRNA_ref_Base::C_Ext::TTRNA &trp = rna.SetExt().SetTRNA();
                trp.SetAa().SetNcbieaa(aa);
                if (justTrnaText) {
                    copy( codon.begin(), codon.end(), back_inserter(trp.SetCodon()) );
                }
                if (aa == 'M') {
                    if (is_fMet) {
                        if ( ! feat.IsSetComment() ) {
                            feat.SetComment("fMet");
                        } else {
                            feat.SetComment() += "; fMet";
                        }
                    }
                }
                x_SeqFeatTRNABC(feat, trp);
                ChangeMade(CCleanupChange::eChangeRNAref);
            }
        }
        // tRNA without any ext: build the tRNA data from the qualifier value,
        // or move the value into the comment if no amino acid is found;
        // either way the qualifier is consumed
        if (rna_type == NCBI_RNAREF(tRNA) && ! rna.IsSetExt() ) {
            // this part inserted from: AddQualifierToFeature (sfp, "product", gb_qual_val);
            bool justTrnaText = false;
            string codon;
            char aa = s_ParseSeqFeatTRnaString (gb_qual_val, &justTrnaText, codon, false);
            if (aa != '\0') {
                
                CRNA_ref_Base::C_Ext::TTRNA& trna = rna.SetExt().SetTRNA();
                trna.SetAa().SetNcbieaa(aa);
                
                if (justTrnaText) {
                    copy( codon.begin(), codon.end(), back_inserter(trna.SetCodon()) );
                } else {
                    // the value says more than the tRNA data can hold:
                    // keep the full text in the comment
                    if( ! feat.IsSetComment() ) {
                        feat.SetComment(gb_qual_val);
                    } else {
                        feat.SetComment() += "; " + gb_qual_val;
                    }
                }
                
                if (aa == 'M') {
                    if( NStr::Find(gb_qual_val, "fMet") != NPOS ) {
                        if ( ! feat.IsSetComment() ) {
                            feat.SetComment("fMet");
                        } else {
                            feat.SetComment() += "; fMet";
                        }
                    }
                }

                ChangeMade(CCleanupChange::eChangeRNAref);
            } else {
                if ( ! feat.IsSetComment() ) {
                    feat.SetComment(gb_qual_val);
                } else {
                    feat.SetComment() += "; ";
                    feat.SetComment() += gb_qual_val;
                }
                ChangeMade(CCleanupChange::eChangeComment);
            }
            return eAction_Erase;
        }
        // tRNA that already has an amino acid: the qualifier is redundant if
        // it names the same amino acid
        if (rna_type == NCBI_RNAREF(tRNA) && rna.IsSetExt() && rna.GetExt().IsTRNA() ) {
            CRNA_ref_Base::C_Ext::TTRNA& trp = rna.SetExt().SetTRNA();
            if ( trp.IsSetAa() && trp.GetAa().IsNcbieaa() ) {
                string ignored;
                if ( trp.GetAa().GetNcbieaa() == s_ParseSeqFeatTRnaString (gb_qual_val, NULL, ignored, false)) {
                    return eAction_Erase;
                }
            }
        }
        // RNA-gen ext: the value becomes the product if there is none yet
        if( FIELD_IS_SET_AND_IS(rna, Ext, Gen) ) {
            CRNA_gen & rna_gen = rna.SetExt().SetGen();
            if( RAW_FIELD_IS_EMPTY_OR_UNSET(rna_gen, Product) ) {
                rna_gen.SetProduct(gb_qual_val);
                ChangeMade(CCleanupChange::eChangeRNAref);
                return eAction_Erase;
            }
            return eAction_Nothing;
        }
        // from here on only an unset ext or a name ext is handled
        if ( rna.IsSetExt() && ! rna.GetExt().IsName() ) return eAction_Nothing;
        const string &name = ( rna.IsSetExt() ? rna.GetExt().GetName() : kEmptyStr );
        if (! name.empty() ) {
            // turn the first "rDNA" in the value into "rRNA"
            SIZE_TYPE rDNA_pos = NStr::Find( gb_qual_val, "rDNA");
            if (rDNA_pos != NPOS) {
                gb_qual_val[rDNA_pos+1] = 'R';
                ChangeMade(CCleanupChange::eChangeQualifiers);
            }
            if ( NStr::EqualNocase(name, gb_qual_val) ) {
                return eAction_Erase;
            }
            if (rna_type == NCBI_RNAREF(other) || rna_type == NCBI_RNAREF(ncRNA) || 
                rna_type == NCBI_RNAREF(tmRNA) || rna_type == NCBI_RNAREF(miscRNA) ) 
            {
                // new convention follows ASN.1 spec comments, allows new RNA types
                return eAction_Nothing;
            }
            // subsequent /product now added to comment
            // NOTE(review): the test below looks for the existing comment
            // inside the qualifier value, not for the value inside the
            // comment -- confirm the argument order is intended.
            if ( ! feat.IsSetComment() ) {
                feat.SetComment( gb_qual_val );
                gb_qual.ResetVal();
            } else if ( NStr::Find(gb_qual_val, feat.GetComment()) == NPOS) {
                feat.SetComment() += "; ";
                feat.SetComment() += gb_qual_val;
            }
            ChangeMade(CCleanupChange::eChangeComment);
            return eAction_Erase;
        }
        if (rna_type == NCBI_RNAREF(ncRNA) || 
            rna_type == NCBI_RNAREF(tmRNA) || rna_type == NCBI_RNAREF(miscRNA) ) 
        {
            // new convention follows ASN.1 spec comments, allows new RNA types
            return eAction_Nothing;
        }
        // no name so far: the qualifier value becomes the RNA name
        if ( ! FIELD_CHOICE_EQUALS( rna, Ext, Name, gb_qual_val) ) {
            rna.SetExt().SetName( gb_qual_val );
            ChangeMade(CCleanupChange::eChangeRNAref);
            return eAction_Erase;
        }
    } else if (NStr::EqualNocase(gb_qual_qual, "anticodon") ) {
        // an anticodon only makes sense on a tRNA: an unset or unknown type
        // is set to tRNA, any other type is left alone
        if (!rna.IsSetType()) {
            rna.SetType(CRNA_ref::eType_tRNA);
            ChangeMade(CCleanupChange::eChangeKeywords);
        }
        _ASSERT(rna.IsSetType());
        CRNA_ref::TType type = rna.GetType();
        if (type == CRNA_ref::eType_unknown) {
            rna.SetType(CRNA_ref::eType_tRNA);
            ChangeMade(CCleanupChange::eChangeKeywords);
        } else if (type != CRNA_ref::eType_tRNA) {
            return eAction_Nothing;
        }
        if (!rna.IsSetExt()) {
            rna.SetExt().SetTRNA();
            ChangeMade(CCleanupChange::eChangeRNAref);
        }
        if ( rna.IsSetExt()  &&
             rna.GetExt().Which() == NCBI_RNAEXT(TRNA) ) {
            
            CRef<CTrna_ext> trna = s_ParseTRnaFromAnticodonString( gb_qual.GetVal(), feat, m_Scope );
            if( ! trna ) {
                return eAction_Nothing;
            }

            x_SeqFeatTRNABC( feat, *trna );
            if (trna->IsSetAa() || trna->IsSetAnticodon()) {
                // don't apply at all if there are conflicts
                bool apply_aa = false;
                bool apply_anticodon = false;
                bool ok_to_apply = true;
                
                // look for conflict with aa
                // NOTE(review): s_ParseTRnaFromAnticodonString stores the
                // parsed aa with SetIupacaa(); which variant it holds at this
                // point depends on what x_SeqFeatTRNABC (not shown here) did
                // to it.  Confirm that GetIupacaa() here and GetNcbieaa()
                // further down both match the variant actually set.
                if (trna->IsSetAa() ) {
                    if (rna.GetExt().GetTRNA().IsSetAa() ) {
                        if( rna.GetExt().GetTRNA().GetAa().IsIupacaa() ) {
                            if (trna->GetAa().GetIupacaa() != rna.GetExt().GetTRNA().GetAa().GetIupacaa()) {
                                ok_to_apply = false;
                            }
                        }
                    } else {
                        apply_aa = true;
                    }
                }
                // look for conflict with anticodon
                if (trna->IsSetAnticodon()) {
                    if (rna.GetExt().GetTRNA().IsSetAnticodon()) {
                        if (sequence::Compare(rna.GetExt().GetTRNA().GetAnticodon(), trna->GetAnticodon(), m_Scope) != sequence::eSame) {
                            ok_to_apply = false;
                        }
                    } else {
                        apply_anticodon = true;
                    }
                }

                // without conflicts, fill in whatever was missing and
                // consume the qualifier
                if (ok_to_apply) {
                    if (apply_aa ) {
                        rna.SetExt().SetTRNA().SetAa().SetIupacaa(trna->GetAa().GetNcbieaa());
                        ChangeMade (CCleanupChange::eChange_tRna);
                    }
                    if (apply_anticodon) {
                        CRef<CSeq_loc> anticodon(new CSeq_loc());
                        anticodon->Add (trna->GetAnticodon());
                        rna.SetExt().SetTRNA().SetAnticodon(*anticodon);
                        ChangeMade (CCleanupChange::eChangeAnticodon);
                    }
                    return eAction_Erase;
                }
            }
        }
    }
    return eAction_Nothing;
}


// Parses a code-break description string and, on success, appends a new
// CCode_break to the coding region.
//
// The string is scanned for two pieces:
//   - an amino acid name, located after "aa:" (or, failing that, after the
//     first ':' that follows the first ','); it is passed to
//     ValidAminoAcid() and defaults to 'X' when nothing alphabetic is found;
//   - a location, located after "(pos:" and running up to ",aa:" (or the
//     next ',' or the end of the string).
//
// feat  feature whose location supplies the Seq-id and strand for the break
// cds   coding region that receives the new code break
// str   text to parse
//
// Returns eAction_Erase when a code break was added, eAction_Nothing when
// the string could not be used (empty string, feature without a location or
// single Seq-id, no "(pos:", unparsable location, or an interval location
// that is not contained in the feature or is not exactly 3 bases long).
CNewCleanup_imp::EAction CNewCleanup_imp::x_ParseCodeBreak(const CSeq_feat& feat, CCdregion& cds, const string& str)
{
    if( str.empty() || ! feat.IsSetLocation() ) {
        return eAction_Nothing;
    }

    const CSeq_id* feat_loc_seq_id = feat.GetLocation().GetId();
    if( ! feat_loc_seq_id ) {
        return eAction_Nothing;
    }

    string::size_type aa_pos = NStr::Find(str, "aa:");
    string::size_type len = 0;
    string::size_type loc_pos, end_pos;
    char protein_letter = 'X';
    CRef<CSeq_loc> break_loc;
    
    if (aa_pos == string::npos) {
        // no explicit "aa:" label; fall back to the first ':' after the first ','
        aa_pos = NStr::Find (str, ",");
        if (aa_pos != string::npos) {
            aa_pos = NStr::Find (str, ":", aa_pos);
        }
        if (aa_pos != string::npos) {
            aa_pos ++;
        }
    } else {
        aa_pos += 3;
    }

    if (aa_pos != string::npos) {    
        // the casts keep the <cctype> calls well-defined for bytes >= 0x80
        while (aa_pos < str.length() && isspace (static_cast<unsigned char>(str[aa_pos]))) {
            aa_pos++;
        }
        while (aa_pos + len < str.length() && isalpha (static_cast<unsigned char>(str[aa_pos + len]))) {
            len++;
        }
        if (len != 0) {    
            protein_letter = ValidAminoAcid(str.substr(aa_pos, len));
        }
    }
    
    loc_pos = NStr::Find (str, "(pos:");
    if (loc_pos == string::npos) {
        return eAction_Nothing;
    }
    loc_pos += 5;
    while (loc_pos < str.length() && isspace (static_cast<unsigned char>(str[loc_pos]))) {
        loc_pos++;
    }

    end_pos = NStr::Find (str, ",aa:", loc_pos);
    if( end_pos == NPOS ) {
        end_pos = NStr::Find (str, ",", loc_pos);
        if (end_pos == NPOS) {
            end_pos = str.length();
        }
    }

    string pos = NStr::TruncateSpaces(str.substr(loc_pos, end_pos - loc_pos));

    // handle multi-interval positions by adding a join() around them
    if( pos.find_first_of(",") != string::npos ) {
        pos = "join(" + pos + ")";
    }

    break_loc = ReadLocFromText (pos, feat_loc_seq_id, m_Scope);
    // The location text may be unparsable; bail out before touching the
    // result (the strand adjustment below dereferences it).
    if( ! break_loc ) {
        return eAction_Nothing;
    }

    // the code break inherits the strand of the feature, if it has a definite one
    if( FIELD_IS_SET(feat.GetLocation(), Strand) && GET_FIELD(feat.GetLocation(), Strand) != eNa_strand_unknown ) {
        break_loc->SetStrand( GET_FIELD( feat.GetLocation(), Strand) );
    } else {
        RESET_FIELD( *break_loc, Strand );
    }
    
    // single-interval breaks must lie inside the feature and cover one codon
    if ( (break_loc->IsInt() && sequence::Compare (*break_loc, feat.GetLocation(), m_Scope) != sequence::eContained )
        || (break_loc->IsInt() && sequence::GetLength(*break_loc, m_Scope) != 3)) {
        return eAction_Nothing;
    }
    
    // need to build code break object and add it to coding region
    CRef<CCode_break> newCodeBreak(new CCode_break());
    CCode_break::TAa& aa = newCodeBreak->SetAa();
    aa.SetNcbieaa(protein_letter);
    newCodeBreak->SetLoc (*break_loc);

    CCdregion::TCode_break& orig_list = cds.SetCode_break();
    orig_list.push_back(newCodeBreak);
    
    ChangeMade(CCleanupChange::eChangeCodeBreak);
    
    return eAction_Erase;
}

// Moves the information carried by a GenBank qualifier into a protein
// reference where that makes sense.
//
//   product / standard_name : becomes a protein name (appended in normal
//                             mode, prepended otherwise); skipped, and the
//                             qualifier kept, when not in CDS mode and the
//                             protein already has a non-blank first name
//   function                : added to the activity list
//   EC_number               : added to the EC list
//
// Returns eAction_Nothing for qualifiers that must be left in place and
// eAction_Erase for everything else (including the ones just transferred).
CNewCleanup_imp::EAction 
CNewCleanup_imp::x_ProtGBQualBC(CProt_ref& prot, const CGb_qual& gb_qual, EGBQualOpt opt )
{
    const string& qual = gb_qual.GetQual();
    const string& val  = gb_qual.GetVal();

    const bool is_name_qual =
        NStr::EqualNocase(qual, "product")  ||  NStr::EqualNocase(qual, "standard_name");

    if ( is_name_qual ) {
        // evaluated left to right, so front() is only reached when a name is set
        const bool may_take_name =
            ( opt == eGBQualOpt_CDSMode || !prot.IsSetName()  ||  NStr::IsBlank(prot.GetName().front()) );
        if ( ! may_take_name ) {
            return eAction_Nothing;
        }
        if ( opt == eGBQualOpt_normal ) {
            prot.SetName().push_back(val);
        } else {
            prot.SetName().push_front(val);
        }
        ChangeMade(CCleanupChange::eChangeQualifiers);
    } else if (NStr::EqualNocase(qual, "function")) {
        ADD_STRING_TO_LIST( prot.SetActivity(), val );
        ChangeMade(CCleanupChange::eChangeQualifiers);
    } else if (NStr::EqualNocase(qual, "EC_number")) {
        ADD_STRING_TO_LIST( prot.SetEc(), val );
        ChangeMade(CCleanupChange::eChangeQualifiers);
    }

    // qualifiers that stay on the feature untouched
    static const char * const ignored_quals[] = 
        { "label", "allele", "experiment", "inference", "UniProtKB_evidence" };
    static set<string, PNocase> ignored_quals_raw; 

    // The lookup set is filled lazily on first use.  The mutex only guards
    // that one-time fill against two threads racing to do it; the lookup
    // below is done without the lock.
    static CMutex ignored_quals_raw_initialization_mutex;
    {
        CMutexGuard guard(ignored_quals_raw_initialization_mutex);
        if( ignored_quals_raw.empty() ) {
            const size_t num_ignored = sizeof(ignored_quals)/sizeof(ignored_quals[0]);
            ignored_quals_raw.insert( ignored_quals, ignored_quals + num_ignored );
        }
    }

    const bool leave_alone = ( ignored_quals_raw.find(qual) != ignored_quals_raw.end() );

    // anything not explicitly ignored is not appropriate on a protein feature
    return ( leave_alone ? eAction_Nothing : eAction_Erase );
}


// BioSource cleanup entry point ("EC" presumably stands for Extended
// Cleanup -- confirm against the class declaration).  At present it only
// delegates to x_AddEnvSamplOrMetagenomic().
void CNewCleanup_imp::BioSourceEC(CBioSource& biosrc)
{
    x_AddEnvSamplOrMetagenomic(biosrc);
}


// Adds an "environmental_sample" and/or "metagenomic" subsource to the
// BioSource when the organism's lineage or division calls for one and the
// BioSource does not already carry it.
//
//   lineage contains "environmental sample" (any case) -> environmental_sample
//   lineage contains "metagenomes" (any case)          -> metagenomic
//   division equals "ENV"                              -> environmental_sample
void CNewCleanup_imp::x_AddEnvSamplOrMetagenomic(CBioSource& biosrc)
{
    // nothing to go on without an org-name
    if ( ! biosrc.IsSetOrg() || ! biosrc.GetOrg().IsSetOrgname() ) {
        return;
    }
    const COrgName& orgname = biosrc.GetOrg().GetOrgname();

    bool want_env_sample = false;
    bool want_metagenomic = false;
    if (orgname.IsSetLineage()) {
        const string& lineage = orgname.GetLineage();
        want_env_sample  = (NStr::FindNoCase(lineage, "environmental sample") != string::npos);
        want_metagenomic = (NStr::FindNoCase(lineage, "metagenomes") != string::npos);
    }
    if (orgname.IsSetDiv() && NStr::Equal(orgname.GetDiv(), "ENV")) {
        want_env_sample = true;
    }

    if ( ! want_env_sample && ! want_metagenomic ) {
        return;
    }

    // see which of the two subsources are already present
    bool found_env_sample = false;
    bool found_metagenomic = false;
    if ( biosrc.IsSetSubtype()) {
        const CBioSource::TSubtype& subtypes = biosrc.GetSubtype();
        for (CBioSource::TSubtype::const_iterator sub_it = subtypes.begin();
             sub_it != subtypes.end();  ++sub_it)
        {
            if ( ! (*sub_it)->IsSetSubtype() ) {
                continue;
            }
            if ((*sub_it)->GetSubtype() == CSubSource::eSubtype_environmental_sample) {
                found_env_sample = true;
            }
            if ((*sub_it)->GetSubtype() == CSubSource::eSubtype_metagenomic) {
                found_metagenomic = true;
            }
        }
    }

    // add whatever is wanted but missing (both are flag-like: empty name)
    if (want_env_sample && !found_env_sample) {
        CRef<CSubSource> env_sub(new CSubSource(CSubSource::eSubtype_environmental_sample, ""));
        biosrc.SetSubtype().push_back(env_sub);
        ChangeMade(CCleanupChange::eAddSubSource);
    }
    if (want_metagenomic && !found_metagenomic) {
        CRef<CSubSource> meta_sub(new CSubSource(CSubSource::eSubtype_metagenomic, ""));
        biosrc.SetSubtype().push_back(meta_sub);
        ChangeMade(CCleanupChange::eAddSubSource);
    }
}


// Removes nesting from a Pub-equiv: every member that is itself a Pub-equiv
// is flattened recursively, its members are appended to this Pub-equiv, and
// the nested member is then erased.
void CNewCleanup_imp::x_FlattenPubEquiv(CPub_equiv& pub_equiv)
{
    CPub_equiv::Tdata& top_level_pubs = pub_equiv.Set();
    
    EDIT_EACH_PUB_ON_PUBEQUIV(pub_iter, pub_equiv ) {
        if( FIELD_IS(**pub_iter, Equiv) ) {
            CPub_equiv& nested = GET_MUTABLE(**pub_iter, Equiv);

            // flatten the child first so that only plain pubs get hoisted
            x_FlattenPubEquiv(nested);

            // hoist the child's (now flat) members to the end of our own
            // container, then drop the emptied-out wrapper
            top_level_pubs.insert( top_level_pubs.end(),
                                   nested.Set().begin(), nested.Set().end() );
            ERASE_PUB_ON_PUBEQUIV( pub_iter, pub_equiv );
            ChangeMade(CCleanupChange::eChangePublication);
        }
    }
}

// Drops implausible components from a standard date.
//
// Month (1-12), day (1-31) and second (0-59) are each reset when out of
// range.  The time-of-day fields are additionally treated as a hierarchy:
// without a usable minute the second is dropped, and without a usable hour
// both minute and second are dropped.
void CNewCleanup_imp::x_DateStdBC( CDate_std& date )
{
    if ( FIELD_OUT_OF_RANGE(date, Month, 1, 12) ) {
        RESET_FIELD(date, Month);
        ChangeMade(CCleanupChange::eCleanupDate);
    }

    // The upper bound is not adjusted per month (e.g. 30 for April); doing
    // so would require leap-year handling as well.
    if ( FIELD_OUT_OF_RANGE(date, Day, 1, 31) ) {
        RESET_FIELD(date, Day);
        ChangeMade(CCleanupChange::eCleanupDate);
    }

    if ( FIELD_OUT_OF_RANGE(date, Second, 0, 59) ) {
        RESET_FIELD(date, Second);
        ChangeMade(CCleanupChange::eCleanupDate);
    }

    // minute missing or bad: remove it and everything finer
    const bool minute_unusable =
        ( ! FIELD_IS_SET(date, Minute) || FIELD_OUT_OF_RANGE(date, Minute, 0, 59) );
    if ( minute_unusable ) {
        if( FIELD_IS_SET(date, Minute) ) {
            RESET_FIELD(date, Minute);
            ChangeMade(CCleanupChange::eCleanupDate);
        }
        if( FIELD_IS_SET(date, Second) ) {
            RESET_FIELD(date, Second);
            ChangeMade(CCleanupChange::eCleanupDate);
        }
    }
    
    // hour missing or bad: remove it and everything finer
    const bool hour_unusable =
        ( ! FIELD_IS_SET(date, Hour) || FIELD_OUT_OF_RANGE(date, Hour, 0, 23) );
    if ( hour_unusable ) {
        if( FIELD_IS_SET(date, Hour) ) {
            RESET_FIELD(date, Hour);
            ChangeMade(CCleanupChange::eCleanupDate);
        }
        if( FIELD_IS_SET(date, Minute) ) {
            RESET_FIELD(date, Minute);
            ChangeMade(CCleanupChange::eCleanupDate);
        }
        if( FIELD_IS_SET(date, Second) ) {
            RESET_FIELD(date, Second);
            ChangeMade(CCleanupChange::eCleanupDate);
        }
    }
}

// Cleans up the person-id of an author, if the author has one.
// fix_initials is forwarded unchanged to x_PersonIdBC().
void CNewCleanup_imp::x_AuthorBC( CAuthor& au, bool fix_initials )
{
    if ( ! FIELD_IS_SET(au, Name) ) {
        return;
    }
    x_PersonIdBC( GET_MUTABLE(au, Name), fix_initials);
}

// Cleans up a person-id according to which variant it holds: structured
// names go through x_NameStdBC(); the plain-string variants (ml, str,
// consortium) get their surrounding spaces truncated.  Any other variant is
// left untouched.
void CNewCleanup_imp::x_PersonIdBC( CPerson_id& pid, bool fix_initials )
{
    const CPerson_id::E_Choice which = pid.Which();

    if ( which == NCBI_PERSONID(Name) ) {
        x_NameStdBC( GET_MUTABLE(pid, Name), fix_initials );
    } else if ( which == NCBI_PERSONID(Ml) ) {
        TRUNCATE_CHOICE_SPACES(pid, Ml);
    } else if ( which == NCBI_PERSONID(Str) ) {
        TRUNCATE_CHOICE_SPACES(pid, Str);
    } else if ( which == NCBI_PERSONID(Consortium) ) {
        TRUNCATE_CHOICE_SPACES(pid, Consortium);
    }
}

// Normalizes a structured personal name in place.
//
// The work is done in two phases that mirror the C toolkit (which went
// through an intermediate tabbed string): first the individual fields are
// stripped of periods / leading spaces and the initials implied by the first
// name are collected; then initials, suffix and a few special values
// ("et al") are rebuilt in canonical form.  Finally the string members are
// cleaned, and a trailing "Jr."/"Sr." found in the last name or the initials
// is moved into the suffix when no suffix is present.
//
// name          the name to normalize
// fix_initials  when true, the initials derived from the first name are
//               put in front of the existing initials (after removing the
//               part of the existing initials that already matches them);
//               when false, the derived initials are only used if the name
//               has no initials at all
//
// A publication change is recorded only if the name ends up different from
// what it was on entry.
//
// Note: all <cctype> calls receive their argument as unsigned char, since
// passing a negative plain char (any byte >= 0x80, e.g. in an accented
// author name) is undefined behavior.
void CNewCleanup_imp::x_NameStdBC ( CName_std& name, bool fix_initials )
{
    // there's a lot of shuffling around (e.g. adding and removing
    // periods in initials), so we can't determine
    // if we've actually changed anything until we get to the end of 
    // this function.
    CRef<CName_std> original_name( new CName_std );
    original_name->Assign( name );

    // if initials starts with uppercase, we remember to 
    // upcase the whole thing later
    bool upcaseinits = false;
    if( isupper( static_cast<unsigned char>( GET_STRING_FLD_OR_BLANK(name, Initials)[0] ) ) ) {
        upcaseinits = true;
    }

    string first_initials;
    // like in C: str = NameStdPtrToTabbedString (nsp, fixInitials);
    {
        if ( ! FIELD_IS_SET(name, Suffix) && FIELD_IS_SET(name, Initials) ) {
            x_ExtractSuffixFromInitials(name);
        }
        TRUNCATE_SPACES(name, First);
        if( FIELD_IS_SET(name, Initials) ) {
            NStr::ReplaceInPlace( GET_MUTABLE(name, Initials), ".", "" );
            NStr::TruncateSpacesInPlace( GET_MUTABLE(name, Initials), NStr::eTrunc_Begin );
        }
        if( FIELD_IS_SET(name, Last) ) {
            NStr::TruncateSpacesInPlace( GET_MUTABLE(name, Last), NStr::eTrunc_Begin );
        }
        if( FIELD_IS_SET(name, Middle) ) {
            NStr::TruncateSpacesInPlace( GET_MUTABLE(name, Middle), NStr::eTrunc_Begin );
        }
        x_FixEtAl( name );

        // extract initials from first name
        // like in C: FirstNameToInitials (first, first_initials, sizeof (first_initials) - 1);
        {
            if ( FIELD_IS_SET(name, First) ) {
                const string &first = GET_FIELD(name, First);
                string::size_type next_pos = 0;
                while ( next_pos < first.length() ) {
                    // skip initial spaces and hyphens
                    next_pos = first.find_first_not_of(" -", next_pos);
                    if( string::npos == next_pos ) break;
                    // if we hit a letter after that, copy the letter to inits
                    if( isalpha( static_cast<unsigned char>( first[next_pos] ) ) ) {
                        first_initials += first[next_pos];
                    }
                    // find next space or hyphen
                    next_pos = first.find_first_of(" -", next_pos);
                    if( string::npos == next_pos ) break;
                    // if it's a hyphen, copy it
                    if( first[next_pos] == '-' ) {
                        first_initials += '-';
                    }
                }
            }
        }

        if( FIELD_IS_SET(name, First) ) {
            NStr::ReplaceInPlace( GET_MUTABLE(name, First), ".", "" );
            NStr::TruncateSpacesInPlace( GET_MUTABLE(name, First), NStr::eTrunc_Begin );
        }

        if (fix_initials) {
            if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(name, Initials) ) {
                string & initials = GET_MUTABLE(name, Initials);

                // skip part of initials that matches first_initials
                // (case-insensitively); it is re-added from first_initials
                // in the second phase
                string::size_type initials_first_good_idx = 0;
                for( ; initials_first_good_idx < initials.length() &&
                        initials_first_good_idx < first_initials.length() && 
                        toupper( static_cast<unsigned char>( initials[initials_first_good_idx] ) ) ==
                            toupper( static_cast<unsigned char>( first_initials[initials_first_good_idx] ) ) ;
                    ++initials_first_good_idx )
                {
                    // do nothing
                }

                if( initials_first_good_idx > 0 ) {
                    initials.erase( 0, initials_first_good_idx );
                }
            }
        } else if ( RAW_FIELD_IS_EMPTY_OR_UNSET(name, Initials) && ! first_initials.empty() ) {
            SET_FIELD(name, Initials, first_initials );
        }

        if( FIELD_IS_SET(name, Suffix) ) {
            NStr::ReplaceInPlace( GET_MUTABLE(name, Suffix), ".", "" );
            NStr::TruncateSpacesInPlace( GET_MUTABLE(name, Suffix), NStr::eTrunc_Begin );
        }

        // This differs from C, which just deletes these fields.
        CLEAN_STRING_MEMBER(name, Title);
        CLEAN_STRING_MEMBER(name, Full);
    }

    // like in C: nsp = TabbedStringToNameStdPtr (str, fixInitials);
    {
        // initials = remove_spaces(first_initials+initials)
        if( fix_initials && ! first_initials.empty() ) {
            SET_FIELD(name, Initials, 
                first_initials + GET_STRING_FLD_OR_BLANK(name, Initials) );
        }
        if( FIELD_IS_SET(name, Initials) ) {
            string & initials = GET_MUTABLE(name, Initials);
            NStr::ReplaceInPlace( initials, " ", "" );
            NStr::ReplaceInPlace( initials, ",", "." );
            NStr::ReplaceInPlace( initials, ".ST.", ".St." );

            string new_initials;
            string::const_iterator initials_iter = initials.begin();
            // modify initials.  New version will be built in new_initials
            for( ; initials_iter != initials.end(); ++initials_iter ) {
                const char ch = *initials_iter;
                switch( ch ) {
                case '-':
                    // keep hyphens
                    new_initials += '-';
                    break;
                case '.':
                case ' ':
                    // erase periods and spaces
                    break;
                default:
                    // other characters: keep them, BUT...
                    new_initials += ch;

                    if( (initials_iter + 1) != initials.end()) {
                        const char next_char = *(initials_iter + 1);
                        if (! islower( static_cast<unsigned char>( next_char ) ) ) {
                            // if next character is not lower, add period
                            new_initials += '.';
                        }
                    }
                }
            }
            
            if( initials != new_initials ) {
                initials.swap(new_initials); // swap is faster than assignment
                new_initials.clear();
            }

            // add period if string is not empty and doesn't end with a period
            if( ! initials.empty() && ! NStr::EndsWith(initials, ".") ) {
                initials += '.';
            }
        }

        if( FIELD_IS_SET(name, Suffix) ) {
            string &suffix = GET_MUTABLE(name, Suffix);
            // remove spaces
            NStr::ReplaceInPlace( suffix, " ", "" );

            if ( ! suffix.empty() ) {
                // remove any period, if any, on the end
                if( NStr::EndsWith(suffix, ".") ) {
                    suffix.resize( suffix.length() - 1 );
                }

                // canonical spelling for ordinal and generational suffixes
                if( NStr::EqualNocase(suffix, "1d") ) {
                     suffix = "I";
                } else if( NStr::EqualNocase(suffix, "1st") ) {
                     suffix = "I";
                } else if( NStr::EqualNocase(suffix, "2d") ) {
                     suffix = "II";
                } else if( NStr::EqualNocase(suffix, "2nd") ) {
                     suffix = "II";
                } else if( NStr::EqualNocase(suffix, "3d") ) {
                     suffix = "III";
                } else if( NStr::EqualNocase(suffix, "3rd") ) {
                     suffix = "III";
                } else if( NStr::EqualNocase(suffix, "4th") ) {
                     suffix = "IV";
                } else if( NStr::EqualNocase(suffix, "5th") ) {
                     suffix = "V";
                } else if( NStr::EqualNocase(suffix, "6th") ) {
                     suffix = "VI";
                } else if( NStr::EqualNocase(suffix, "Sr") ) {
                     suffix = "Sr.";
                } else if( NStr::EqualNocase(suffix, "Jr") ) {
                     suffix = "Jr.";
                }
            }
        }

        // add dot to "et al"
        if ( FIELD_EQUALS(name, Last, "et al") ) {
            SET_FIELD(name, Last, "et al." );
        }

        // reset middle if it's blank
        if ( FIELD_EQUALS(name, Middle, kEmptyStr) ) {
            RESET_FIELD(name, Middle);
        }
    }

    // restore the leading capital remembered at the top of the function
    if (upcaseinits && FIELD_IS_SET(name, Initials) ) {
        string & initials = GET_MUTABLE(name, Initials);
        if( ! initials.empty() && islower( static_cast<unsigned char>( initials[0] ) ) ) {
            initials[0] = static_cast<char>( toupper( static_cast<unsigned char>( initials[0] ) ) );
        }
    }
    CLEAN_STRING_MEMBER(name, Last);
    CLEAN_STRING_MEMBER(name, First);
    CLEAN_STRING_MEMBER(name, Middle);
    CLEAN_STRING_MEMBER(name, Full);
    CLEAN_STRING_MEMBER(name, Initials);
    CLEAN_STRING_MEMBER(name, Suffix);
    CLEAN_STRING_MEMBER(name, Title);
    x_FixEtAl( name );

    // the last name is always left set, even if only to an empty string
    if( ! FIELD_IS_SET(name, Last) ) {
        SET_FIELD(name, Last, kEmptyCStr );
    }
    string &last = GET_MUTABLE(name, Last);
    if( RAW_FIELD_IS_EMPTY_OR_UNSET(name, Suffix) &&
        ( NStr::EndsWith(last, " Jr.") || NStr::EndsWith(last, " Sr.") ) ) 
    {
        // suffix takes the final "Jr."/"Sr."; the last name loses that plus
        // the separating space
        SET_FIELD(name, Suffix, last.substr( last.length() - 3 ) );
        last.resize( last.length() - 4 );
        NStr::TruncateSpacesInPlace( last );
    }

    if( FIELD_IS_SET(name, Initials) && RAW_FIELD_IS_EMPTY_OR_UNSET(name, Suffix) ) {
        string & initials = GET_MUTABLE(name, Initials);
        if( NStr::EndsWith(initials, ".Jr.") || NStr::EndsWith(initials, ".Sr.") ) {
            // only 3 characters are removed here so that the initials keep
            // their trailing period
            SET_FIELD(name, Suffix, initials.substr( initials.length() - 3 ) );
            initials.resize( initials.length() - 3 );
            NStr::TruncateSpacesInPlace( initials );
        }
    }

    if( ! original_name->Equals(name) ) {
        ChangeMade(CCleanupChange::eChangePublication);
    }
}

// Mapping of wrong suffixes to the correct ones.
// Each entry is { suffix as found, canonical form }.  The entries are
// listed in ascending case-sensitive order of the first element;
// NOTE(review): presumably CStaticArrayMap requires this ordering, so keep
// it when adding entries -- confirm against the CStaticArrayMap docs.
typedef SStaticPair<const char*, const char*> TStringPair;
static const TStringPair bad_sfxs[] = {
    { "1d"  , "I" },
    { "1st" , "I" },
    { "2d"  , "II" },
    { "2nd" , "II" },
    { "3d"  , "III" },
    { "3rd" , "III" },
    { "4th" , "IV" },
    { "5th" , "V" },
    { "6th" , "VI" },
    //{ "I."  , "I" }, // presumably commented out since it resembles initials
    { "II." , "II" },
    { "III.", "III" },
    { "IV." , "IV" },
    { "Jr"  , "Jr." },
    { "Sr"  , "Sr." },    
    //{ "V."  , "V" }, // presumably commented out since it resembles initials
    { "VI." , "VI" }
};
// string-keyed lookup table built from bad_sfxs
typedef CStaticArrayMap<string, string> TSuffixMap;
DEFINE_STATIC_ARRAY_MAP_WITH_COPY(TSuffixMap, sc_BadSuffixes, bad_sfxs);

// Moves a name suffix that was typed at the end of the initials into the
// Suffix field.
//
// Precondition (asserted): Initials is set and Suffix is not.
// Nothing is done unless the initials contain a period.  Otherwise the
// endings below are tried in order; on the first match the ending is cut
// off the initials, the corresponding suffix is stored, and the search
// stops.
void CNewCleanup_imp::x_ExtractSuffixFromInitials(CName_std& name)
{
    _ASSERT( FIELD_IS_SET(name, Initials)  &&  ! FIELD_IS_SET(name, Suffix) );

    string& initials = GET_MUTABLE(name, Initials);

    if (initials.find('.') == NPOS) {
        return;
    }

    // { ending looked for in the initials, suffix to store }
    static const struct {
        const char* ending;
        const char* suffix;
    } kSuffixEndings[] = {
        { "III",  "III" },
        { "III.", "III" },
        { "Jr",   "Jr"  },
        { "2nd",  "II"  },
        { "IV",   "IV"  },
        { "IV.",  "IV"  }
    };
    const size_t kNumEndings = sizeof(kSuffixEndings) / sizeof(kSuffixEndings[0]);

    for( size_t idx = 0; idx < kNumEndings; ++idx ) {
        const char* const ending = kSuffixEndings[idx].ending;
        if( NStr::EndsWith(initials, ending) ) {
            initials.resize( initials.length() - strlen(ending) );
            SET_FIELD(name, Suffix, kSuffixEndings[idx].suffix);
            return;
        }
    }
}

// Repairs an "et al." that was mis-parsed as a personal name.
//
// The pattern recognized is: last name "et", initials "al" / "al." / "Al.",
// and a first name that is either absent/empty or just "a".  Such a name is
// rewritten to last name "et al." with initials and first name removed.
void CNewCleanup_imp::x_FixEtAl(CName_std& name)
{
    // The final clause must look at First: Initials has already been
    // required to be one of the "al" spellings, so comparing it to "a"
    // there could never succeed.
    if( FIELD_EQUALS(name, Last, "et") &&
        ( FIELD_EQUALS(name, Initials, "al")  || 
          FIELD_EQUALS(name, Initials, "al.") ||
          FIELD_EQUALS(name, Initials, "Al.") ) &&
        ( RAW_FIELD_IS_EMPTY_OR_UNSET(name, First) ||
          FIELD_EQUALS(name, First, "a") ) )
    {
        RESET_FIELD( name, Initials );
        RESET_FIELD( name, First );
        SET_FIELD( name, Last, "et al." );
    }
}

// Adds a "replace" qualifier to the feature, taking its value from the
// first double-quoted substring of str (lower-cased).  Nothing happens
// unless str ends with ')' and contains a pair of double quotes.
void CNewCleanup_imp::x_AddReplaceQual(CSeq_feat& feat, const string& str)
{
    if (!NStr::EndsWith(str, ')')) {
        return;
    }

    const SIZE_TYPE open_quote = str.find_first_of('\"');
    if (open_quote == NPOS) {
        return;
    }
    const SIZE_TYPE close_quote = str.find_first_of('\"', open_quote + 1);
    if (close_quote == NPOS) {
        return;
    }

    // text strictly between the two quotes
    string replace_val = str.substr(open_quote + 1, (close_quote - open_quote) - 1);
    NStr::ToLower(replace_val);
    feat.AddQualifier("replace", replace_val );
    ChangeMade(CCleanupChange::eChangeQualifiers);
}

// Basic cleanup of a single Seq-interval:
//   - if "from" is greater than "to", the two are exchanged;
//   - strand "both" becomes plus, "both-rev" becomes minus, and an explicit
//     "unknown" strand is removed.
void CNewCleanup_imp::x_SeqIntervalBC( CSeq_interval & seq_interval )
{
    // Fix backwards intervals
    const bool has_both_ends = seq_interval.CanGetFrom()  &&  seq_interval.CanGetTo();
    if ( has_both_ends  &&  seq_interval.GetFrom() > seq_interval.GetTo() ) {
        swap(seq_interval.SetFrom(), seq_interval.SetTo());
        ChangeMade(CCleanupChange::eChangeSeqloc);
    }

    // change bad strand values.
    if ( ! seq_interval.CanGetStrand() ) {
        return;
    }
    switch ( seq_interval.GetStrand() ) {
    case eNa_strand_both:
        seq_interval.SetStrand(eNa_strand_plus);
        ChangeMade(CCleanupChange::eChangeStrand);
        break;
    case eNa_strand_both_rev:
        seq_interval.SetStrand(eNa_strand_minus);
        ChangeMade(CCleanupChange::eChangeStrand);
        break;
    case eNa_strand_unknown:
        seq_interval.ResetStrand();
        ChangeMade(CCleanupChange::eChangeStrand);
        break;
    default:
        // every other strand value is left as it is
        break;
    }
}

// Splits a Dbtag whose string tag contains colons into several Dbtags, one
// per colon-separated piece.
//
// dbt             tag to split; it keeps the first piece
// out_new_dbtags  receives one copy of dbt per remaining piece, each with
//                 its tag string replaced by that piece
//
// No split is done when the tag is not a string or has no colon, when the
// entry on top of m_SeqEntryInfoStack is flagged as EMBL or DDBJ, or when
// the first piece is one of MGD, MGI, HGNC, RGD or J (compared
// case-insensitively).
void CNewCleanup_imp::x_SplitDbtag( CDbtag &dbt, vector< CRef< CDbtag > > & out_new_dbtags )
{
    // check the common case of nothing to split
    const bool splittable_tag =
        dbt.IsSetTag()  &&  dbt.GetTag().IsStr()  &&
        dbt.GetTag().GetStr().find(":") != string::npos;
    if( ! splittable_tag ) {
        return;
    }

    if ( m_SeqEntryInfoStack.top().m_IsEmblOrDdbj) {
        return;
    }

    // split by colon
    vector<string> pieces;
    NStr::Tokenize( dbt.GetTag().GetStr(), ":", pieces );
    _ASSERT( pieces.size() >= 2 );

    // check if we're trying to split something we shouldn't
    static const char* const kUnsplittablePrefixes[] = {
        "MGD", "MGI", "HGNC", "RGD", "J"
    };
    const size_t kNumUnsplittable =
        sizeof(kUnsplittablePrefixes) / sizeof(kUnsplittablePrefixes[0]);
    for( size_t prefix_idx = 0; prefix_idx < kNumUnsplittable; ++prefix_idx ) {
        if( NStr::EqualNocase( pieces.front(), kUnsplittablePrefixes[prefix_idx] ) ) {
            return;
        }
    }

    // the CDbtag argument itself becomes the first of the resulting tags...
    dbt.SetTag().SetStr( pieces.front() );

    // ...and every further piece gets a clone that differs only in its tag
    for( vector<string>::size_type piece_idx = 1; piece_idx < pieces.size(); ++piece_idx ) {
        CRef<CDbtag> clone( new CDbtag );
        clone->Assign( dbt );
        clone->SetTag().SetStr( pieces[piece_idx] );
        out_new_dbtags.push_back( clone );
    }

    ChangeMade(CCleanupChange::eCleanDbtag);
}

// Ordering predicate for codon values: true when codon1 sorts before codon2.
inline
static
bool s_CodonCompare( const int &codon1, const int &codon2 ) {
    return codon2 > codon1;
}

// Equality predicate for codon values.
inline
static
bool s_CodonEqual( int codon1, int codon2 ) {
    return ! (codon1 != codon2);
}

// Converts the amino acid stored in a tRNA extension to the requested
// coding.
//
// trna_aa      amino acid choice (iupacaa, ncbieaa, ncbi8aa or ncbistdaa)
// coding       coding to convert to
// out_aa_char  optional; if non-NULL it receives the value as stored in
//              trna_aa, before conversion ('\0' for any other variant)
//
// Returns the converted value, or '\0' when the variant is not one of the
// four above or CSeqConvert::Convert() reports that nothing was converted.
static
char s_ConvertTrnaAaToLetter( const CTrna_ext::C_Aa &trna_aa, CSeqUtil::ECoding coding, char *out_aa_char = NULL )
{
    char raw_aa = '\0';
    char converted_aa = '\0';

    // work out the stored value and the coding it is expressed in
    bool convertible = true;
    CSeqUtil::ECoding source_coding = coding; // overwritten whenever convertible
    switch( trna_aa.Which() ) {
    case CTrna_ext::C_Aa::e_Iupacaa:
        raw_aa = trna_aa.GetIupacaa();
        source_coding = CSeqUtil::e_Iupacaa;
        break;
    case CTrna_ext::C_Aa::e_Ncbieaa:
        raw_aa = trna_aa.GetNcbieaa();
        source_coding = CSeqUtil::e_Ncbieaa;
        break;
    case CTrna_ext::C_Aa::e_Ncbi8aa:
        raw_aa = trna_aa.GetNcbi8aa();
        source_coding = CSeqUtil::e_Ncbi8aa;
        break;
    case CTrna_ext::C_Aa::e_Ncbistdaa:
        raw_aa = trna_aa.GetNcbistdaa();
        source_coding = CSeqUtil::e_Ncbistdaa;
        break;
    default:
        convertible = false;
        break;
    }

    int num_converted = 0;
    if( convertible ) {
        num_converted = CSeqConvert::Convert( &raw_aa, source_coding, 0, 1, &converted_aa, coding );
    }

    if( NULL != out_aa_char ) {
        *out_aa_char = raw_aa;
    }
    return ( num_converted > 0 ? converted_aa : '\0' );
}

// Basic cleanup of a tRNA extension, using the feature comment as an extra
// source of information.
//
//   1. An iupacaa amino acid is re-stored as ncbieaa.
//   2. If the feature has a comment, it is parsed with
//      s_ParseSeqFeatTRnaString().  The parsed amino acid is stored when the
//      tRNA has none (or has 'X').  When the comment agrees with the tRNA's
//      amino acid and consisted of nothing but tRNA text, codons parsed from
//      the comment beyond those already present are appended, and the
//      comment is removed unless it is exactly "fMet".
//   3. The codon list is sorted, de-duplicated, and removed if empty.
void CNewCleanup_imp::x_SeqFeatTRNABC( CSeq_feat& feat, CTrna_ext & tRNA )
{
    const string &comment = ( FIELD_IS_SET(feat, Comment) ? GET_FIELD(feat, Comment) : kEmptyStr );

    if( tRNA.IsSetAa() && tRNA.GetAa().IsIupacaa() ) {
        const int iupac_value = tRNA.GetAa().GetIupacaa();
        tRNA.SetAa().SetNcbieaa( iupac_value );
        ChangeMade(CCleanupChange::eChange_tRna);
    }

    if ( FIELD_IS_SET(feat, Comment) ) {
        char current_aa = '\0';
        if( tRNA.IsSetAa() ) {
            current_aa = s_ConvertTrnaAaToLetter( tRNA.GetAa(), CSeqUtil::e_Ncbieaa );
        }

        bool comment_is_only_trna_text = false;
        string comment_codons;
        // set when the comment's amino acid is the one now on the tRNA
        bool comment_matches_aa = false;

        if (current_aa != 'X') {
            const char parsed_aa =
                s_ParseSeqFeatTRnaString ( comment, &comment_is_only_trna_text, comment_codons, true);
            if (current_aa == '\0' && parsed_aa != '\0') {
                // tRNA had no amino acid: adopt the one from the comment
                current_aa = parsed_aa;
                tRNA.SetAa().SetNcbieaa( parsed_aa );
                ChangeMade(CCleanupChange::eChange_tRna);
            }
            comment_matches_aa = (current_aa != '\0' && current_aa == parsed_aa);
        } else {
            // 'X' carries no information: whatever the comment says wins
            current_aa =
                s_ParseSeqFeatTRnaString ( comment, &comment_is_only_trna_text, comment_codons, true);
            if (current_aa != '\0') {
                tRNA.SetAa().SetNcbieaa( current_aa );
                ChangeMade(CCleanupChange::eChange_tRna);
                comment_matches_aa = true;
            }
        }

        if (comment_matches_aa && comment_is_only_trna_text) {
            // append the codons from the comment that lie beyond the ones
            // already stored
            CTrna_ext::TCodon & stored_codons = tRNA.SetCodon();
            if( stored_codons.size() < comment_codons.length() ) {
                copy( comment_codons.begin() + stored_codons.size(), comment_codons.end(),
                      back_inserter(stored_codons) );
                ChangeMade(CCleanupChange::eChange_tRna);
            }
            // the comment is now redundant
            if ( FIELD_IS_SET(feat, Comment) && comment != "fMet" ) {
                RESET_FIELD(feat, Comment);
                ChangeMade(CCleanupChange::eChangeComment);
            }
        }
    }

    if (! CODON_ON_TRNAEXT_IS_SORTED(tRNA, s_CodonCompare)) {
        SORT_CODON_ON_TRNAEXT(tRNA, s_CodonCompare);
        ChangeMade(CCleanupChange::eChange_tRna);
    }

    if( ! CODON_ON_TRNAEXT_IS_UNIQUE(tRNA, s_CodonEqual) ) {
        UNIQUE_CODON_ON_TRNAEXT(tRNA, s_CodonEqual);
        ChangeMade(CCleanupChange::eChange_tRna);
    }

    REMOVE_IF_EMPTY_CODON_ON_TRNAEXT(tRNA);
}

// Splits one PCR primer component (a sequence or name subsource value) into
// its comma-separated items.
//
// out_list   receives the items, each with surrounding spaces removed; it
//            is always cleared first
// component  text to split; NULL or empty yields an empty list
//
// A single pair of parentheses enclosing the whole text is removed before
// splitting, but only if no further '(' occurs inside.
static
void s_ParsePCRComponent(vector<string> &out_list, const string *component)
{
    out_list.clear();

    if( component == NULL  ||  component->empty() ) {
        return;
    }

    string text = *component; // working copy; the caller's string stays intact

    // Remove enclosing parens, if any
    const string::size_type text_len = text.length();
    const bool wrapped_in_parens =
        text_len > 1  &&
        text[0] == '('  &&
        text[text_len - 1] == ')'  &&
        text.find('(', 1) == string::npos;
    if ( wrapped_in_parens ) {
        text = text.substr( 1, text_len - 2 );
    }

    NStr::Tokenize( text, string(","), out_list );
    for( vector<string>::size_type item_idx = 0; item_idx < out_list.size(); ++item_idx ) {
        NStr::TruncateSpacesInPlace( out_list[item_idx] );
    }
}

// One parsed PCR primer set: forward/reverse primer sequence and
// forward/reverse primer name.  Any of the four may be absent, in which case
// it is stored as an empty string.
//
// Each instance also records a sequence number taken from a shared atomic
// counter at construction time; it serves as the final tie-breaker when
// ordering sets, so that otherwise equal sets order by creation.
class CPCRParsedSet {
public:
    // Each argument may be NULL, meaning "not present".
    CPCRParsedSet( 
        const string * fwd_seq,
        const string * rev_seq,
        const string * fwd_name,
        const string * rev_name ) :
    m_Fwd_seq(      fwd_seq  == NULL ? kEmptyStr : *fwd_seq),
        m_Rev_seq(  rev_seq  == NULL ? kEmptyStr : *rev_seq ),
        m_Fwd_name( fwd_name == NULL ? kEmptyStr : *fwd_name ),
        m_Rev_name( rev_name == NULL ? kEmptyStr : *rev_name ),
        m_Original_order( ms_Next_original_order.Add(1) ) { }

    const string &GetFwdSeq() const { return m_Fwd_seq; }
    const string &GetRevSeq() const { return m_Rev_seq; }
    const string &GetFwdName() const { return m_Fwd_name; }
    const string &GetRevName() const { return m_Rev_name; }

    // Strict ordering: case-insensitive comparison of forward sequence,
    // reverse sequence, forward name and reverse name, in that order, with
    // creation order as the last resort.
    //
    // The three-way results of CompareNocase() must be turned into "less
    // than" explicitly; returning them directly would convert any non-zero
    // value (including "greater") to true.
    bool operator <( const CPCRParsedSet &rhs ) const {
        const int fwd_seq_comparison = NStr::CompareNocase( m_Fwd_seq, rhs.m_Fwd_seq );
        if( fwd_seq_comparison != 0 ) return fwd_seq_comparison < 0;
        const int rev_seq_comparison = NStr::CompareNocase( m_Rev_seq, rhs.m_Rev_seq );
        if( rev_seq_comparison != 0 ) return rev_seq_comparison < 0;
        const int fwd_name_comparison = NStr::CompareNocase( m_Fwd_name, rhs.m_Fwd_name );
        if( fwd_name_comparison != 0 ) return fwd_name_comparison < 0;
        const int rev_name_comparison = NStr::CompareNocase( m_Rev_name, rhs.m_Rev_name );
        if( rev_name_comparison != 0 ) return rev_name_comparison < 0;
        // last resort
        return m_Original_order < rhs.m_Original_order;
    }

private:
    string m_Fwd_seq;
    string m_Rev_seq;
    string m_Fwd_name;
    string m_Rev_name;
    int m_Original_order;   // creation sequence number, see ms_Next_original_order

    // source of the per-instance creation sequence numbers
    static CAtomicCounter_WithAutoInit ms_Next_original_order;
};

CAtomicCounter_WithAutoInit CPCRParsedSet::ms_Next_original_order;

static
void s_ParsePCRSet( const CBioSource &biosrc, list<CPCRParsedSet> &out_pcr_set )
{
    out_pcr_set.clear();

    const string* fwd_primer_seq = NULL;
    const string* rev_primer_seq = NULL;
    const string* fwd_primer_name = NULL;
    const string* rev_primer_name = NULL;

// convenience macro
#define PARSEPCRSET_CASE(Subtype) \
            case NCBI_SUBSOURCE(Subtype): \
            if( (*subsrc_iter)->IsSetName() ) { \
                Subtype = &((*subsrc_iter)->GetName()); \
            } \
            break;


    FOR_EACH_SUBSOURCE_ON_BIOSOURCE( subsrc_iter, biosrc ) {
        SWITCH_ON_SUBSOURCE_CHOICE( **subsrc_iter ) {
        PARSEPCRSET_CASE(fwd_primer_seq)
        PARSEPCRSET_CASE(rev_primer_seq)
        PARSEPCRSET_CASE(fwd_primer_name)
        PARSEPCRSET_CASE(rev_primer_name)
        default:
            // ignore
            break;
        }
    }
#undef PARSEPCRSET_CASE

    // ParsePCRStrings 
    vector<string> fwd_seq_list;
    s_ParsePCRComponent(fwd_seq_list, fwd_primer_seq);
    vector<string> rev_seq_list;
    s_ParsePCRComponent(rev_seq_list, rev_primer_seq);
    vector<string> fwd_name_list;
    s_ParsePCRComponent(fwd_name_list, fwd_primer_name);
    vector<string> rev_name_list;
    s_ParsePCRComponent(rev_name_list, rev_primer_name);

    vector<string>::iterator curr_fwd_seq = fwd_seq_list.begin();
    vector<string>::iterator curr_rev_seq = rev_seq_list.begin();
    vector<string>::iterator curr_fwd_name = fwd_name_list.begin();
    vector<string>::iterator curr_rev_name = rev_name_list.begin();

    while (curr_fwd_seq != fwd_seq_list.end() || 
        curr_rev_seq != rev_seq_list.end()    || 
        curr_fwd_name != fwd_name_list.end()  || 
        curr_rev_name != rev_name_list.end() ) 
    {
        const string *fwd_seq = ( curr_fwd_seq != fwd_seq_list.end() ? &*curr_fwd_seq++ : NULL );
        const string *rev_seq = ( curr_rev_seq != rev_seq_list.end() ? &*curr_rev_seq++ : NULL );
        const string *fwd_name = ( curr_fwd_name != fwd_name_list.end() ? &*curr_fwd_name++ : NULL );
        const string *rev_name = ( curr_rev_name != rev_name_list.end() ? &*curr_rev_name++ : NULL );

        out_pcr_set.push_back( CPCRParsedSet(fwd_seq, rev_seq, fwd_name, rev_name) );
    }
}

// Tokenizes str at colons into out_list, trims surrounding spaces from every
// element of out_list and drops the elements that end up empty.
static
void s_ParsePCRColonString( vector<string> &out_list, const string &str ) 
{
    NStr::Tokenize( str, ":", out_list );

    vector<string>::iterator piece = out_list.begin();
    while( piece != out_list.end() ) {
        NStr::TruncateSpacesInPlace( *piece );
        if( piece->empty() ) {
            // erase() hands back the iterator to the following element
            piece = out_list.erase( piece );
        } else {
            ++piece;
        }
    }
}

// Builds a CPCRPrimerSet from one colon-separated sequence string and one
// colon-separated name string.
//
// - One primer is created per sequence; the i-th name (if any) is attached
//   to the i-th sequence.
// - Names left over after the sequences run out are appended, each preceded
//   by ":", to the name of the last sequence's primer.
// - If there are no sequences at all, one name-only primer is created per
//   name.
//
// Returns a null CRef when no primer was produced.
static 
CRef<CPCRPrimerSet> s_ModernizePCRPrimerHalf (const string &seq, const string &name)
{
    vector<string> seqs;
    s_ParsePCRColonString (seqs, seq);
    vector<string> names;
    s_ParsePCRColonString (names, name);

    CRef<CPCRPrimerSet> primer_set( new CPCRPrimerSet );
    list< CRef< CPCRPrimer > > &primers = primer_set->Set();

    // next name that has not yet been assigned to a primer
    vector<string>::const_iterator next_name = names.begin();

    // primer built for the final sequence (stays null if there are no seqs)
    CRef<CPCRPrimer> final_seq_primer;

    for( vector<string>::const_iterator seq_it = seqs.begin();
         seq_it != seqs.end();  ++seq_it )
    {
        CRef<CPCRPrimer> primer( new CPCRPrimer );
        primer->SetSeq().Set( *seq_it );
        if( next_name != names.end() ) {
            primer->SetName().Set( *next_name );
            ++next_name;
        }
        primers.push_back( primer );
        final_seq_primer = primer;
    }

    if( ! final_seq_primer ) {
        // No sequences.  This differs from C, which breaks as soon as it has
        // looked at the first name; here every name gets its own CPCRPrimer.
        while( next_name != names.end() ) {
            CRef<CPCRPrimer> primer( new CPCRPrimer );
            primer->SetName().Set( *next_name );
            primers.push_back( primer );
            ++next_name;
        }
    } else {
        // surplus names are glued onto the name of the last seq's primer
        while( next_name != names.end() ) {
            final_seq_primer->SetName().Set() += ":" + *next_name;
            ++next_name;
        }
    }

    // an empty set is reported as a null ref
    if( primers.empty() ) {
        return CRef<CPCRPrimerSet>();
    }
    return primer_set;
}

// Predicate: true for a null CRef and for any subsource whose subtype is one
// of the old-style PCR primer subtypes; false for everything else.
class CIsBadCRefPCRSubSource {
public:
    bool operator()( const CRef<CSubSource> &subsource ) {
        bool is_bad = true;  // the answer for a null reference

        if( subsource ) {
            SWITCH_ON_SUBSOURCE_CHOICE( *subsource ) {
            case NCBI_SUBSOURCE(fwd_primer_seq):
            case NCBI_SUBSOURCE(rev_primer_seq):
            case NCBI_SUBSOURCE(fwd_primer_name):
            case NCBI_SUBSOURCE(rev_primer_name):
                is_bad = true;
                break;
            default:
                is_bad = false;
                break;
            }
        }

        return is_bad;
    }
};

// Converts the old-style PCR primer subsources on biosrc into entries of
// its Pcr_primers reaction set.  The newly built reactions are placed ahead
// of any reactions that were already present, the combined set is run
// through PCRReactionSetBC, and the old-style primer subsources are then
// removed.  Nothing is changed when no reaction could be built.
void CNewCleanup_imp::x_ModernizePCRPrimers( CBioSource &biosrc )
{
    list<CPCRParsedSet> parsed_sets;
    s_ParsePCRSet( biosrc, parsed_sets );
    if( parsed_sets.empty() ) {
        return;
    }

    CRef<CPCRReactionSet> new_reaction_set( new CPCRReactionSet );
    list< CRef< CPCRReaction > > &new_reactions = new_reaction_set->Set();

    FOR_EACH_PCRPARSEDSET_IN_LIST( parsed_iter, parsed_sets ) {

        CRef<CPCRPrimerSet> forward = s_ModernizePCRPrimerHalf(
            parsed_iter->GetFwdSeq(), parsed_iter->GetFwdName() );
        CRef<CPCRPrimerSet> reverse = s_ModernizePCRPrimerHalf(
            parsed_iter->GetRevSeq(), parsed_iter->GetRevName() );

        // a reaction needs at least one of its two halves
        if( ! forward && ! reverse ) {
            continue;
        }

        CRef<CPCRReaction> reaction( new CPCRReaction );
        if( forward ) {
            SET_FIELD( *reaction, Forward, *forward );
        }
        if( reverse ) {
            SET_FIELD( *reaction, Reverse, *reverse );
        }
        new_reactions.push_back( reaction );
    }

    // leave biosrc untouched if nothing was produced
    if( new_reactions.empty() ) {
        return;
    }

    // append the reactions that biosrc already had (if any) after ours ...
    list< CRef< CPCRReaction > > &old_reactions =
        GET_MUTABLE(biosrc, Pcr_primers).Set();
    new_reactions.insert( new_reactions.end(),
        old_reactions.begin(), old_reactions.end() );
    // ... and make the merged set the one stored on biosrc
    SET_FIELD( biosrc, Pcr_primers, *new_reaction_set );
    ChangeMade(CCleanupChange::eChangePCRPrimers);

    PCRReactionSetBC( GET_MUTABLE(biosrc, Pcr_primers) );

    // drop the old-style PCR primer subsources ( fwd_primer_seq, etc. )
    if( ! FIELD_IS_SET(biosrc, Subtype) ) {
        return;
    }

    list< CRef< CSubSource > > &subsources = GET_MUTABLE(biosrc, Subtype);
    list< CRef< CSubSource > >::iterator new_end =
        remove_if( subsources.begin(), subsources.end(), CIsBadCRefPCRSubSource() );
    if( new_end != subsources.end() ) {
        subsources.erase( new_end, subsources.end() );
        ChangeMade(CCleanupChange::eChangeSubsource);
    }

    REMOVE_IF_EMPTY_SUBSOURCE_ON_BIOSOURCE(biosrc);
}

// Splits str at "usable" tildes and appends the space-trimmed, non-empty
// pieces to piece_vec.  A tilde (or the end of the string) is usable as a
// split point when it is not directly preceded by a space and not directly
// followed by another tilde.  Does nothing for an empty str.
static
void s_SplitAtSingleTildes( list<string> &piece_vec, const string &str )
{
    if( str.empty() ) {
        return;
    }

    const string::size_type str_len = str.length();

    // piece_begin is where the piece being assembled starts; scan_pos is
    // where the search for the next tilde resumes
    // ( invariant: scan_pos >= piece_begin )
    string::size_type piece_begin = 0;
    string::size_type scan_pos = 0;

    while( scan_pos < str_len ) {
        // locate the next tilde, treating end-of-string as one
        string::size_type tilde_pos = str.find( '~', scan_pos );
        if( tilde_pos == string::npos ) {
            tilde_pos = str_len;
        }

        // decide whether this tilde may serve as a split point
        bool can_split = true;
        if( tilde_pos != 0 && str[tilde_pos-1] == ' ' ) {
            can_split = false;
        } else if( tilde_pos < (str_len-1) && str[tilde_pos+1] == '~' ) {
            can_split = false;
        }

        // step over the whole run of tildes, usable or not
        scan_pos = tilde_pos;
        while( scan_pos < str_len && str[scan_pos] == '~' ) {
            ++scan_pos;
        }

        if( can_split ) {
            // emit the piece that ends at the tilde, unless it trims to nothing
            piece_vec.push_back( str.substr(piece_begin, tilde_pos - piece_begin) );
            NStr::TruncateSpacesInPlace( piece_vec.back() );
            if( piece_vec.back().empty() ) {
                piece_vec.pop_back();
            }
            // the next piece starts after the tilde run
            piece_begin = scan_pos;
        }
    }

    // whatever is left after the last split point is the final piece
    piece_vec.push_back( str.substr(piece_begin) );
    NStr::TruncateSpacesInPlace( piece_vec.back() );
    if( piece_vec.back().empty() ) {
        piece_vec.pop_back();
    }
}

// subtype -> set of values already present under that subtype
typedef map< TORGMOD_SUBTYPE, set<string> >    TExistingOrgModMap;
typedef map< TSUBSOURCE_SUBTYPE, set<string> > TExistingSubsourceMap;

// Splits subname at single tildes, discards every piece that carries an
// orgmod or subsource prefix and whose value is already recorded under that
// subtype in the given maps, and joins the surviving pieces back together
// with "~".
// Returns true if subname was changed.
static
bool s_CleanupOrgModAndSubSourceOther_helper(
    string &subname, 
    const TExistingOrgModMap &existingOrgModMap, 
    const TExistingSubsourceMap &existingSubsourceMap )
{
    list<string> pieces;
    s_SplitAtSingleTildes( pieces, subname );

    if( pieces.empty() ) {
        // nothing usable inside: the result is an empty subname
        const bool had_text = ! subname.empty();
        subname.clear();
        return had_text;
    }

    // weed out the pieces that duplicate an existing orgmod/subsource
    for( list<string>::iterator it = pieces.begin(); it != pieces.end(); ) {
        string &piece = (*it);
        bool is_duplicate = false;

        string::size_type val_start_pos = 0;
        TORGMOD_SUBTYPE orgmod_subtype = NCBI_ORGMOD(other);
        TSUBSOURCE_SUBTYPE subsrc_subtype = NCBI_SUBSOURCE(other);

        if( s_StringHasOrgModPrefix(piece, val_start_pos, orgmod_subtype) ) {
            const string val = piece.substr(val_start_pos);

            TExistingOrgModMap::const_iterator known =
                existingOrgModMap.find(orgmod_subtype);
            is_duplicate = ( known != existingOrgModMap.end() &&
                known->second.count(val) > 0 );
        } else if( s_StringHasSubSourcePrefix(piece, val_start_pos, subsrc_subtype) ) {
            const string val = piece.substr(val_start_pos);

            TExistingSubsourceMap::const_iterator known =
                existingSubsourceMap.find(subsrc_subtype);
            is_duplicate = ( known != existingSubsourceMap.end() &&
                known->second.count(val) > 0 );
        }

        if( is_duplicate ) {
            it = pieces.erase(it);
        } else {
            ++it;
        }
    }

    string rejoined = NStr::Join( pieces, "~" );
    if( subname == rejoined ) {
        return false;
    }

    // swap is faster than assignment
    subname.swap( rejoined );
    return true;
}

// Removes, from every orgmod and subsource of type "other", the
// tilde-separated pieces that merely repeat a value already held by a
// non-"other" orgmod or subsource.  An "other" entry whose text becomes
// empty is erased altogether.
void CNewCleanup_imp::x_CleanupOrgModAndSubSourceOther( COrgName &orgname, CBioSource &biosrc )
{
    // Index the existing non-"other" values by subtype up front
    // ( more efficient than C's quadratic loop-in-a-loop for bigger cases )

    TExistingOrgModMap existingOrgModMap;
    FOR_EACH_ORGMOD_ON_ORGNAME( orgmod_iter, orgname ) {
        const COrgMod &known_mod = **orgmod_iter;
        if( ! FIELD_IS_SET(known_mod, Subtype) ||
            GET_FIELD(known_mod, Subtype) == NCBI_ORGMOD(other) )
        {
            continue;
        }
        const string &val = GET_STRING_FLD_OR_BLANK(known_mod, Subname);
        existingOrgModMap[GET_FIELD(known_mod, Subtype)].insert( val );
    }

    TExistingSubsourceMap existingSubsourceMap;
    EDIT_EACH_SUBSOURCE_ON_BIOSOURCE( subsrc_iter, biosrc ) {
        const CSubSource &known_subsrc = **subsrc_iter;
        if( ! FIELD_IS_SET(known_subsrc, Subtype) ||
            GET_FIELD(known_subsrc, Subtype) == NCBI_SUBSOURCE(other) )
        {
            continue;
        }
        const string &val = GET_STRING_FLD_OR_BLANK(known_subsrc, Name);
        existingSubsourceMap[GET_FIELD(known_subsrc, Subtype)].insert( val );
    }

    // orgmods of type "other"

    EDIT_EACH_ORGMOD_ON_ORGNAME( orgmod_iter, orgname ) {
        COrgMod &other_mod = **orgmod_iter;

        // only "other" entries that actually have a subname are cleaned
        const bool is_cleanable =
            FIELD_EQUALS(other_mod, Subtype, NCBI_ORGMOD(other) ) &&
            FIELD_IS_SET(other_mod, Subname);
        if( is_cleanable ) {
            string &subname = GET_MUTABLE( other_mod, Subname );
            const bool changed = s_CleanupOrgModAndSubSourceOther_helper(
                subname, existingOrgModMap, existingSubsourceMap );
            if( changed ) {
                ChangeMade(CCleanupChange::eChangeOrgmod);
            }

            if( subname.empty() ) {
                ERASE_ORGMOD_ON_ORGNAME(orgmod_iter, orgname);
                ChangeMade(CCleanupChange::eRemoveOrgmod);
            }
        }
    }

    // subsources of type "other"

    EDIT_EACH_SUBSOURCE_ON_BIOSOURCE( subsrc_iter, biosrc ) {
        CSubSource &other_subsrc = **subsrc_iter;

        // only "other" entries that actually have a name are cleaned
        const bool is_cleanable =
            FIELD_EQUALS(other_subsrc, Subtype, NCBI_SUBSOURCE(other) ) &&
            FIELD_IS_SET(other_subsrc, Name);
        if( is_cleanable ) {
            string &name = GET_MUTABLE( other_subsrc, Name );
            const bool changed = s_CleanupOrgModAndSubSourceOther_helper(
                name, existingOrgModMap, existingSubsourceMap );
            if( changed ) {
                ChangeMade(CCleanupChange::eChangeSubsource);
            }

            if( name.empty() ) {
                ERASE_SUBSOURCE_ON_BIOSOURCE(subsrc_iter, biosrc);
                ChangeMade(CCleanupChange::eRemoveSubSource);
            }
        }
    }
}

// Cleanup of the OrgMod list on an OrgName.  Does nothing if Mod is not set.
//
// First pass: runs CLEAN_AND_COMPRESS_STRING_MEMBER on the Subname and
// Attrib of each OrgMod, then erases an entry when
//   - its subname is empty, or
//   - no earlier entry has been kept yet and its subname is just "(" or ")",
//     or
//   - it is redundant with the most recently kept entry ("prev"), see the
//     comments inside the loop.
// Second pass: erases an "other" OrgMod whose subname has the form
// "anamorph: <value>" when <value> equals the subname of an anamorph OrgMod
// (or of a gb_anamorph OrgMod when there is no anamorph OrgMod).
//
// org_ref_common is only compared against "common" OrgMods; the removal that
// used to follow from a match is currently commented out.
void
CNewCleanup_imp::x_OrgnameModBC( COrgName &orgname, const string &org_ref_common )
{
    if( ! FIELD_IS_SET(orgname, Mod) ) {
        return;
    }

    // most recent OrgMod that was kept (not erased); NULL until one is kept
    COrgMod *prev = NULL;

    EDIT_EACH_ORGMOD_ON_ORGNAME( orgmod_iter, orgname ) {
        COrgMod &orgmod = **orgmod_iter;

        // set to true when the current OrgMod should be erased
        bool unlink = false;

        CLEAN_AND_COMPRESS_STRING_MEMBER(orgmod, Subname);
        CLEAN_AND_COMPRESS_STRING_MEMBER(orgmod, Attrib);

        const TORGMOD_SUBTYPE subtype = GET_FIELD(orgmod, Subtype);
        const string &subname = GET_FIELD(orgmod, Subname);

        if ( (subtype == NCBI_ORGMOD(common)) && 
            NStr::EqualNocase(subname, org_ref_common) )
        {
            // Matching "common" OrgMods are deliberately left alone, and
            // because this branch is taken none of the checks below apply.
            //
            // if you find this code commented out for a long, long time, you can probably
            // just remove it.  (originally commented-out under JIRA SQD-816)
            //// unlink = true;
        } else if( prev != NULL ) {
            const TORGMOD_SUBTYPE prev_subtype = GET_FIELD(*prev, Subtype);
            const string &prev_subname = GET_FIELD(*prev, Subname);

            if( subname.empty() ) {
                unlink = true;
            } else if ( (prev_subtype == subtype &&
                NStr::EqualNocase(prev_subname, subname)) ||
                (prev_subtype == subtype &&
                prev_subtype ==  NCBI_ORGMOD(other) &&
                NStr::Find(prev_subname, subname) != NPOS )) 
            {
                // same subtype and either the same subname (ignoring case),
                // or both are "other" and prev's subname already contains
                // this one: the current entry adds nothing
                unlink = true;
            } else if (prev_subtype == subtype &&
                prev_subtype == NCBI_ORGMOD(other) &&
                NStr::Find (subname, prev_subname) != NPOS ) 
            {
                // both are "other" and the current subname contains prev's:
                // prev takes over the current entry's contents and the
                // current entry is erased
                prev->Assign( orgmod );
                unlink = true;
            }
        } else if ( subname.empty() ||
            subname == ")"  ||
            subname == "(" )
        {
            // only reached while nothing has been kept yet (prev == NULL)
            unlink = true;
        }

        if (unlink) {
            ERASE_ORGMOD_ON_ORGNAME(orgmod_iter, orgname);
            ChangeMade(CCleanupChange::eRemoveOrgmod);
        } else {
            prev = &**orgmod_iter;
        }
    }

    // Second pass: remember one OrgMod of each of these three subtypes.
    // Later matches overwrite earlier ones, so each pointer ends up at the
    // last OrgMod of its subtype.
    COrgMod *omp_anamorph = NULL;
    COrgMod *omp_gb_anamorph = NULL;
    COrgMod *omp_other = NULL;

    EDIT_EACH_ORGMOD_ON_ORGNAME( orgmod_iter, orgname ) {
        const TORGMOD_SUBTYPE subtype = GET_FIELD(**orgmod_iter, Subtype);
        switch( subtype ) {
        case NCBI_ORGMOD(anamorph):
            omp_anamorph = &**orgmod_iter;
            break;
        case NCBI_ORGMOD(gb_anamorph):
            omp_gb_anamorph = &**orgmod_iter;
            break;
        case NCBI_ORGMOD(other):
            omp_other = &**orgmod_iter;
            break;
        }
    }

    // true when omp_other only repeats an anamorph value
    bool redund = false;

    static const string kAnamorph = "anamorph:";
    if ( (omp_other != NULL) && NStr::StartsWith(GET_FIELD(*omp_other, Subname), kAnamorph, NStr::eNocase) ) {

        // This part is just to set anamorph_value to the part of the subname
        // after "anamorph:" and spaces.
        const SIZE_TYPE after_anamorph_pos = kAnamorph.length();
        SIZE_TYPE after_anamorph_pos_and_spaces = 
            GET_FIELD(*omp_other, Subname).find_first_not_of(" ", after_anamorph_pos);
        if( after_anamorph_pos_and_spaces == NPOS ) {
            // nothing but spaces (or nothing at all) follows the prefix
            after_anamorph_pos_and_spaces = after_anamorph_pos;
        }
        const string anamorph_value = GET_FIELD(*omp_other, Subname).substr(after_anamorph_pos_and_spaces);

        // gb_anamorph is only consulted when there is no anamorph OrgMod;
        // both comparisons are case-sensitive
        if (omp_anamorph != NULL) {
            if ( GET_FIELD(*omp_anamorph, Subname) == anamorph_value ) {
                redund = true;
            }
        } else if (omp_gb_anamorph != NULL) {
            if ( GET_FIELD(*omp_gb_anamorph, Subname) == anamorph_value ) {
                redund = true;
            }
        }
    }
    if(redund) {
        // remove omp_other
        EDIT_EACH_ORGMOD_ON_ORGNAME( orgmod_iter, orgname ) {
            if( &**orgmod_iter == omp_other ) {
                ERASE_ORGMOD_ON_ORGNAME(orgmod_iter, orgname);
                ChangeMade(CCleanupChange::eRemoveOrgmod);
                break;
            }
        }
    }
}

// Reconciles MolInfo.biomol with Seq-inst.mol:
//  - a biomol of "unknown" is reset (and nothing else is done);
//  - if the bioseq has an Inst whose mol is unset / not_set, mol is derived
//    from biomol where a mapping exists;
//  - if mol is set to something other than rna but biomol is cRNA or mRNA,
//    mol is forced to rna.
void CNewCleanup_imp::x_FixUnsetMolFromBiomol( CMolInfo& molinfo, CBioseq &bioseq )
{
    if( ! FIELD_IS_SET(molinfo, Biomol) ) {
        return;
    }

    const TMOLINFO_BIOMOL biomol = GET_FIELD(molinfo, Biomol);
    if( biomol == NCBI_BIOMOL(unknown) ) {
        RESET_FIELD( molinfo, Biomol );
        ChangeMade(CCleanupChange::eChangeMolInfo);
        return;
    }

    if( ! FIELD_IS_SET(bioseq, Inst) ) {
        return;
    }

    // an unset mol is treated the same as an explicit not_set
    const TSEQ_MOL mol = ( FIELD_IS_SET(bioseq.GetInst(), Mol) ?
        GET_FIELD(bioseq.GetInst(), Mol) :
        NCBI_SEQMOL(not_set) );

    if( mol != NCBI_SEQMOL(not_set) ) {
        // mol is already known; only correct it for cRNA / mRNA
        const bool biomol_requires_rna =
            ( biomol == NCBI_BIOMOL(cRNA) || biomol == NCBI_BIOMOL(mRNA) );
        if( mol != NCBI_SEQMOL(rna) && biomol_requires_rna ) {
            SET_FIELD( bioseq.SetInst(), Mol, NCBI_SEQMOL(rna) );
            ChangeMade(CCleanupChange::eChangeBiomol);
        }
        return;
    }

    // mol is not set: derive it from biomol where possible
    switch( biomol ) {
    case NCBI_BIOMOL(genomic):
    case NCBI_BIOMOL(genomic_mRNA):
        SET_FIELD( bioseq.SetInst(), Mol, NCBI_SEQMOL(na) );
        ChangeMade(CCleanupChange::eChangeBiomol);
        break;
    case NCBI_BIOMOL(pre_RNA):
    case NCBI_BIOMOL(mRNA):
    case NCBI_BIOMOL(rRNA):
    case NCBI_BIOMOL(tRNA):
    case NCBI_BIOMOL(snRNA):
    case NCBI_BIOMOL(scRNA):
    case NCBI_BIOMOL(cRNA):
    case NCBI_BIOMOL(snoRNA):
    case NCBI_BIOMOL(transcribed_RNA):
    case NCBI_BIOMOL(ncRNA):
    case NCBI_BIOMOL(tmRNA):
        SET_FIELD( bioseq.SetInst(), Mol, NCBI_SEQMOL(rna) );
        ChangeMade(CCleanupChange::eChangeBiomol);
        break;
    case NCBI_BIOMOL(peptide):
        SET_FIELD( bioseq.SetInst(), Mol, NCBI_SEQMOL(aa) );
        ChangeMade(CCleanupChange::eChangeBiomol);
        break;
    case NCBI_BIOMOL(other_genetic):
        SET_FIELD( bioseq.SetInst(), Mol, NCBI_SEQMOL(other) );
        ChangeMade(CCleanupChange::eChangeBiomol);
        break;
    default:
        // no mapping for this biomol; leave mol alone
        break;
    }
}

// Looks for " [" + sOrganism + "]" (case-insensitively) in sTitle and
// returns the position at which it starts, or NPOS.
//  - If the title ends with that pattern, the position of that ending is
//    returned.
//  - Otherwise the last occurrence anywhere in the title is returned.
//  - A match at position 0 is rejected (NPOS is returned): the title must
//    have something before the pattern.
//
// If out_piOrganellePos is non-NULL, it is set to NPOS and then, when the
// text preceding the returned position (the whole title if NPOS is
// returned) ends with " (chloroplast)" or " (mitochondrion)", to the
// position at which that organelle text starts.
static SIZE_TYPE s_TitleEndsInOrganism ( 
    const string & sTitle, 
    const string & sOrganism,
    SIZE_TYPE * out_piOrganellePos )
{
    if( out_piOrganellePos ) {
        *out_piOrganellePos = NPOS;
    }

    SIZE_TYPE answer = NPOS;

    const string sPattern = " [" + sOrganism + "]";
    if( NStr::EndsWith(sTitle, sPattern, NStr::eNocase) ) {
        answer = sTitle.length() - sPattern.length();
        if( answer < 1 ) {
            // title must have something before the pattern
            answer = NPOS;
        }
    } else {
        answer = NStr::FindNoCase(sTitle, sPattern, 0, NPOS, NStr::eLast);
        if (answer < 1 || answer == NPOS) {
            // pattern not found
            answer = NPOS;
        }
    }

    // find organelle prefix
    static const string kOrganellePrefixes[] = {
        " (chloroplast)",
        " (mitochondrion)"
    };
    if( out_piOrganellePos ) {

        // length of the text that precedes the organism pattern
        // (the whole title when the pattern was not found)
        const SIZE_TYPE before_organism_len =
            ( answer == NPOS ? sTitle.length() : answer );

        static const unsigned int kOrganellePrefixes_len = 
            (sizeof(kOrganellePrefixes)/sizeof(kOrganellePrefixes[0]));
        for( unsigned int ii = 0; ii < kOrganellePrefixes_len; ++ii ) {
            const string & organelle_prefix = kOrganellePrefixes[ii];

            if ( NStr::EndsWith(CTempString(sTitle, 0, answer), organelle_prefix) ) {
                // Report the occurrence that was just matched, i.e. the one
                // sitting directly before the organism.  Searching the title
                // from the front instead would report an earlier occurrence
                // when the organelle text appears more than once.
                *out_piOrganellePos =
                    before_organism_len - organelle_prefix.length();
                break;
            }
        }
    }

    return answer;
}

// Rewrites the tail of a protein bioseq's title so that it reads
//     <text>[, partial][ (<organelle>)] [<taxname>]
// in agreement with the first MolInfo completeness and the first BioSource
// found among the bioseq's own descriptors and those of its parent sets.
//
// Nothing is done when the bioseq is not a protein, has a Swissprot id, has
// no (non-empty) title, or when the title does not contain the bracketed
// old-name / taxname as located by s_TitleEndsInOrganism.
void CNewCleanup_imp::x_AddPartialToProteinTitle( CBioseq &bioseq )
{
    // Bail if not protein
    if( ! FIELD_CHAIN_OF_2_IS_SET(bioseq, Inst, Mol) || 
        bioseq.GetInst().GetMol() != NCBI_SEQMOL(aa) ) 
    {
        return;
    }
 
    // Bail if record is swissprot
    FOR_EACH_SEQID_ON_BIOSEQ (seqid_itr, bioseq) {
        const CSeq_id& seqid = **seqid_itr;
        if( FIELD_IS(seqid, Swissprot) ) {
            return;
        }
    }

    // Indexed by the BioSource genome value (see the lookup below);
    // NULL means no organelle text is added for that genome.
    static const char *kProteinOrganellePrefixes[] = {
        NULL, // unknown
        NULL, // genomic
        "chloroplast", // chloroplast
        NULL, // chromoplast
        NULL, // kinetoplast
        "mitochondrion", // mitochondrion
        NULL, // plastid
        NULL, // macronuclear
        NULL, // extrachrom
        NULL, // plasmid
        NULL, // transposon
        NULL, // insertion-seq
        NULL, // cyanelle
        NULL, // proviral
        NULL, // virion
        NULL, // nucleomorph
        NULL, // apicoplast
        NULL, // leucoplast
        NULL, // proplastid
        NULL, // endogenous-virus
        NULL, // hydrogenosome
        NULL, // chromosome
        NULL // chromatophore
    };

    // gather some info from the Seqdesc's on the bioseq, into
    // the following variables
    bool bPartial = false;
    string sTaxname;
    string sOldName;
    string *psTitle = NULL;
    const char *organelle = NULL;

    // iterate for title
    // (if there are several title descriptors, the last one wins)
    EDIT_EACH_SEQDESC_ON_BIOSEQ(descr_iter, bioseq) {
        CSeqdesc &descr = **descr_iter;
        if( descr.IsTitle() ) {
            psTitle = & GET_MUTABLE(descr, Title);
        }
    }
    // bail if no title
    if( (NULL == psTitle) || psTitle->empty() ) {
        return;
    }

    // iterate Seqdescs from bottom to top
    // accumulate seqdescs into here
    typedef vector< CConstRef<CSeqdesc> > TSeqdescVec;
    TSeqdescVec vecSeqdesc;
    {
        // the bioseq's own descriptors come first ...
        FOR_EACH_SEQDESC_ON_BIOSEQ(descr_iter, bioseq) {
            vecSeqdesc.push_back( CConstRef<CSeqdesc>( &**descr_iter ) );
        }
        // climb up to get parent Seqdescs
        CConstRef<CBioseq_set> bioseq_set( bioseq.GetParentSet() );
        for( ; bioseq_set; bioseq_set = bioseq_set->GetParentSet() ) {
            FOR_EACH_SEQDESC_ON_SEQSET(descr_iter, *bioseq_set) {
                vecSeqdesc.push_back( CConstRef<CSeqdesc>( &**descr_iter ) );
            }
        }
    }

    // bPartial comes from the first MolInfo that has Completeness set
    ITERATE(TSeqdescVec, descr_iter, vecSeqdesc) {
        const CSeqdesc &descr = **descr_iter;
        if( descr.IsMolinfo() && FIELD_IS_SET(descr.GetMolinfo(), Completeness) ) {
            switch( GET_FIELD(descr.GetMolinfo(), Completeness) ) {
                case NCBI_COMPLETENESS(partial):
                case NCBI_COMPLETENESS(no_left):
                case NCBI_COMPLETENESS(no_right):
                case NCBI_COMPLETENESS(no_ends):
                    bPartial = true;
                    break;
                default:
                    break;
            }
            // stop at first molinfo
            break; 
        }
    }

    // organelle, sTaxname and sOldName come from the first BioSource
    ITERATE(TSeqdescVec, descr_iter, vecSeqdesc) {
        const CSeqdesc &descr = **descr_iter;
        if( descr.IsSource() ) {
            const TBIOSOURCE_GENOME genome = ( descr.GetSource().CanGetGenome() ?
                descr.GetSource().GetGenome() :
                NCBI_GENOME(unknown) );
            // range check keeps the index inside kProteinOrganellePrefixes
            if (genome >= NCBI_GENOME(chloroplast) &&
                genome <= NCBI_GENOME(chromatophore) ) 
            {
                organelle = kProteinOrganellePrefixes[genome];
            }

            if( FIELD_IS_SET(descr.GetSource(), Org) ) {
                const COrg_ref & org = GET_FIELD(descr.GetSource(), Org);
                if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(org, Taxname) ) {
                    sTaxname = GET_FIELD(org, Taxname);
                }
                // no organelle text if the taxname itself starts with it
                // NOTE(review): organelle may be NULL here; presumably
                // StartsWith tolerates a NULL C string -- confirm
                if ( NStr::StartsWith(sTaxname, organelle, NStr::eNocase) ) {
                    organelle = NULL;
                }
                // if there are several old_name OrgMods, the last one wins
                FOR_EACH_ORGMOD_ON_ORGREF(mod_iter, org) {
                    const COrgMod & orgmod = **mod_iter;
                    if( FIELD_EQUALS(orgmod, Subtype, NCBI_ORGMOD(old_name) ) ) {
                        sOldName = GET_FIELD(orgmod, Subname);
                    }
                }
            }
            // stop at first source
            break;
        }
    }

    // put title into a reference, 
    // just because it's more convenient than a pointer
    string & sTitle = *psTitle;
    // remember original so we can see if we changed it
    const string sOriginalTitle = sTitle;

    // search for partial, must be just before bracketed organism
    SIZE_TYPE partialPos = NStr::Find(sTitle, ", partial [");
    if( partialPos == NPOS ) {
        partialPos = NStr::Find(sTitle, ", partial (");
    }

    // find oldname or taxname in brackets at end of protein title
    // penult receives the organelle position reported by s_TitleEndsInOrganism
    SIZE_TYPE penult = NPOS;
    SIZE_TYPE suffixPos = NPOS; // will point to " [${organism name}]" at end
    if ( ! sOldName.empty() && ! sTaxname.empty() ) {
        suffixPos = s_TitleEndsInOrganism (sTitle, sOldName, &penult);
    }
    if ( suffixPos == NPOS && ! sTaxname.empty() ) {
        suffixPos = s_TitleEndsInOrganism (sTitle, sTaxname, &penult);
        if (suffixPos != NPOS) {
            // The first three branches are intentionally empty: in those
            // cases execution continues to the rewrite below.
            if (organelle == NULL && penult != NPOS) {
                // title has organelle text but none is wanted
            } else if (organelle != NULL && penult == NPOS) {
                // organelle text is wanted but the title has none
            } else if ( penult != NPOS && sTitle.substr(penult) == organelle ) {
                // NOTE(review): substr(penult) runs to the end of the title,
                // which still includes the bracketed organism here, so this
                // looks like it can rarely (if ever) be true -- confirm intent
            } else {
                // bail if no need to change partial text or [organism name]
                if ( bPartial && partialPos != NPOS) {
                    return;
                } else if( ! bPartial && partialPos == NPOS ){
                    return;
                }
            }
        }
    }
    // do not change unless [genus species] was at the end
    if (suffixPos == NPOS) {
        return;
    }

    // truncate bracketed info from end of title, will replace with current taxname
    sTitle.resize( suffixPos );
    if (penult != NPOS) {
        // also cut off the organelle text
        sTitle.resize(penult);
    }

    // if ", partial [" was indeed just before the [genus species], it will now be ", partial"
    // Note: 9 is length of ", partial"
    if ( !bPartial  &&
         partialPos != string::npos  &&
         (partialPos == (sTitle.length() - 9)) ) 
    {
        sTitle.resize( partialPos );
    }
    NStr::TruncateSpacesInPlace( sTitle );

    // rebuild the tail: ", partial", then " (organelle)", then " [taxname]"
    if( bPartial && partialPos == NPOS ) {
        sTitle += ", partial";
    }
    if (organelle != NULL) {
        sTitle += " (" + string(organelle) + ")";
    }
    if ( ! sTaxname.empty() ) {
        sTitle += " [" + sTaxname + "]";
    }

    if( sTitle != sOriginalTitle ) {
        ChangeMade(CCleanupChange::eCleanBioseqTitle);
    }
}

// Pulls a leading satellite type ("microsatellite", "minisatellite" or
// "satellite") off the front of comment.
//
// - If comment is exactly the satellite type, comment is cleared and the
//   type is returned.
// - If the type is directly followed by ';', the type and the ';' are
//   removed from comment, the remainder is trimmed, and the type is
//   returned.
// - Otherwise an empty string is returned and comment is left as it was,
//   apart from the leading-tilde cleanup below.
//
// In every case except the first, a single leading '~' (one not followed by
// another '~') is removed from whatever is left in comment.
//
// returns empty string if there's a problem
string CNewCleanup_imp::x_ExtractSatelliteFromComment( string &comment )
{
    if( comment.empty() ) {
        return kEmptyStr;
    }

    string satellite_type;
    if ( NStr::StartsWith(comment, "microsatellite") ) { 
        satellite_type = "microsatellite";
    } else if ( NStr::StartsWith (comment, "minisatellite") ) {
        satellite_type = "minisatellite";
    } else if ( NStr::StartsWith (comment, "satellite") ) {
        satellite_type = "satellite";
    } else {
        return kEmptyStr;
    }

    string satellite_qual; // the answer
    if ( comment.length() == satellite_type.length() ) {
        comment.clear();
        ChangeMade(CCleanupChange::eRemoveComment);
        return satellite_type;
    } else if (comment[satellite_type.length()] == ';') {
        satellite_qual = satellite_type;
        comment = comment.substr( satellite_type.length() + 1 );
        NStr::TruncateSpacesInPlace(comment);
        ChangeMade(CCleanupChange::eChangeComment);
    }

    // comment may be empty or a single character by now, so check the
    // length before looking at comment[0] / comment[1]
    if ( ! comment.empty()  &&  comment[0] == '~'  &&
         ( comment.length() == 1 || comment[1] != '~' ) )
    {
        comment [0] = ' ';
        NStr::TruncateSpacesInPlace(comment);
        ChangeMade(CCleanupChange::eChangeComment);
    }

    return satellite_qual;
}

// like C's function GetFrameFromLoc
static
int s_SetFrameFromLoc_Helper( const CSeq_loc &location, CRef<CScope> scope )
{
    const static int kCantFindFrame = 0;
    const static int kFrameOne      = 1;

    CSeq_loc_CI loc_ci( location, CSeq_loc_CI::eEmpty_Allow, CSeq_loc_CI::eOrder_Biological );

    // look at first part of the location
    CConstRef<CSeq_loc> first_loc = loc_ci.GetRangeAsSeq_loc();
    _ASSERT(first_loc);
    switch ( first_loc->Which() )
    {
    case NCBI_SEQLOC(Int):
        {
            const CSeq_interval & interval = first_loc->GetInt();
            if ( FIELD_EQUALS(interval, Strand, eNa_strand_minus) )
            {
                if( ! interval.IsSetFuzz_to() ) {
                    return kFrameOne;
                }
            }
            else if ( ! interval.IsSetFuzz_from() ) {
                return kFrameOne;
            }
        }
        break;
    case NCBI_SEQLOC(Pnt):
        {
            const CSeq_point & pnt = first_loc->GetPnt();
            if ( ! pnt.IsSetFuzz() ) {
                return kFrameOne;
            }
        }
        break;
    default:
        return kCantFindFrame;
    }

    // check the last part of the location
    CSeq_loc_CI last_ci = loc_ci;
    for( ; loc_ci ; ++loc_ci ) {
        last_ci = loc_ci;
    }

    CConstRef<CSeq_loc> last_loc = last_ci.GetRangeAsSeq_loc();
    _ASSERT(last_loc);
    switch ( last_loc->Which() )
    {
    case NCBI_SEQLOC(Int):
        {
            const CSeq_interval & interval = last_loc->GetInt();
            if ( FIELD_EQUALS(interval, Strand, eNa_strand_minus) )
            {
                if( interval.IsSetFuzz_from() ) {
                    return kCantFindFrame;
                }
            }
            else if ( interval.IsSetFuzz_to() )
                return kCantFindFrame;
        }
        break;
    case NCBI_SEQLOC(Pnt):
        {
            const CSeq_point & pnt = last_loc->GetPnt();
            if ( pnt.IsSetFuzz() )
                return kCantFindFrame;
        }
        break;
    default:
        return kCantFindFrame;
    }

    // have complete last codon, get frame 
    // from length
    return (1 + (sequence::GetLength(location, &*scope) % 3) );
}

// Sets (or resets) the frame of a coding region from its location, using
// s_SetFrameFromLoc_Helper.  A helper result of 0 clears the frame if it is
// set; 1, 2 or 3 sets the corresponding frame if it differs from the
// current one.  ChangeMade is called only when the cdregion is modified.
void CNewCleanup_imp::x_SetFrameFromLoc( CCdregion &cdregion, const CSeq_loc &location )
{
    // TODO: Farther below is a simpler way to do this if we want to use
    // C++ functions, but since we want to match C for now, we use
    // somewhat more complex code that does what C does.

    const int frame_number = s_SetFrameFromLoc_Helper(location, m_Scope);

    // translate the helper's integer result into the frame enum
    TCDSFRAME_TYPE new_frame = NCBI_CDSFRAME(not_set);
    if( frame_number == 1 ) {
        new_frame = NCBI_CDSFRAME(one);
    } else if( frame_number == 2 ) {
        new_frame = NCBI_CDSFRAME(two);
    } else if( frame_number == 3 ) {
        new_frame = NCBI_CDSFRAME(three);
    } else if( frame_number != 0 ) {
        // s_SetFrameFromLoc_Helper should only return 0, 1, 2 or 3
        _ASSERT(false);
        return;
    }

    if( new_frame != NCBI_CDSFRAME(not_set) ) {
        // a concrete frame was suggested: apply it if it differs
        if( ! FIELD_EQUALS(cdregion, Frame, new_frame) ) {
            SET_FIELD(cdregion, Frame, new_frame);
            ChangeMade(CCleanupChange::eChangeCdregion);
        }
    } else if( FIELD_IS_SET(cdregion, Frame) ) {
        // no frame could be determined: remove any existing one
        RESET_FIELD(cdregion, Frame);
        ChangeMade(CCleanupChange::eChangeCdregion);
    }

    //
    // potential future C++ code:
    //

    //if (! location.IsTruncatedStart(eExtreme_Biological) ) {
    //    cdregion.SetFrame( NCBI_CDSFRAME(one) );    // complete 5' end, it's frame 1
    //    return;
    //}

    // if( location.IsTruncatedStop(eExtreme_Biological) ) { 
    //    cdregion.ResetFrame();
    //    return;
    //}

    //const TSeqPos seq_len = sequence::GetLength(location, m_Scope);

    //// have complete last codon, get frame 
    //// from length
    //switch( (seq_len % 3) + 1 ) {
    //    case 1:
    //        cdregion.SetFrame( NCBI_CDSFRAME(one) );
    //        break;
    //    case 2:
    //        cdregion.SetFrame( NCBI_CDSFRAME(two) );
    //        break;
    //    case 3:
    //        cdregion.SetFrame( NCBI_CDSFRAME(three) );
    //        break;
    //    default:
    //        // mathematically impossible
    //        _ASSERT(false);
    //        return;
    //}
}

// Normalizes a single EC number string in place:
//   - trims surrounding whitespace,
//   - removes trailing periods (a string consisting only of periods, or
//     only of whitespace, becomes empty),
//   - removes a leading "EC " or "EC:" prefix (case-insensitive).
// ChangeMade(eCleanECNumber) is reported whenever the resulting length
// differs from the original length.  This includes input that was
// whitespace-only, which is emptied by the trim.
void CNewCleanup_imp::x_CleanupECNumber( string &ec_num )
{
    const string::size_type original_ec_num_length = ec_num.length();
    NStr::TruncateSpacesInPlace( ec_num );

    // remove any final periods
    const string::size_type last_non_period = ec_num.find_last_not_of('.');
    if( last_non_period == string::npos ) {
        // nothing but periods (or nothing at all) is left
        ec_num.clear();
    } else {
        ec_num.resize( last_non_period + 1 );

        // remove any unnecessary "EC " prefix
        s_RemoveInitial( ec_num, "EC ", NStr::eNocase );
        s_RemoveInitial( ec_num, "EC:", NStr::eNocase );
    }

    // every edit above only shortens the string, so a length comparison
    // is enough to detect that something changed
    if( ec_num.length() != original_ec_num_length ) {
        ChangeMade(CCleanupChange::eCleanECNumber);
    }
}

// Returns true if ec_num is non-empty and consists solely of characters
// that may appear in a list of EC numbers: digits, '.', '-', 'n', ' ' and ';'.
static bool s_ECNumberCanBeSplit( const string & ec_num )
{
    static const char * const kAllowedChars = "0123456789.-n ;";
    return ( ! ec_num.empty() &&
             ec_num.find_first_not_of(kAllowedChars) == string::npos );
}

// Cleans every EC number in the list.  An entry that looks like several
// EC numbers joined by ' ' or ';' is split at the first separator: the part
// before it stays in place and the remainder is inserted right after it,
// so the remainder is cleaned (and possibly split again) on the next
// iteration.
void CNewCleanup_imp::x_CleanupECNumberList( CProt_ref::TEc & ec_num_list )
{
    // CProt_ref::TEc is a list, so iterators stay valid when new entries
    // are inserted after the current one
    for( CProt_ref::TEc::iterator cur_iter = ec_num_list.begin();
         cur_iter != ec_num_list.end();
         ++cur_iter )
    {
        string & current = *cur_iter;
        x_CleanupECNumber( current );
        if( ! s_ECNumberCanBeSplit(current) ) {
            continue;
        }

        // split at the first ' ' or ';', if there is one
        const string::size_type separator_pos = current.find_first_of(" ;");
        if( separator_pos == string::npos ) {
            continue;
        }

        const string remainder = current.substr( separator_pos + 1 );
        current.resize( separator_pos );

        CProt_ref::TEc::iterator insert_before = cur_iter;
        ++insert_before;
        ec_num_list.insert( insert_before, remainder );
    }
}

// Tidies the spacing around colons in an inference qualifier value and
// makes sure that the first "COORDINATES:", "DESCRIPTION:" or "EXISTENCE:"
// that is directly followed by a non-space character gets a space
// inserted after the colon.  Reports eCleanQualifiers if the string changed.
void CNewCleanup_imp::x_CleanupAndRepairInference( string &inference )
{
    if( inference.empty() ) {
        return;
    }

    const string inference_before = inference;

    // remove spaces before colons; collapse colon runs followed by spaces
    // into ": "
    CRegexpUtil colon_normalizer( inference );
    colon_normalizer.Replace( "[ ]+:", ":" );
    colon_normalizer.Replace( ":*:[ ]+", ": ");
    inference = colon_normalizer.GetResult();

    // check if missing space after a prefix
    // e.g. "COORDINATES:foo" should become "COORDINATES: foo"
    CCachedRegexp prefix_without_space = regexpCache.Get(
        "(COORDINATES|DESCRIPTION|EXISTENCE):[^ ]" );
    if( prefix_without_space->IsMatch( inference ) ) {
        // the match ends just past the non-space character, so the space
        // goes one position before the end of the match
        const int match_end = prefix_without_space->GetResults(0)[1];
        inference.insert( match_end - 1, 1, ' ' );
    }

    if( inference != inference_before ) {
        ChangeMade(CCleanupChange::eCleanQualifiers);
    }
}

// Looks up the official structured-comment database name for dbname.
// Any trailing "Data" / "-Data" (case-insensitive) is stripped from dbname
// first, and the remainder is looked up case-insensitively in a fixed table.
// On success tmp receives the official name; otherwise tmp is left empty.
// dbname is deliberately taken by value because it is edited locally.
static
void s_MatchesOfficialStructuredCommentDbname( string &tmp, string dbname )
{
    typedef SStaticPair<const char*, const char*>  TOfficialPrefixElem;
    static const TOfficialPrefixElem sc_official_prefix_map[] = {
        { "Assembly", "Assembly-Data" },
        { "Epiflu", "EpifluData" },
        { "Flu", "FluData" },
        { "Genome-Assembly", "Genome-Assembly-Data" },
        { "GISAID_EpiFlu(TM)", "GISAID_EpiFlu(TM)Data" },
        { "HIV-DataBase", "HIVDatabase" },
        { "HIVDataBase", "HIVDataBaseData" },
        { "International Barcode of Life (iBOL)",  "International Barcode of Life (iBOL)Data" },
        { "MIENS", "MIENS-Data" },
        { "MIGS", "MIGS-Data" },
        { "MIMARKS:3.0", "MIMARKS:3.0-Data" },
        { "MIMS", "MIMS-Data" }
    };
    typedef CStaticArrayMap<string, string, PNocase> TOfficialPrefixMap;
    DEFINE_STATIC_ARRAY_MAP_WITH_COPY(TOfficialPrefixMap, sc_OfficialPrefixMap, sc_official_prefix_map);

    tmp.clear();

    // reduce the name to its bare prefix before the lookup
    s_RegexpReplace( dbname, "-?(Data)?$", "", 
        s_RegexpReplace_UnlimitedReplacements, 
        CRegexp::fCompile_ignore_case );

    const TOfficialPrefixMap::const_iterator official_iter =
        sc_OfficialPrefixMap.find(dbname);
    if( official_iter == sc_OfficialPrefixMap.end() ) {
        return;
    }
    tmp = official_iter->second;
}

// Extracts the database name from the value of a structured comment
// prefix/suffix field.  Leading '#' characters are skipped, and any
// trailing "-END", "-START" and '#' characters are removed.  If the result
// matches an official name (see s_MatchesOfficialStructuredCommentDbname),
// the official spelling is used instead.  out_dbname is left empty if
// field_str is empty or consists only of '#'.
void s_StructuredCommentDbnameFromString( string &out_dbname, const string &field_str )
{
    out_dbname.clear();

    // an empty string yields npos here as well as an all-hash string
    const string::size_type name_start = field_str.find_first_not_of('#');
    if( name_start == string::npos ) {
        return;
    }

    out_dbname.assign( field_str, name_start, string::npos );
    s_RegexpReplace( out_dbname, "(-END)?(-START)?#*$", "" );

    // correct for weirdnesses with -data for recognizable prefixes
    string official_name;
    s_MatchesOfficialStructuredCommentDbname (official_name, out_dbname);
    if ( ! official_name.empty() ) {
        out_dbname.swap( official_name );
    }
}

static
int s_GetBarcodeOrder( const CRef<CUser_field> &field )
{
    typedef SStaticPair<const char*, int>  TBarcodeOrderElem;
    static const TBarcodeOrderElem sc_barcode_order_map[] = {
        { "Barcode Index Number", 2 },
        { "Order Assignment", 3 },
        { "StructuredCommentPrefix", 1 }, // must be first
        { "StructuredCommentSuffix", kMax_Int }, // must be last
        { "Tentative Name", 6 },
        { "iBOL Release Status", 5 },
        { "iBOL Working Group", 4 }
    };
    typedef CStaticArrayMap<string, int, PCase> TBarcodeOrderMap;
    DEFINE_STATIC_ARRAY_MAP_WITH_COPY(TBarcodeOrderMap, sc_BarcodeOrderMap, sc_barcode_order_map);

    if( ! field || ! field->IsSetLabel() || ! field->GetLabel().IsStr() ) {
        // "-1" because we want the StructuredCommentSuffix to be last
        return (kMax_Int - 1);
    }

    const string & label_str = field->GetLabel().GetStr();

    TBarcodeOrderMap::const_iterator find_iter = sc_BarcodeOrderMap.find(label_str);
    if( find_iter == sc_BarcodeOrderMap.end() ) {
        // "-1" because we want the StructuredCommentSuffix to be last
        return (kMax_Int - 1);
    }

    return find_iter->second;
}

// "Less than" predicate that orders user fields by their barcode rank
// (see s_GetBarcodeOrder).
static
bool s_BarcodeCompare( 
    const CRef<CUser_field> &field1, 
    const CRef<CUser_field> &field2 ) 
{
    const int rank_of_first = s_GetBarcodeOrder( field1 );
    const int rank_of_second = s_GetBarcodeOrder( field2 );
    return ( rank_of_second > rank_of_first );
}

// Cleans a user object of type "StructuredComment" in three passes:
//   1. Normalizes the StructuredCommentPrefix / StructuredCommentSuffix
//      values to "##<name>-START##" / "##<name>-END##", where <name> is
//      produced by s_StructuredCommentDbnameFromString, and notes whether
//      the comment is Genome-Assembly-Data or iBOL data.
//   2. For Genome-Assembly-Data, rewrites a few known finishing-status
//      values and reformats the "Assembly Date" field.
//   3. For iBOL data, sorts the fields using s_BarcodeCompare.
// User objects of any other type are left untouched.
void CNewCleanup_imp::x_CleanStructuredComment( CUser_object &user_object )
{
    // only applies to user objects whose type is the string "StructuredComment"
    if( ! FIELD_IS_SET_AND_IS(user_object, Type, Str) ||
        user_object.GetType().GetStr() != "StructuredComment" ) 
    {
        return;
    }

    // set in pass 1, consumed by passes 2 and 3
    bool genome_assembly_data = false;
    bool ibol_data = false;

    // pass 1: normalize prefix and suffix
    EDIT_EACH_USERFIELD_ON_USEROBJECT( user_field_iter, user_object ) {
        CUser_field &field = **user_field_iter;
        // only fields with a string label and string data are considered
        if( FIELD_IS_SET_AND_IS(field, Label, Str) && FIELD_IS_SET_AND_IS(field, Data, Str) ) {
            if( GET_FIELD(field.GetLabel(), Str) == "StructuredCommentPrefix" ) {
                string core;
                s_StructuredCommentDbnameFromString( core, GET_FIELD(field.GetData(), Str) );
                const string new_data_str = "##" + core + "-START##";
                // only write (and report a change) if the value differs
                if( ! FIELD_CHOICE_EQUALS(field, Data, Str, new_data_str) ) {
                    SET_FIELD(field.SetData(), Str, CUtf8::AsUTF8(new_data_str, eEncoding_Ascii) );
                    ChangeMade(CCleanupChange::eCleanUserObjectOrField);
                }
                if (core == "Genome-Assembly-Data") {
                    genome_assembly_data = true;
                } else if( core == "International Barcode of Life (iBOL)Data" ) {
                    ibol_data = true;
                }
            } else if ( GET_FIELD(field.GetLabel(), Str) == "StructuredCommentSuffix" ) {
                // same handling as the prefix, but with "-END"
                string core;
                s_StructuredCommentDbnameFromString( core, GET_FIELD(field.GetData(), Str) );
                const string new_data_str = "##" + core + "-END##";
                if( ! FIELD_CHOICE_EQUALS(field, Data, Str, new_data_str) ) {
                    SET_FIELD(field.SetData(), Str, CUtf8::AsUTF8(new_data_str, eEncoding_Ascii));
                    ChangeMade(CCleanupChange::eCleanUserObjectOrField);
                }
                if (core == "Genome-Assembly-Data") {
                    genome_assembly_data = true;
                } else if( core == "International Barcode of Life (iBOL)Data" ) {
                    ibol_data = true;
                }
            }
        }
    }

    // pass 2: Genome-Assembly-Data specific value fixes
    if( genome_assembly_data ) {
        EDIT_EACH_USERFIELD_ON_USEROBJECT( user_field_iter, user_object ) {
            CUser_field &field = **user_field_iter;
            // skip fields that lack a string label or string data
            if( ! FIELD_IS_SET_AND_IS(field, Label, Str) ||
                ! FIELD_IS_SET_AND_IS(field, Data, Str) ) 
            {
                continue;
            }

            if( GET_FIELD( field.GetLabel(), Str) == "Finishing Goal" ||
                GET_FIELD( field.GetLabel(), Str) == "Current Finishing Status" )
            {

                // replace exact matches of old spellings with the new ones
                string &field_str = GET_MUTABLE( field.SetData(), Str );
                if( field_str == "High Quality Draft" ) {
                    field_str = "High-Quality Draft";
                    ChangeMade(CCleanupChange::eCleanUserObjectOrField);
                } else if( field_str == "Improved High Quality Draft" ) {
                    field_str = "Improved High-Quality Draft";
                    ChangeMade(CCleanupChange::eCleanUserObjectOrField);
                } else if( field_str == "Annotation Directed" ) {
                    field_str = "Annotation-Directed Improvement";
                    ChangeMade(CCleanupChange::eCleanUserObjectOrField);
                } else if( field_str == "Non-contiguous Finished" ) {
                    field_str = "Noncontiguous Finished";
                    ChangeMade(CCleanupChange::eCleanUserObjectOrField);
                }
            }
            else if( GET_FIELD( field.GetLabel(), Str) == "Assembly Date" ) {
                // rebuild the date as [day-][month-]year; the field is left
                // alone if the date cannot be fixed/parsed or has no year
                string &field_str = GET_MUTABLE( field.SetData(), Str );
                bool ambiguous = false;
                string altered = CSubSource::FixDateFormat (field_str, true, ambiguous);
                if (!NStr::IsBlank(altered)) {
                    CRef<CDate> coll_date = CSubSource::DateFromCollectionDate (altered);
                    if (coll_date && coll_date->IsStd() && coll_date->GetStd().IsSetYear()) {
                        string day = "";
                        string month = "";
                        string year = "";
                        string new_date = "";
                        // day and month are only emitted when the date was not ambiguous
                        if (!ambiguous && coll_date->GetStd().IsSetDay()) {
                            coll_date->GetDate(&day, "%2D");
                        }
                        if (!ambiguous && coll_date->GetStd().IsSetMonth()) {
                            // keep the first three characters of the month, upper-cased
                            coll_date->GetDate(&month, "%N");
                            month = month.substr(0, 3);
                            NStr::ToUpper(month);
                        }
                        coll_date->GetDate(&year, "%Y");
                        if (!NStr::IsBlank(day)) {
                            new_date += day + "-";
                        }
                        if (!NStr::IsBlank(month)) {                           
                            new_date += month + "-";
                        }
                        if (!NStr::IsBlank(year)) {
                            new_date += year;
                        }
                        if (!NStr::Equal(field_str, new_date)) {
                            field_str = new_date;
                            ChangeMade(CCleanupChange::eCleanUserObjectOrField);
                        }
                    }
                }
            }
        }
    }

    // pass 3: iBOL fields are put into the order defined by s_BarcodeCompare
    if( ibol_data ) {
        if( ! USERFIELD_ON_USEROBJECT_IS_SORTED(user_object, s_BarcodeCompare) ) {
            SORT_USERFIELD_ON_USEROBJECT(user_object, s_BarcodeCompare);
            ChangeMade(CCleanupChange::eCleanUserObjectOrField);
        }
    }
}

// Repairs the value of a satellite qualifier.
// If the value starts with "satellite", "microsatellite" or "minisatellite":
//   - a space directly after that word is turned into ':',
//   - spaces following the first colon are removed.
// Otherwise leading whitespace is trimmed and "satellite:" is prepended.
// Empty values are left alone.
void CNewCleanup_imp::x_MendSatelliteQualifier( string &val )
{
    if ( val.empty() ){
        return;
    }

    CCachedRegexp satellite_prefix = regexpCache.Get("^(micro|mini|)satellite");

    if( ! satellite_prefix->IsMatch(val) ) {
        // no recognized prefix: supply the generic one
        NStr::TruncateSpacesInPlace( val, NStr::eTrunc_Begin );
        val.insert( 0, "satellite:" );
        ChangeMade(CCleanupChange::eChangeQualifiers);
        return;
    }

    // position of the character right after the matched prefix
    const SIZE_TYPE prefix_end = satellite_prefix->GetResults(0)[1];
    const bool space_follows_prefix =
        ( prefix_end < val.length() && val[prefix_end] == ' ' );
    if( space_follows_prefix ) {
        val[prefix_end] = ':';
        ChangeMade(CCleanupChange::eChangeQualifiers);
    }

    // remove spaces after first colon
    const bool removed_spaces = s_RegexpReplace( val, ":[ ]+", ":", 1 );
    if( removed_spaces ) {
        ChangeMade(CCleanupChange::eChangeQualifiers);
    }
}

// Replaces a trailing run of three characters, each a ',' or '.', with
// "..." and reports eChangeComment if s_RegexpReplace says it changed str.
void CNewCleanup_imp::x_FixUpEllipsis( string &str )
{
    const bool was_replaced = s_RegexpReplace( str, "[,.][,.][,.]$", "..." );
    if( ! was_replaced ) {
        return;
    }
    ChangeMade(CCleanupChange::eChangeComment);
}

// Strips matching quote characters (' or ") from both ends of val, layer
// by layer, for as long as the first and last remaining characters are
// the same quote character.  A value made up entirely of such nested
// quotes is cleared.  Reports eTrimFlankingQuotes if anything was removed.
void CNewCleanup_imp::x_RemoveFlankingQuotes( string &val )
{
    const SIZE_TYPE total_len = val.length();

    // number of quote layers to peel off each end
    SIZE_TYPE num_layers = 0;
    while( 2 * num_layers < total_len ) {
        const char front_ch = val[num_layers];
        const char back_ch  = val[total_len - 1 - num_layers];
        const bool front_is_quote = ( front_ch == '\'' || front_ch == '\"' );
        if( ! front_is_quote || front_ch != back_ch ) {
            break;
        }
        ++num_layers;
    }

    // nothing to trim (this is the case almost always)
    if( num_layers == 0 ) {
        return;
    }

    if( 2 * num_layers >= total_len ) {
        // the string was all nested quotes
        val.clear();
    } else {
        val = val.substr( num_layers, total_len - 2 * num_layers );
    }

    ChangeMade(CCleanupChange::eTrimFlankingQuotes);
}

static
bool s_IsIllegalQual( const string &qual )
{
    static const char * const sc_Illegal_qual_array[] = {
        "anticodon",
        "citation",
        "codon_start",
        "db_xref",
        "evidence",
        "exception",
        "gene",
        "note",
        "protein_id",
        "pseudo",
        "transcript_id",
        "transl_except",
        "transl_table",
        "translation"
    };
    typedef CStaticArraySet<const char*, PNocase_CStr> TIllegalQualSet;
    DEFINE_STATIC_ARRAY_MAP( TIllegalQualSet, sc_IllegalQualArray, sc_Illegal_qual_array );

    return ( sc_IllegalQualArray.find(qual.c_str()) != sc_IllegalQualArray.end() );
}

static bool s_GbQualCompare (
    const CRef<CGb_qual>& gb1,
    const CRef<CGb_qual>& gb2
)

{
    const CGb_qual& gbq1 = *(gb1);
    const CGb_qual& gbq2 = *(gb2);

    const string& ql1 = GET_FIELD (gbq1, Qual);
    const string& ql2 = GET_FIELD (gbq2, Qual);

    // legal quals first
    const bool is_illegal1 = s_IsIllegalQual(ql1);
    const bool is_illegal2 = s_IsIllegalQual(ql2);
    if( is_illegal1 && ! is_illegal2 ) {
        return false;
    } else if( ! is_illegal1 && is_illegal2 ) {
        return true;
    }

    int comp = s_CompareNoCaseCStyle(ql1, ql2);
    if (comp < 0) return true;
    if (comp > 0) return false;

    const string& vl1 = GET_FIELD (gbq1, Val);
    const string& vl2 = GET_FIELD (gbq2, Val);

    if (NStr::CompareNocase (vl1, vl2) < 0) return true;

    return false;
}

static bool s_GbQualEqual (
    const CRef<CGb_qual>& gb1,
    const CRef<CGb_qual>& gb2
)

{
    const CGb_qual& gbq1 = *(gb1);
    const CGb_qual& gbq2 = *(gb2);

    const string& ql1 = GET_FIELD (gbq1, Qual);
    const string& ql2 = GET_FIELD (gbq2, Qual);

    if (! NStr::EqualNocase (ql1, ql2)) return false;

    const string& vl1 = GET_FIELD (gbq1, Val);
    const string& vl2 = GET_FIELD (gbq2, Val);

    if (! NStr::EqualNocase (vl1, vl2)) return false;

    return true;
}

// Updates old exception phrases in a comma-separated except_text.
// Nothing is done unless at least one old phrase occurs somewhere in the
// text.  Otherwise the text is split on ',', each piece is trimmed, pieces
// that exactly equal an old phrase are replaced by the new phrase, and the
// pieces are rejoined with ", ".
void CNewCleanup_imp::Except_textBC (
    string& except_text
)

{
    // { old phrase, replacement }
    static const char * const kPhraseUpdates[][2] = {
        { "ribosome slippage",               "ribosomal slippage" },
        { "trans splicing",                  "trans-splicing" },
        { "alternate processing",            "alternative processing" },
        { "adjusted for low quality genome", "adjusted for low-quality genome" },
        { "non-consensus splice site",       "nonconsensus splice site" }
    };
    static const size_t kNumPhraseUpdates =
        sizeof(kPhraseUpdates) / sizeof(kPhraseUpdates[0]);

    // quick exit when no old phrase appears anywhere
    bool has_old_phrase = false;
    for (size_t idx = 0; idx < kNumPhraseUpdates && ! has_old_phrase; ++idx) {
        has_old_phrase = (NStr::Find (except_text, kPhraseUpdates[idx][0]) != NPOS);
    }
    if (! has_old_phrase) {
        return;
    }

    vector<string> pieces;
    NStr::Tokenize (except_text, ",", pieces);

    for (vector<string>::iterator piece_it = pieces.begin();
         piece_it != pieces.end();  ++piece_it)
    {
        string& piece = *piece_it;

        const size_t untrimmed_len = piece.length();
        NStr::TruncateSpacesInPlace (piece);
        if (piece.length() != untrimmed_len) {
            ChangeMade (CCleanupChange::eTrimSpaces);
        }

        // an empty piece can never equal one of the phrases
        for (size_t idx = 0; idx < kNumPhraseUpdates; ++idx) {
            if (piece == kPhraseUpdates[idx][0]) {
                piece = kPhraseUpdates[idx][1];
                ChangeMade (CCleanupChange::eChangeException);
                break;
            }
        }
    }

    except_text = NStr::Join (pieces, ", ");
}

static
bool s_SeqLocAnyNull( const CSeq_loc & loc )
{
    CSeq_loc_CI loc_ci( loc, CSeq_loc_CI::eEmpty_Allow);
    for( ; loc_ci; ++loc_ci ) {
        const CSeq_loc& loc_piece = loc_ci.GetEmbeddingSeq_loc();
        if( loc_piece.IsNull() ) {
            return true;
        }
    }

    return false;
}

// Basic cleanup of a Seq-feat:
//   - sorts and de-duplicates the GenBank qualifiers, then cleans each one
//     (GBQualBC / GBQualSeqFeatBC), erasing those flagged for removal,
//   - cleans the title string,
//   - resets the Except, Pseudo and Partial flags when they are explicitly
//     set to false,
//   - cleans the exception text and drops a comment that merely repeats it,
//   - lets x_SplitDbtag process each dbxref and appends any new dbtags it
//     produced,
//   - cleans the citation set, if present.
void CNewCleanup_imp::SeqfeatBC (
    CSeq_feat& sf
)

{
    // note - need to clean up GBQuals before dbxrefs, because they may be converted to populate other fields

    // sort/unique gbquals

    if (! GBQUAL_ON_SEQFEAT_IS_SORTED (sf, s_GbQualCompare)) {
        SORT_GBQUAL_ON_SEQFEAT (sf, s_GbQualCompare);
        ChangeMade (CCleanupChange::eCleanQualifiers);
    }

    if (! GBQUAL_ON_SEQFEAT_IS_UNIQUE (sf, s_GbQualEqual)) {
        UNIQUE_GBQUAL_ON_SEQFEAT (sf, s_GbQualEqual);
        ChangeMade (CCleanupChange::eRemoveQualifier);
    }

    // clean each qualifier; GBQualSeqFeatBC decides whether it should be erased
    EDIT_EACH_GBQUAL_ON_SEQFEAT (gbq_it, sf) {
        CGb_qual& gbq = **gbq_it;
        GBQualBC(gbq);
        if( GBQualSeqFeatBC(gbq, sf) == eAction_Erase ) 
        {
            ERASE_GBQUAL_ON_SEQFEAT (gbq_it, sf);
            ChangeMade (CCleanupChange::eRemoveQualifier);
        }
    }

    CLEAN_STRING_MEMBER (sf, Title);

    // a flag that is set to false carries no information, so unset it
    if( FIELD_EQUALS( sf, Except, false ) ) {
        RESET_FIELD( sf, Except );
        ChangeMade (CCleanupChange::eRemoveException);
    }

    // NOTE(review): Pseudo and Partial below also report eRemoveException -
    // confirm whether that change code is intended for them
    if( FIELD_EQUALS( sf, Pseudo, false ) ) {
        RESET_FIELD( sf, Pseudo );
        ChangeMade (CCleanupChange::eRemoveException);
    }

    if( FIELD_EQUALS( sf, Partial, false ) ) {
        RESET_FIELD( sf, Partial );
        ChangeMade (CCleanupChange::eRemoveException);
    }

    CLEAN_STRING_MEMBER (sf, Except_text);
    if (FIELD_IS_SET (sf, Except_text)) {
        string &et = GET_MUTABLE (sf, Except_text);
        Except_textBC (et);
        // when the feature is marked as an exception, a comment identical
        // to the exception text is removed
        if( FIELD_EQUALS(sf, Except, true) && FIELD_EQUALS(sf, Comment, et) ) {
            RESET_FIELD(sf, Comment);
            ChangeMade (CCleanupChange::eRemoveComment);
        }
    }

    // new dbtags are collected first and appended after the loop, so the
    // dbxref container is not grown while it is being iterated
    vector< CRef< CDbtag > > new_dbtags;
    EDIT_EACH_DBXREF_ON_SEQFEAT (dbx_it, sf) {
        CDbtag& dbt = **dbx_it;
        x_SplitDbtag(dbt, new_dbtags );
    }
    if( ! new_dbtags.empty() ) {
        copy( new_dbtags.begin(), new_dbtags.end(), back_inserter(sf.SetDbxref()) );
    }

    CALL_IF_SET( PubSetBC, sf, Cit );
}

// Second-stage cleanup of a Seq-feat (per the comments below, intended to
// run after child objects have been processed):
//   - re-cleans the comment and removes it if it is just ".",
//   - sorts/de-duplicates GenBank qualifiers again,
//   - erases dbxrefs for which s_DbtagIsBad is true, then
//     sorts/de-duplicates the rest,
//   - erases xrefs that have neither Id nor Data,
//   - sets the Partial flag when the location is partial at its start or
//     stop, or (unless the entry is flagged as EMBL/DDBJ) contains a Null
//     piece.  An already-true Partial flag is never changed.
void CNewCleanup_imp::x_PostSeqFeat( CSeq_feat& sf )
{
    // need to clean this up in case it was changed by our children
    CLEAN_STRING_MEMBER (sf, Comment);
    CALL_IF_SET( CleanDoubleQuote, sf, Comment );
    // a comment consisting of a lone period is dropped entirely
    if ( STRING_FIELD_MATCH( sf, Comment, "." ) ) {
        RESET_FIELD (sf, Comment);
        ChangeMade (CCleanupChange::eChangeComment);
    }

    // sort/unique gbquals (yes, must do before *and* after )
    if (! GBQUAL_ON_SEQFEAT_IS_SORTED (sf, s_GbQualCompare)) {
        SORT_GBQUAL_ON_SEQFEAT (sf, s_GbQualCompare);
        ChangeMade (CCleanupChange::eCleanQualifiers);
    }
    if (! GBQUAL_ON_SEQFEAT_IS_UNIQUE (sf, s_GbQualEqual)) {
        UNIQUE_GBQUAL_ON_SEQFEAT (sf, s_GbQualEqual);
        ChangeMade (CCleanupChange::eRemoveQualifier);
    }
    REMOVE_IF_EMPTY_GBQUAL_ON_SEQFEAT(sf);

    // drop dbxrefs that s_DbtagIsBad rejects
    EDIT_EACH_DBXREF_ON_SEQFEAT (dbx_it, sf) {
        CDbtag& dbt = **dbx_it;
        if (s_DbtagIsBad (dbt)) {
            ERASE_DBXREF_ON_SEQFEAT (dbx_it, sf);
            ChangeMade (CCleanupChange::eCleanDbxrefs);
        }
    }

    // sort/unique db_xrefs
    if (! DBXREF_ON_SEQFEAT_IS_SORTED (sf, s_DbtagCompare)) {
        SORT_DBXREF_ON_SEQFEAT (sf, s_DbtagCompare);
        ChangeMade (CCleanupChange::eCleanDbxrefs);
    }
    if (! DBXREF_ON_SEQFEAT_IS_UNIQUE (sf, s_DbtagEqual)) {
        UNIQUE_DBXREF_ON_SEQFEAT (sf, s_DbtagEqual);
        ChangeMade (CCleanupChange::eCleanDbxrefs);
    }
    REMOVE_IF_EMPTY_DBXREF_ON_SEQFEAT( sf );

    // xrefs: remove those that carry neither an Id nor Data
    EDIT_EACH_SEQFEATXREF_ON_SEQFEAT( xref_iter, sf ) {
        CSeqFeatXref &xref = **xref_iter;
        if( ! FIELD_IS_SET(xref, Id) && ! FIELD_IS_SET(xref, Data) ) {
            ERASE_SEQFEATXREF_ON_SEQFEAT(xref_iter, sf);
            ChangeMade (CCleanupChange::eCleanSeqFeatXrefs);
        }
    }
    REMOVE_IF_EMPTY_SEQFEATXREF_ON_SEQFEAT( sf );

    // clean up partial flag
    // only the Start and Stop bits of the partial check are of interest here
    const unsigned int partial_loc_mask = ( 
        sequence::eSeqlocPartial_Start      | 
        sequence::eSeqlocPartial_Stop       );
    const unsigned int partial_loc = 
        sequence::SeqLocPartialCheck( GET_FIELD( sf, Location ), m_Scope );
    if ( FIELD_EQUALS(sf, Partial, true) ) {
        // do nothing, will not change partial if already set
    } else if ( (partial_loc & partial_loc_mask) || ( s_SeqLocAnyNull( GET_FIELD( sf, Location ) ) && ! m_SeqEntryInfoStack.top().m_IsEmblOrDdbj) ) {
        // partial at an end, or contains a Null piece in a non-EMBL/DDBJ entry
        SET_FIELD( sf, Partial, true );
        ChangeMade (CCleanupChange::eChangePartial);
    }
}

// Case-sensitive "less than" predicate for gene synonyms.
static bool
s_GeneSynCompareCS(
    const string &syn1,
    const string &syn2 )
{
    return ( syn1.compare(syn2) < 0 );
}

// Case-sensitive equality predicate for gene synonyms.
static bool
s_GeneSynEqual(
    const string &syn1,
    const string &syn2 )
{
    return ( syn1.compare(syn2) == 0 );
}

// CILCFirst stands for "case-insensitive, lower-case first".
// Synonyms are ordered by s_CompareNoCaseCStyle; when that reports a tie,
// the case-sensitive comparison is applied in reverse so that the
// lowercase variant comes first.
static bool
s_GeneSynCompareCILCFirst(
    const string &syn1,
    const string &syn2 )
{
    const int caseless_order = s_CompareNoCaseCStyle( syn1, syn2 );
    if( caseless_order == 0 ) {
        // notice reversal, so that lowercase is first
        return ( syn2 < syn1 );
    }

    return ( caseless_order < 0 );
}

// Unary predicate functor: true for empty strings.
class CStringIsEmpty
{
public:
    bool operator()( const string &str ) const
    {
        return ( str.length() == 0 );
    }
};

// returns true if a split was done and added to gene_syns_to_add
// gene_syns_to_add is unaffected if syn was not split.
bool s_SplitGeneSyn( const string &syn, vector<string> &gene_syns_to_add)
{
    // preliminary quick-test
    if( syn.find_first_of(",;") == NPOS ) {
        return false;
    }

    // split by comma
    vector<string> pieces_split_by_comma;
    NStr::Tokenize( syn, ",", pieces_split_by_comma );

    // now split each of those pieces by "; "
    vector<string> pieces_split_by_semicolon;
    FOR_EACH_STRING_IN_VECTOR( piece_iter, pieces_split_by_comma ) {
        NStr::TokenizePattern( *piece_iter, "; ", pieces_split_by_semicolon );
    }

    if( pieces_split_by_semicolon.size() > 1 ) {
        // copy non-empty pieces, trimming as we go
        EDIT_EACH_STRING_IN_VECTOR( piece_iter, pieces_split_by_semicolon ) {
            CleanVisString( *piece_iter );
            if( ! piece_iter->empty() ) {
                gene_syns_to_add.push_back(*piece_iter);
            }
        }
        return true;
    } else {
        return false;
    }
}

// Basic cleanup of a Gene-ref:
//   - splits synonyms containing "," or "; " into separate synonyms,
//   - sorts synonyms case-sensitively, removes duplicates, then re-sorts
//     them case-insensitively with lowercase variants first,
//   - removes synonyms that equal the locus (case-insensitively),
//   - removes dbxrefs for which s_DbtagIsBad is true, then sorts and
//     de-duplicates the remaining dbxrefs.
// Note that if the locus is not set, the function returns before the
// dbxref cleanup is reached.
void CNewCleanup_imp::GenerefBC (
    CGene_ref& gr
)

{
    // split gene synonyms that have a comma or "; "
    // (the pieces are collected and appended after the loop, and the
    // original compound synonym is erased)
    vector<string> gene_syns_to_add;
    EDIT_EACH_SYNONYM_ON_GENEREF (syn_itr, gr) {
        string& syn = *syn_itr;
        if( s_SplitGeneSyn(syn, gene_syns_to_add) ) {
            ERASE_SYNONYM_ON_GENEREF (syn_itr, gr);
            ChangeMade (CCleanupChange::eChangeGeneRef);
        }
    }
    if( ! gene_syns_to_add.empty() ) {
        copy( gene_syns_to_add.begin(), gene_syns_to_add.end(), 
            back_inserter(gr.SetSyn()) );
        ChangeMade (CCleanupChange::eChangeGeneRef);
    }

    // case-sensitive sort first, presumably so that exact duplicates end
    // up adjacent for the uniquing step that follows
    if( ! SYNONYM_ON_GENEREF_IS_SORTED(gr, s_GeneSynCompareCS) ) {
        SORT_SYNONYM_ON_GENEREF( gr, s_GeneSynCompareCS );
        ChangeMade (CCleanupChange::eChangeGeneRef);
    }
    if (! SYNONYM_ON_GENEREF_IS_UNIQUE (gr, s_GeneSynEqual)) {
        UNIQUE_SYNONYM_ON_GENEREF(gr, s_GeneSynEqual);
        ChangeMade (CCleanupChange::eChangeGeneRef);
    }
    // final order: case-insensitive, lowercase first
    if( ! SYNONYM_ON_GENEREF_IS_SORTED(gr, s_GeneSynCompareCILCFirst) ) {
        SORT_SYNONYM_ON_GENEREF( gr, s_GeneSynCompareCILCFirst );
        ChangeMade (CCleanupChange::eChangeGeneRef);
    }

    // remove synonyms equal to locus
    // (early return: nothing below runs when there is no locus)
    if (! FIELD_IS_SET (gr, Locus)) return;
    const string& locus = GET_FIELD (gr, Locus);

    EDIT_EACH_SYNONYM_ON_GENEREF (syn_itr, gr) {
        string& syn = *syn_itr;
        if (NStr::EqualNocase (locus, syn)) {
            ERASE_SYNONYM_ON_GENEREF (syn_itr, gr);
            ChangeMade (CCleanupChange::eChangeGeneRef);
        }
    }

    // remove obsolete or otherwise stale dbxrefs
    EDIT_EACH_DBXREF_ON_GENEREF(it, gr) {
        CDbtag& dbt = **it;
        if (s_DbtagIsBad (dbt)) {
            ERASE_DBXREF_ON_GENEREF (it, gr);
            ChangeMade (CCleanupChange::eCleanDbxrefs);
        }
    }

    // sort/uniq dbxrefs on generef
    if( ! DBXREF_ON_GENEREF_IS_SORTED(gr, s_DbtagCompare) ) {
        SORT_DBXREF_ON_GENEREF(gr, s_DbtagCompare);
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    }
    if( ! DBXREF_ON_GENEREF_IS_UNIQUE(gr, s_DbtagEqual) ) {
        UNIQUE_DBXREF_ON_GENEREF(gr, s_DbtagEqual);
        ChangeMade(CCleanupChange::eRemoveGeneXref);
    }
}

// A Gene-ref counts as empty when none of Locus, Allele, Desc, Maploc, Db,
// Syn and Locus_tag is set.
static bool s_IsEmptyGeneRef (const CGene_ref& gr)

{
    const bool any_field_set =
        FIELD_IS_SET (gr, Locus)  ||
        FIELD_IS_SET (gr, Allele) ||
        FIELD_IS_SET (gr, Desc)   ||
        FIELD_IS_SET (gr, Maploc) ||
        FIELD_IS_SET (gr, Db)     ||
        FIELD_IS_SET (gr, Syn)    ||
        FIELD_IS_SET (gr, Locus_tag);

    return ( ! any_field_set );
}

// Returns true if comm matches the Gene-ref's locus tag (per
// STRING_FIELD_MATCH) or one of its synonyms (per STRING_SET_MATCH).
static bool s_CommentRedundantWithGeneRef (
    CGene_ref& gene_ref,
    const string& comm
)

{
    return ( STRING_FIELD_MATCH (gene_ref, Locus_tag, comm) ||
             STRING_SET_MATCH   (gene_ref, Syn,       comm) );
}

// Builds a CDbtag from a string of the form "db:tag".
// Returns a null CRef if NStr::SplitInTwo cannot split the string at ":".
// The tag is stored as an integer id when it is all digits and converts
// to a value greater than zero; otherwise it is stored as a string.
static
CRef<CDbtag> s_DbtagParse( const string &dbtag_str )
{
    CRef<CDbtag> dbtag( new CDbtag );

    // the part before ':' goes straight into the Db field
    string tag_text;
    if( ! NStr::SplitInTwo(dbtag_str, ":", dbtag->SetDb(), tag_text ) ) {
        return CRef<CDbtag>();
    }

    // stays 0 unless the tag is all digits and converts cleanly
    int numeric_id = 0;
    if( s_IsAllDigits(tag_text) ) {
        numeric_id = NStr::StringToInt(tag_text, NStr::fConvErr_NoThrow);
    }

    if( numeric_id > 0 ) {
        dbtag->SetTag().SetId( numeric_id );
    } else {
        dbtag->SetTag().SetStr().swap( tag_text );
    }

    return dbtag;
}

// forward declaration: the two functions below call each other
static
CConstRef<CUser_object> s_FindUserObjectTypeRecursive( const CUser_object &user_obj, const string &sought_type_label );

// Searches the data of a single user field for a matching user object.
// An embedded object, a list of objects, or a list of nested fields is
// searched in order; the first hit is returned.  Returns a null reference
// if the field has no data, holds another kind of data, or nothing matches.
static 
CConstRef<CUser_object> s_FindUserObjectTypeRecursive_helper( const CUser_field &field, const string &sought_type_label )
{
    const CConstRef<CUser_object> kNotFound;

    if( ! FIELD_IS_SET(field, Data) ) {
        return kNotFound;
    }

    const CUser_field::C_Data & field_data = GET_FIELD(field, Data);
    switch( field_data.Which() ) {
        case CUser_field::C_Data::e_Object:
            // single embedded object: its result is final
            return s_FindUserObjectTypeRecursive( field_data.GetObject(), sought_type_label );

        case CUser_field::C_Data::e_Fields:
            ITERATE( CUser_field::C_Data::TFields, subfield_iter, field_data.GetFields() ) {
                CConstRef<CUser_object> found = s_FindUserObjectTypeRecursive_helper( **subfield_iter, sought_type_label );
                if( found ) {
                    return found;
                }
            }
            break;

        case CUser_field::C_Data::e_Objects:
            ITERATE( CUser_field::C_Data::TObjects, subobj_iter, field_data.GetObjects() ) {
                CConstRef<CUser_object> found = s_FindUserObjectTypeRecursive( **subobj_iter, sought_type_label );
                if( found ) {
                    return found;
                }
            }
            break;

        default:
            break;
    }

    return kNotFound;
}

// Depth-first search for a user-object whose type is a string equal to
// sought_type_label.
//
// user_obj           - root of the search; it is tested first.
// sought_type_label  - type string to look for (compared with ==).
//
// Returns a reference to the first matching object (user_obj itself or
// one nested in its fields), or a null reference if there is none.
static
CConstRef<CUser_object> s_FindUserObjectTypeRecursive( const CUser_object &user_obj, const string &sought_type_label )
{
    // is the one we're given a match?
    // (compare against the caller-supplied label rather than a
    // hard-coded type name, so the function honors its parameter)
    if( FIELD_IS_SET_AND_IS(user_obj, Type, Str) && user_obj.GetType().GetStr() == sought_type_label ) {
        return CConstRef<CUser_object>( &user_obj );
    }

    // otherwise, recurse downwards depth-first
    FOR_EACH_USERFIELD_ON_USEROBJECT(field_iter, user_obj) {
        CConstRef<CUser_object> result = s_FindUserObjectTypeRecursive_helper( **field_iter, sought_type_label );
        if( result ) {
            return result;
        }
    }

    return CConstRef<CUser_object>();
}

// Basic cleanup of a gene feature.
//
// gene_ref - the gene data of the feature (modified in place).
// seq_feat - the feature that owns gene_ref (modified in place).
//
// Steps, in order:
//   1. move gene.pseudo to feat.pseudo;
//   2. drop feat.comment when it only repeats gene-ref data;
//   3. move gene.db entries to feat.dbxref;
//   4. do the same for gene-refs found in the feature's xrefs, erasing
//      xrefs whose gene-ref ends up empty;
//   5. if the gene has no formal_name, try to build one from an
//      "OfficialNomenclature" user-object found in feat.ext.
// Each modification is reported through ChangeMade().
void CNewCleanup_imp::GeneFeatBC (
    CGene_ref& gene_ref,
    CSeq_feat& seq_feat
)

{
    // move gene.pseudo to feat.pseudo
    if (FIELD_IS_SET (gene_ref, Pseudo)) {
        if( GET_FIELD(gene_ref, Pseudo) ) {
            SET_FIELD (seq_feat, Pseudo, true);
        }
        // A false gene.pseudo carries no information about the feature,
        // so feat.pseudo must be left alone in that case: the feature may
        // have been marked pseudo independently of the gene-ref.
        RESET_FIELD (gene_ref, Pseudo);
        ChangeMade (CCleanupChange::eChangeQualifiers);
    }

    // remove feat.comment if equal to various gene fields
    if (FIELD_IS_SET (seq_feat, Comment)) {
        if (s_CommentRedundantWithGeneRef (gene_ref, GET_FIELD (seq_feat, Comment))) {
            RESET_FIELD (seq_feat, Comment);
            ChangeMade (CCleanupChange::eChangeComment);
        }
    }
        
    // move gene.db to feat.dbxref
    if (GENEREF_HAS_DBXREF (gene_ref) ) {
        FOR_EACH_DBXREF_ON_GENEREF (db_itr, gene_ref) {
            CRef <CDbtag> dbc (*db_itr);
            ADD_DBXREF_TO_SEQFEAT (seq_feat, dbc);
        }
        RESET_FIELD (gene_ref, Db);
        ChangeMade (CCleanupChange::eChangeDbxrefs);
    }
        
    // move feat.xref.gene.db to feat.dbxref
    if (SEQFEAT_HAS_SEQFEATXREF (seq_feat)) {
        EDIT_EACH_SEQFEATXREF_ON_SEQFEAT (xr_itr, seq_feat) {
            CSeqFeatXref& sfx = **xr_itr;
            if (! FIELD_IS_SET (sfx, Data)) continue;
            CSeqFeatData& sfd = GET_MUTABLE (sfx, Data);
            if (! FIELD_IS (sfd, Gene)) continue;
            // the xref's own gene-ref (distinct from the gene_ref parameter)
            CGene_ref& xref_gene_ref = GET_MUTABLE (sfd, Gene);
            if (GENEREF_HAS_DBXREF (xref_gene_ref)) {
                FOR_EACH_DBXREF_ON_GENEREF (db_itr, xref_gene_ref) {
                    CRef <CDbtag> dbc (*db_itr);
                    ADD_DBXREF_TO_SEQFEAT (seq_feat, dbc);
                }
                RESET_FIELD (xref_gene_ref, Db);
                ChangeMade (CCleanupChange::eChangeDbxrefs);
            }
            // an xref whose gene-ref holds nothing is useless; drop it
            if (s_IsEmptyGeneRef (xref_gene_ref)) {
                ERASE_SEQFEATXREF_ON_SEQFEAT (xr_itr, seq_feat);
                ChangeMade (CCleanupChange::eChangeDbxrefs);
            }
        }
    }

    REMOVE_IF_EMPTY_SEQFEATXREF_ON_SEQFEAT(seq_feat);

    // ModernizeGeneFields
    // (that is, create a formal_name from User-objects, if possible)
    if( ! FIELD_IS_SET(gene_ref, Formal_name) && FIELD_IS_SET(seq_feat, Ext)) {
        CConstRef<CUser_object> user_obj_ref = s_FindUserObjectTypeRecursive( GET_FIELD(seq_feat, Ext), "OfficialNomenclature" );

        if( user_obj_ref ) {
            const CUser_object &user_obj = *user_obj_ref;

            // These point into user_obj's fields; they stay valid as long
            // as user_obj_ref keeps the user-object alive.
            const string *symbol = NULL;
            const string *name = NULL;
            const string *source = NULL;
            CGene_nomenclature::EStatus status = CGene_nomenclature::eStatus_unknown;

            // collect the string-labelled, string-valued fields we know
            FOR_EACH_USERFIELD_ON_USEROBJECT(user_field_iter, user_obj) {
                const CUser_field &user_field = **user_field_iter;
                if( FIELD_IS_SET_AND_IS(user_field, Label, Str) && FIELD_IS_SET_AND_IS(user_field, Data, Str) ) {
                    const string &label_str = GET_FIELD(user_field.GetLabel(), Str);
                    const string &data_str = GET_FIELD(user_field.GetData(), Str);

                    if( NStr::EqualNocase(label_str, "Symbol") ) {
                        symbol = &data_str;
                    } else if( NStr::EqualNocase(label_str, "Name") ) {
                        name = &data_str;
                    } else if( NStr::EqualNocase(label_str, "DataSource") ) {
                        source = &data_str;
                    } else if( NStr::EqualNocase(label_str, "Status") ) {
                        if( NStr::EqualNocase(data_str, "Official") ) {
                            status = CGene_nomenclature::eStatus_official;
                        } else if( NStr::EqualNocase(data_str, "Interim") ) {
                            status = CGene_nomenclature::eStatus_interim;
                        }
                    } 
                }
            }

            // only create the formal_name if at least one piece was found
            if( (symbol != NULL) || (name != NULL) || (source != NULL) || 
                (status != CGene_nomenclature::eStatus_unknown) ) 
            {
                CGene_nomenclature &gene_nomenclature = GET_MUTABLE(gene_ref, Formal_name);
                if( symbol != NULL ) {
                    gene_nomenclature.SetSymbol(*symbol);
                }
                if( name != NULL ) {
                    gene_nomenclature.SetName(*name);
                }
                if( source != NULL ) {
                    // parse "source" string into a CDbtag
                    CRef<CDbtag> new_dbtag = s_DbtagParse( *source );
                    if( new_dbtag ) {
                        gene_nomenclature.SetSource(*new_dbtag);
                    }
                }
                gene_nomenclature.SetStatus(status);

                // The ext is removed only when it is itself the
                // nomenclature object, not when the object was found
                // nested somewhere inside it.
                if( &GET_FIELD(seq_feat, Ext) == user_obj_ref ) {
                    RESET_FIELD(seq_feat, Ext);
                }

                ChangeMade(CCleanupChange::eCreateGeneNomenclature);
            }
        }
    }
}

// Basic cleanup of a protein reference.
//
// prot_ref - the protein data to clean (modified in place).
//
// Removes a "not set" Processed value, cleans the Desc string and the
// Name / Ec / Activity string lists (dropping them when they end up empty),
// removes duplicate Activity entries without reordering, and, when the
// current seq-entry is flagged m_IsEmblOrDdbj, rewrites a fixed set of
// rubisco name spellings to the canonical form.
// Modifications are reported through ChangeMade().
void CNewCleanup_imp::ProtrefBC (
    CProt_ref& prot_ref
)

{
    // "not set" should just be removed
    if( FIELD_EQUALS(prot_ref, Processed, NCBI_PROTREF(not_set) ) ) {
        RESET_FIELD(prot_ref, Processed);
        ChangeMade(CCleanupChange::eRemoveQualifier);
    }

    CLEAN_STRING_MEMBER (prot_ref, Desc);
    // NOTE(review): GET_MUTABLE presumably marks Name as set even when it
    // was not; the REMOVE_IF_EMPTY below then takes it away again - confirm.
    if (CleanStringList (GET_MUTABLE (prot_ref, Name))) {
        ChangeMade (CCleanupChange::eChangeProtNames);
    }
    REMOVE_IF_EMPTY_NAME_ON_PROTREF(prot_ref);

    CLEAN_STRING_LIST_JUNK (prot_ref, Ec);
    CLEAN_STRING_LIST (prot_ref, Activity);

    UNIQUE_WITHOUT_SORT_ACTIVITY_ON_PROTREF( prot_ref, PNocase );

    REMOVE_IF_EMPTY_ACTIVITY_ON_PROTREF(prot_ref);

    // rubisco cleanup
    if( m_SeqEntryInfoStack.top().m_IsEmblOrDdbj ) {
        EDIT_EACH_NAME_ON_PROTREF (it, prot_ref) {
            // Abbreviated names are expanded; a Desc that only repeats the
            // abbreviation is dropped at the same time.
            if (NStr::EqualNocase (*it, "RbcL") || NStr::EqualNocase(*it, "rubisco large subunit")) {
                *it = "ribulose-1,5-bisphosphate carboxylase/oxygenase large subunit";
                ChangeMade (CCleanupChange::eChangeQualifiers);
                if (prot_ref.IsSetDesc() && NStr::EqualNocase(prot_ref.GetDesc(), "RbcL")) {
                    prot_ref.ResetDesc();
                }
                continue;
            } else if (NStr::EqualNocase (*it, "RbcS") || NStr::EqualNocase(*it, "rubisco small subunit")) {
                *it = "ribulose-1,5-bisphosphate carboxylase/oxygenase small subunit";
                ChangeMade (CCleanupChange::eChangeQualifiers);
                if (prot_ref.IsSetDesc() && NStr::EqualNocase(prot_ref.GetDesc(), "RbcS")) {
                    prot_ref.ResetDesc();
                }
                continue;
            } 

            // This is pretty inefficient, so when there's time we should replace it with a map or something
            //
            // The first three Find() tests are a cheap pre-filter; the name
            // is rewritten only if it is not already one of the two
            // canonical names and equals (ignoring case) one of the listed
            // large-subunit variants.  The misspelled "oxgenase" entry is a
            // variant to be matched, not a typo in this list.
            if (NStr::Find (*it, "ribulose") != string::npos
                && NStr::Find (*it, "bisphosphate") != string::npos
                && NStr::Find (*it, "methyltransferase") == string::npos
                && !NStr::EqualNocase (*it, "ribulose-1,5-bisphosphate carboxylase/oxygenase large subunit")
                && !NStr::EqualNocase (*it, "ribulose-1,5-bisphosphate carboxylase/oxygenase small subunit")
                && (NStr::EqualNocase (*it, "ribulose 1,5-bisphosphate carboxylase/oxygenase large subunit")
                || NStr::EqualNocase (*it, "ribulose 1,5-bisphosphate carboxylase large subunit")
                || NStr::EqualNocase (*it, "ribulose bisphosphate carboxylase large subunit")
                || NStr::EqualNocase (*it, "ribulose-bisphosphate carboxylase large subunit")
                || NStr::EqualNocase (*it, "ribulose-1,5-bisphosphate carboxylase large subunit")
                || NStr::EqualNocase (*it, "ribulose-1,5-bisphosphate carboxylase, large subunit")
                || NStr::EqualNocase (*it, "large subunit of ribulose-1,5-bisphosphate carboxylase/oxygenase")
                || NStr::EqualNocase (*it, "ribulose-1,5-bisphosphate carboxylase oxygenase large subunit")
                || NStr::EqualNocase (*it, "ribulose bisphosphate carboxylase large chain")
                || NStr::EqualNocase (*it, "ribulose 1,5-bisphosphate carboxylase-oxygenase large subunit")
                || NStr::EqualNocase (*it, "ribulose bisphosphate carboxylase oxygenase large subunit")
                || NStr::EqualNocase (*it, "ribulose 1,5 bisphosphate carboxylase large subunit")
                || NStr::EqualNocase (*it, "ribulose-1,5-bisphosphate carboxylase/oxygenase, large subunit")
                || NStr::EqualNocase (*it, "large subunit of ribulose-1,5-bisphosphate carboxylase/oxgenase")
                || NStr::EqualNocase (*it, "ribulose bisphosphate carboxylase/oxygenase large subunit")
                || NStr::EqualNocase (*it, "ribulose-1,5-bisphosphate carboxylase oxygenase, large subunit")
                || NStr::EqualNocase (*it, "ribulose 5-bisphosphate carboxylase, large subunit")
                || NStr::EqualNocase (*it, "ribulosebisphosphate carboxylase large subunit")
                || NStr::EqualNocase (*it, "ribulose bisphosphate large subunit")
                || NStr::EqualNocase (*it, "ribulose 1,5 bisphosphate carboxylase/oxygenase large subunit")
                || NStr::EqualNocase (*it, "ribulose 1,5-bisphosphate carboxylase/oxygenase large chain")
                || NStr::EqualNocase (*it, "large subunit ribulose-1,5-bisphosphate carboxylase/oxygenase")
                || NStr::EqualNocase (*it, "ribulose-bisphosphate carboxylase, large subunit")
                || NStr::EqualNocase (*it, "ribulose-1, 5-bisphosphate carboxylase/oxygenase large-subunit")) ) {
                    *it = "ribulose-1,5-bisphosphate carboxylase/oxygenase large subunit";
                    ChangeMade (CCleanupChange::eChangeQualifiers);
            }
        }
    }
}

// Protein names considered uninformative; looked up (ignoring case, via
// PNocase) by s_IsInformativeName.
// NOTE(review): the entries are in case-insensitive sorted order, which the
// static array set presumably requires - confirm before adding entries.
static const char* const uninf_names [] = {
    "peptide",
    "putative",
    "signal",
    "signal peptide",
    "signal-peptide",
    "signal_peptide",
    "transit",
    "transit peptide",
    "transit-peptide",
    "transit_peptide",
    "unknown",
    "unnamed"
};

// Case-insensitive set view over uninf_names.
typedef CStaticArraySet<string, PNocase> TUninformative;
DEFINE_STATIC_ARRAY_MAP_WITH_COPY(TUninformative, sc_UninfNames, uninf_names);

// A protein name is informative when it is non-empty and is not listed in
// sc_UninfNames.
static bool s_IsInformativeName (
    const string& name
)

{
    if( name.empty() ) {
        return false;
    }

    const bool is_listed_as_uninformative =
        ( sc_UninfNames.find(name) != sc_UninfNames.end() );
    return ! is_listed_as_uninformative;
}

// Decides whether a feature comment merely repeats information already held
// by the prot-ref: true when comm matches one of the names, the desc, or
// one of the EC numbers; false otherwise.
static bool s_CommentRedundantWithProtRef (
    CProt_ref& pr,
    const string& comm
)

{
    const bool repeats_prot_data =
        (STRING_SET_MATCH   (pr, Name, comm)) ||
        (STRING_FIELD_MATCH (pr, Desc, comm)) ||
        (STRING_SET_MATCH   (pr, Ec,   comm));

    return repeats_prot_data;
}

// Basic cleanup of a protein feature.
//
// pr - the protein data of the feature (modified in place).
// sf - the feature that owns pr (modified in place).
//
// The steps below are order-dependent: earlier steps may add or remove
// names and the comment that later steps test.  Modifications are reported
// through ChangeMade().
void CNewCleanup_imp::ProtFeatfBC (
    CProt_ref& pr,
    CSeq_feat& sf
)

{
    // Processed value captured once up front; an unset field is treated
    // as "not_set".
    const TPROTREF_PROCESSED processed = ( FIELD_IS_SET (pr, Processed) ?
        GET_FIELD (pr, Processed) :
        NCBI_PROTREF(not_set) );

    // move putative from comment to protein name for mat peptide
    // (applies when there is no name yet and the protein is neither a
    // signal nor a transit peptide; a comment that is exactly "putative"
    // is left where it is)
    if (FIELD_IS_SET (sf, Comment) && 
        RAW_FIELD_IS_EMPTY_OR_UNSET(pr, Name) &&
        processed != NCBI_PROTREF(signal_peptide) &&
        processed != NCBI_PROTREF(transit_peptide)) {
            if (! NStr::EqualNocase ("putative", GET_FIELD (sf, Comment))) {
                ADD_NAME_TO_PROTREF ( pr, GET_FIELD (sf, Comment) );
                ChangeMade(CCleanupChange::eChangeProtNames);
                RESET_FIELD (sf, Comment);
                ChangeMade(CCleanupChange::eRemoveComment);
            }
    }

    // move putative to comment, remove uninformative name of signal peptide
    if (FIELD_IS_SET (pr, Name)) {
        if (processed == NCBI_PROTREF(signal_peptide) ||
            processed == NCBI_PROTREF(transit_peptide)) {
                EDIT_EACH_NAME_ON_PROTREF (nm_itr, pr) {
                    string& str = *nm_itr;
                    // a name mentioning "putative" / "put. " sets the
                    // comment to "putative", but never overwrites an
                    // existing comment
                    if (NStr::Find (str, "putative") != NPOS ||
                        NStr::Find (str, "put. ") != NPOS) {
                            if (! FIELD_IS_SET (sf, Comment)) {
                                SET_FIELD (sf, Comment, "putative");
                                ChangeMade (CCleanupChange::eChangeComment);
                            }
                    }
                    if (! s_IsInformativeName (str)) {
                        ERASE_NAME_ON_PROTREF (nm_itr, pr);
                        ChangeMade (CCleanupChange::eChangeProtNames);
                    }
                }
        }

        EDIT_EACH_NAME_ON_PROTREF (nm_itr, pr) {
            string& str = *nm_itr;
            // rubisco
            // (expand the abbreviated large/small subunit names)
            if (NStr::EqualNocase (str, "RbcL") || NStr::EqualNocase(str, "rubisco large subunit")) {
                str = "ribulose-1,5-bisphosphate carboxylase/oxygenase large subunit";
                ChangeMade (CCleanupChange::eChangeQualifiers);
            } else if (NStr::EqualNocase (str, "RbcS") || NStr::EqualNocase(str, "rubisco small subunit")) {
                str = "ribulose-1,5-bisphosphate carboxylase/oxygenase small subunit";
                ChangeMade (CCleanupChange::eChangeQualifiers);
            }
        }
    }

    // add unnamed as default protein name
    // (only for preprotein and mature processed values)
    if ( RAW_FIELD_IS_EMPTY_OR_UNSET(pr, Name) ) {
        if (processed == NCBI_PROTREF(preprotein)  ||  
            processed == NCBI_PROTREF(mature)) {
                ADD_NAME_TO_PROTREF (pr, "unnamed");
                ChangeMade (CCleanupChange::eChangeQualifiers);
        }
    }

    // remove feat.comment if equal to various protein fields
    if (FIELD_IS_SET (sf, Comment)) {
        if (s_CommentRedundantWithProtRef (pr, GET_FIELD (sf, Comment))) {
            RESET_FIELD (sf, Comment);
            ChangeMade (CCleanupChange::eChangeComment);
        }
    }
        
    // move prot.db to feat.dbxref
    if (PROTREF_HAS_DBXREF (pr)) {
        FOR_EACH_DBXREF_ON_PROTREF (db_itr, pr) {
            CRef <CDbtag> dbc (*db_itr);
            ADD_DBXREF_TO_SEQFEAT (sf, dbc);
        }
        RESET_FIELD (pr, Db);
        ChangeMade (CCleanupChange::eChangeDbxrefs);
    }

    // names may have been erased above; do not leave an empty list behind
    REMOVE_IF_EMPTY_NAME_ON_PROTREF(pr);
}

// Final cleanup pass over a protein reference.
//
// prot_ref - the protein data to clean (modified in place).
//
// Erases bad dbxrefs (as judged by s_DbtagIsBad), sorts the remaining
// dbxrefs and removes duplicates, and drops a Desc that is set but empty.
// ChangeMade() is called only for steps that actually modified prot_ref.
void CNewCleanup_imp::PostProtFeatfBC (
    CProt_ref& prot_ref
)
{
    // remove obsolete/stale Dbtags
    EDIT_EACH_DBXREF_ON_PROTREF (dbx_it, prot_ref) {
        CDbtag& dbt = **dbx_it;
        if (s_DbtagIsBad (dbt)) {
            ERASE_DBXREF_ON_PROTREF (dbx_it, prot_ref);
            ChangeMade (CCleanupChange::eCleanDbxrefs);
        }
    }

    // sort/uniq the dbxrefs
    if (! DBXREF_ON_PROTREF_IS_SORTED (prot_ref, s_DbtagCompare)) {
        SORT_DBXREF_ON_PROTREF (prot_ref, s_DbtagCompare);
        ChangeMade (CCleanupChange::eCleanDbxrefs);
    }
    if (! DBXREF_ON_PROTREF_IS_UNIQUE (prot_ref, s_DbtagEqual)) {
        UNIQUE_DBXREF_ON_PROTREF (prot_ref, s_DbtagEqual);
        ChangeMade (CCleanupChange::eCleanDbxrefs);
    }

    // Drop a Desc that is present but empty.  The field must be set for
    // this to count: resetting an already-unset Desc changes nothing and
    // must not be reported as a change.
    if( FIELD_IS_SET(prot_ref, Desc) && GET_FIELD(prot_ref, Desc).empty() ) {
        RESET_FIELD(prot_ref, Desc);
        ChangeMade (CCleanupChange::eCleanDbxrefs);
    }
}


// Table mapping known spellings of internal transcribed spacer (ITS) names
// (first column) to the canonical name (second column).  Keys are compared
// ignoring case (PNocase); used by x_TranslateITSName.
// NOTE(review): the keys are in case-insensitive sorted order, which the
// static array map presumably requires - confirm before adding entries.
typedef SStaticPair<const char*, const char*>  TInTrSpElem;
static const TInTrSpElem sc_its_map[] = {
    { "internal transcribed spacer 1 (ITS1)", "internal transcribed spacer 1" },
    { "internal transcribed spacer 2 (ITS2)", "internal transcribed spacer 2" },
    { "internal transcribed spacer 3 (ITS3)", "internal transcribed spacer 3" },
    { "its 1", "internal transcribed spacer 1" },
    { "its 2", "internal transcribed spacer 2" },
    { "its 3", "internal transcribed spacer 3" },
    { "its1", "internal transcribed spacer 1" },
    { "its2", "internal transcribed spacer 2" },
    { "its3", "internal transcribed spacer 3" },
    { "Ribosomal DNA internal transcribed spacer 1", "internal transcribed spacer 1" },
    { "Ribosomal DNA internal transcribed spacer 2", "internal transcribed spacer 2" },
    { "Ribosomal DNA internal transcribed spacer 3", "internal transcribed spacer 3" }
};
// Case-insensitive map view over sc_its_map.
typedef CStaticArrayMap<string, string, PNocase> TInTrSpMap;
DEFINE_STATIC_ARRAY_MAP_WITH_COPY(TInTrSpMap, sc_ITSMap, sc_its_map);

// Replaces in_out_name with its canonical ITS name when the whole string is
// a key of sc_ITSMap; otherwise leaves it untouched.  A replacement is
// reported as eChangeITS.
void CNewCleanup_imp::x_TranslateITSName( string &in_out_name )
{
    const TInTrSpMap::const_iterator canonical_it = sc_ITSMap.find(in_out_name);
    if( canonical_it == sc_ITSMap.end() ) {
        return;
    }

    in_out_name = canonical_it->second;
    ChangeMade(CCleanupChange::eChangeITS);
}

// Names recognized as ncRNA classes; looked up (ignoring case, via PNocase)
// by s_IsNcrnaName and s_StartsWithNcrnaName.
// NOTE(review): the entries are in case-insensitive sorted order, which the
// static array set presumably requires - confirm before adding entries.
static const char* const ncrna_names [] = {
    "antisense_RNA",
    "autocatalytically_spliced_intron",
    "guide_RNA",
    "hammerhead_ribozyme",
    "lncRNA",
    "miRNA",
    "other",
    "piRNA",
    "rasiRNA",
    "ribozyme",
    "RNase_MRP_RNA",
    "RNase_P_RNA",
    "scRNA",
    "siRNA",
    "snoRNA",
    "snRNA",
    "SRP_RNA",
    "telomerase_RNA",
    "vault_RNA",
    "Y_RNA"
};

// Case-insensitive set view over ncrna_names.
typedef CStaticArraySet<string, PNocase> TNcrna;
DEFINE_STATIC_ARRAY_MAP_WITH_COPY(TNcrna, sc_NcrnafNames, ncrna_names);

static bool s_IsNcrnaName (
    const string& name
)

{
    return sc_NcrnafNames.find(name) != sc_NcrnafNames.end();
}

static bool s_StartsWithNcrnaName( 
    const string& name,
    const string **out_ncrna_name = NULL )
{
    TNcrna::const_iterator suffix_finder = s_FindInSetAsPrefix<TNcrna>( name, sc_NcrnafNames );
    if( suffix_finder == sc_NcrnafNames.end() ) {
        if( NULL != out_ncrna_name ) {
            *out_ncrna_name = NULL;
        }
        return false;
    } else {
        if( NULL != out_ncrna_name ) {
            *out_ncrna_name = &*suffix_finder;
        }
        return true;
    }
}

// special exception for genome pipeline rRNA names
static
bool s_NotExceptedRibosomalName( const string &name )
{
    // we are "not excepted" if there is a non-space/non-digit somewhere after " ribosomal"
    CCachedRegexp regex = regexpCache.Get(" ribosomal.*[^ 0-9]");
    return regex->IsMatch(name);
}


// Normalizes an rRNA name in place.
//
// name - the rRNA name; rewritten in place.
//
// Steps, in order:
//   1. for names longer than 5 characters that are not excepted (see
//      s_NotExceptedRibosomalName), rewrite the first "ribosomal"/"rRNA"
//      phrase as " ribosomal RNA" and re-attach whatever followed it;
//   2. for names longer than 5 characters, capitalize a lone 's' that
//      follows a leading run of digits and dots and precedes a space;
//   3. strip spaces and repeatedly collapse duplicated
//      "ribosomal"/"RNA"/"rRNA" words until no replacement applies.
// eChangeRNAref is reported if the final name differs from the input.
void 
CNewCleanup_imp::x_RRNANameBC( string &name )
{
    // kept to detect at the end whether anything changed
    const string original_name = name;

    if ( name.length() > 5 && s_NotExceptedRibosomalName (name)) {
        // suffix is *after* first match of suffix_regex
        CCachedRegexp suffix_regex = regexpCache.Get( 
            " (ribosomal|rRNA) ( ?RNA)?( ?DNA)?( ?ribosomal)?" );
        if( suffix_regex->IsMatch(name) ) {

            // extract suffix
            // (GetResults(0)[1] is used as the offset just past the match)
            const SIZE_TYPE suff_pos = ( suffix_regex->GetResults(0)[1] );
            string suff = name.substr(suff_pos);
            NStr::TruncateSpacesInPlace(suff);

            // cut ribosomal stuff off of name
            // (GetResults(0)[0] is used as the offset where the match starts)
            const SIZE_TYPE ribosomal_pos = suffix_regex->GetResults(0)[0];
            name.resize( ribosomal_pos );

            name += " ribosomal RNA"; 
            if ( ! suff.empty() ) {
                // no separating space before punctuation
                if (suff[0] != ',' && suff[0] != ';') {
                    name += " ";
                }
                name += suff;
            }
        }
    }
    if ( name.length() > 5) {
        // pos is the position of the first non-digit, non-dot character
        SIZE_TYPE pos = name.find_first_not_of(".0123456789");
        if( NPOS != pos ) {
            // NOTE(review): when pos is the last character, name[pos+1]
            // indexes name[size()]; this relies on std::string returning
            // the terminating '\0' there (guaranteed since C++11).
            if( name[pos] == 's' && name[pos+1] == ' ' ) {
                name[pos] = 'S';
            }
        }
    }
    x_StripSpacesMarkChanged (name);

    // remove duplicate words and similar corrections
    // ( Behold the power of regular expressions; This while loop was about 80 lines in C. )
    // Each pass strips spaces, then tries the replacements in order; the
    // || chain stops at the first one that succeeds and the loop repeats.
    do {
        x_StripSpacesMarkChanged(name);
    } while( s_RegexpReplace( name, "ribosomal +ribosomal", "ribosomal ") || 
           s_RegexpReplace( name, "RNA +RNA", "RNA ") || 
           s_RegexpReplace( name, "ribosomal +RNA +ribosomal", "ribosomal RNA ") ||
           s_RegexpReplace( name, "ribosomal +rRNA", "ribosomal RNA ") ||
           s_RegexpReplace( name, "RNA +rRNA", "RNA ") );

    NStr::TruncateSpacesInPlace(name);

    if( original_name != name ) {
        ChangeMade(CCleanupChange::eChangeRNAref);
    }
}

// Basic cleanup of an RNA reference.
//
// rr - the RNA data to clean (modified in place).
//
// First pass, by kind of Ext:
//   Name - a blank name removes the Ext; an rRNA name has a trailing
//          " rRNA"/"_rRNA" expanded to " ribosomal RNA" and is then
//          normalized by x_RRNANameBC; for types other/miscRNA the name is
//          ITS-translated and converted into an RNA-gen product.
//   TRNA - an Aa with no choice selected is removed, codons are sorted and
//          de-duplicated, and an Ext with no Aa/Codon/Anticodon is removed.
//   Gen  - blank Class/Product are removed, quals are cleaned and
//          incomplete ones erased; a miscRNA product that starts with an
//          ncRNA class name followed by a space is split into
//          class + product and retyped as ncRNA; an mRNA/rRNA carrying
//          only a product is converted back to a Name; an RNA-gen left
//          with nothing is removed.
// Second pass: an RNA of type "other" is retyped (miscRNA, ncRNA or tmRNA)
// according to its Name, if any.
// Every modification is reported through ChangeMade().
void CNewCleanup_imp::RnarefBC (
    CRNA_ref& rr
)

{
    if (FIELD_IS_SET (rr, Ext)) {
        CRNA_ref::C_Ext& ext = GET_MUTABLE (rr, Ext);
        const TRNAREF_EXT chs = ext.Which();
        switch (chs) {
            case NCBI_RNAEXT(Name):
                {
                    string& name = GET_MUTABLE (ext, Name);
                    if (NStr::IsBlank (name)) {
                        RESET_FIELD (rr, Ext);
                        ChangeMade(CCleanupChange::eChangeRNAref);
                        break;
                    }

                    static const string rRNA = " rRNA";
                    static const string rRNA2 = "_rRNA";
                    static const string kRibosomal_Rna = " ribosomal RNA";
                    static const string kRibosomal_r_Rna = " ribosomal rRNA";

                    if (rr.IsSetType()) {
                        switch (rr.GetType()) {
                            case CRNA_ref::eType_rRNA:
                            {{
                                size_t len = name.length();
                                if (len >= rRNA.length() ) {
                                    if( NStr::EndsWith(name, rRNA, NStr::eNocase) || NStr::EndsWith(name, rRNA2, NStr::eNocase) ) {
                                        // replace the whole " ribosomal rRNA" tail when
                                        // present, otherwise just the 5-character
                                        // " rRNA" / "_rRNA" tail
                                        if( NStr::EndsWith(name, kRibosomal_r_Rna, NStr::eNocase) ) {
                                            name.replace(len - kRibosomal_r_Rna.length(), name.size(), kRibosomal_Rna);
                                        } else {
                                            name.replace(len - rRNA.length(), name.size(), kRibosomal_Rna);
                                        }
                                        ChangeMade(CCleanupChange::eChangeQualifiers);
                                    }
                                }

                                x_RRNANameBC( name );

                                break;
                            }}
                            case CRNA_ref::eType_other:
                            case CRNA_ref::eType_miscRNA:
                                {{
                                    x_TranslateITSName(name); 

                                    // convert to RNA-gen
                                    string name_copy; // copy because name is about to be destroyed
                                    name_copy.swap( name );
                                    ext.SetGen().SetProduct( name_copy );
                                    ChangeMade(CCleanupChange::eChangeRNAref);
                                }}
                                break;
                            default:
                                break;
                        }
                    }
                }
                break;
            case NCBI_RNAEXT(TRNA):
                {
                    CTrna_ext& tRNA = GET_MUTABLE (ext, TRNA);
                    if (FIELD_IS_SET (tRNA, Aa)) {
                        const CTrna_ext::C_Aa& aa = GET_FIELD (tRNA, Aa);
                        if (aa.Which() == CTrna_ext::C_Aa::e_not_set) {
                            RESET_FIELD (tRNA, Aa);
                            ChangeMade(CCleanupChange::eChangeRNAref);
                        }
                    }

                    if (! CODON_ON_TRNAEXT_IS_SORTED(tRNA, s_CodonCompare)) {
                        SORT_CODON_ON_TRNAEXT(tRNA, s_CodonCompare);
                        ChangeMade(CCleanupChange::eChange_tRna);
                    }

                    if( ! CODON_ON_TRNAEXT_IS_UNIQUE(tRNA, s_CodonEqual) ) {
                        UNIQUE_CODON_ON_TRNAEXT(tRNA, s_CodonEqual);
                        ChangeMade(CCleanupChange::eChange_tRna);
                    }

                    REMOVE_IF_EMPTY_CODON_ON_TRNAEXT(tRNA);

                    if (! FIELD_IS_SET (tRNA, Aa) &&
                        ! FIELD_IS_SET (tRNA, Codon) &&
                        ! FIELD_IS_SET (tRNA, Anticodon)) {
                        RESET_FIELD (rr, Ext);
                        ChangeMade(CCleanupChange::eChangeRNAref);
                    }
                }
                break;
            case NCBI_RNAEXT(Gen):
                {
                    CRNA_gen& gen = GET_MUTABLE (ext, Gen);
                    if (FIELD_IS_SET (gen, Class)) {
                        const string& str = GET_FIELD (gen, Class);
                        if (NStr::IsBlank (str)) {
                            RESET_FIELD (gen, Class);
                            ChangeMade(CCleanupChange::eChangeRNAref);
                        }
                    }
                    if (FIELD_IS_SET (gen, Product)) {
                        const string& str = GET_FIELD (gen, Product);
                        if (NStr::IsBlank (str)) {
                            RESET_FIELD (gen, Product);
                            ChangeMade(CCleanupChange::eChangeRNAref);
                        }
                    }
                    if (FIELD_IS_SET (gen, Quals)) {
                        CRNA_qual_set& qset = GET_MUTABLE (gen, Quals);
                        EDIT_EACH_QUAL_ON_RNAQSET (qitr, qset) {
                            CRNA_qual& qual = **qitr;
                            CLEAN_STRING_MEMBER (qual, Qual);
                            CLEAN_STRING_MEMBER (qual, Val);
                            // a qual needs both a name and a value
                            if( ! FIELD_IS_SET(qual, Qual) || ! FIELD_IS_SET(qual, Val) ) {
                                ERASE_QUAL_ON_RNAQSET( qitr, qset );
                                ChangeMade(CCleanupChange::eChangeRNAref);
                            }
                        }

                        if( QUAL_ON_RNAQSET_IS_EMPTY(qset)  ) {
                            RESET_FIELD(gen, Quals);
                            ChangeMade(CCleanupChange::eChangeRNAref);
                        }
                    }

                    if ( FIELD_EQUALS(rr, Type, NCBI_RNAREF(miscRNA)) && 
                        FIELD_IS_SET (gen, Product) && 
                        ! FIELD_IS_SET (gen, Class) )
                    {
                        string & product = GET_MUTABLE(gen, Product);
                        const string *ncrna_name = NULL;
                        if( s_StartsWithNcrnaName(product, &ncrna_name) ) {
                            _ASSERT( NULL != ncrna_name );
                            // require "<class> <rest>": a space right after the
                            // class name and at least one character after it
                            if( product.length() > (ncrna_name->length() + 1) && 
                                product[ncrna_name->length()] == ' ' ) 
                            {
                                SET_FIELD( gen, Class, *ncrna_name );
                                SET_FIELD( gen, Product, product.substr(ncrna_name->length() + 1) );
                                TRUNCATE_SPACES( gen, Class );
                                TRUNCATE_SPACES( gen, Product );
                                SET_FIELD( rr, Type, NCBI_RNAREF(ncRNA) );
                                ChangeMade(CCleanupChange::eChangeRNAref);
                            }
                            // no need to erase ncrna_name because it points to 
                            // global memory.
                        }
                    }

                    if( ( FIELD_EQUALS(rr, Type, NCBI_RNAREF(mRNA)) || 
                          FIELD_EQUALS(rr, Type, NCBI_RNAREF(rRNA)) ) &&
                        STRING_FIELD_NOT_EMPTY(gen, Product) && 
                        RAW_FIELD_IS_EMPTY_OR_UNSET(gen, Class) && 
                        ! FIELD_IS_SET(gen, Quals) )
                    {
                        // convert RNA-Gen to name.
                        // Careful: this invalidates the "gen" variable.
                        const string product = GET_FIELD(gen, Product);
                        SET_FIELD( ext, Name, product );
                        // the ext changed from Gen to Name, so report it
                        // like every other modification in this function
                        ChangeMade(CCleanupChange::eChangeRNAref);
                        break;
                    }

                    if (! FIELD_IS_SET (gen, Class) &&
                        ! FIELD_IS_SET (gen, Product) &&
                        ! FIELD_IS_SET (gen, Quals)) {
                        RESET_FIELD (rr, Ext);
                        ChangeMade(CCleanupChange::eChangeRNAref);
                    }
                }
                break;
            default:
                break;
        }
    }

    if (FIELD_IS_SET (rr, Type)) {
        TRNAREF_TYPE typ = GET_FIELD (rr, Type);
        switch (typ) {
            case NCBI_RNAREF(mRNA):
                {
                }
                break;
            case NCBI_RNAREF(tRNA):
                {
                }
                break;
            case NCBI_RNAREF(rRNA):
                {
                }
                break;
            case NCBI_RNAREF(other):
                {
                    // "other" is replaced by a more specific type; which
                    // one depends on the Name, if the Ext is a Name
                    if (FIELD_IS_SET (rr, Ext)) {
                        CRNA_ref::C_Ext& ext = GET_MUTABLE (rr, Ext);
                        const TRNAREF_EXT chs = ext.Which();
                        if (chs == NCBI_RNAEXT(Name)) {
                            string& str = GET_MUTABLE (ext, Name);
                            if ( str.empty() || NStr::EqualNocase (str, "misc_RNA")) {
                                SET_FIELD( rr, Type, NCBI_RNAREF(miscRNA) );
                                RESET_FIELD(rr, Ext);
                                ChangeMade(CCleanupChange::eChangeRNAref);
                            } else if (NStr::EqualNocase (str, "ncRNA")) {
                                SET_FIELD( rr, Type, NCBI_RNAREF(ncRNA) );
                                RESET_FIELD(rr, Ext);
                                ChangeMade(CCleanupChange::eChangeRNAref);
                            } else if (NStr::EqualNocase (str, "tmRNA")) {
                                SET_FIELD( rr, Type, NCBI_RNAREF(tmRNA) );
                                RESET_FIELD(rr, Ext);
                                ChangeMade(CCleanupChange::eChangeRNAref);
                            } else if (s_IsNcrnaName (str)) {
                                SET_FIELD( rr, Type, NCBI_RNAREF(ncRNA) );
                                // copy first: SetGen() replaces the Name that
                                // str refers to
                                const string new_class = str;
                                SET_FIELD( rr.SetExt().SetGen(), Class, new_class );
                                ChangeMade(CCleanupChange::eChangeRNAref);
                            } else {
                                SET_FIELD( rr, Type, NCBI_RNAREF(miscRNA) );
                                // copy first: SetGen() replaces the Name that
                                // str refers to
                                const string new_product = str;
                                SET_FIELD( rr.SetExt().SetGen(), Product, new_product );
                                ChangeMade(CCleanupChange::eChangeRNAref);
                            }
                        }
                    } else {
                        SET_FIELD( rr, Type, NCBI_RNAREF(miscRNA) );
                        ChangeMade(CCleanupChange::eChangeRNAref);
                    }
                }
                break;
            default:
                break;
        }
    }
}

// Appends a new (qual, val) qualifier to out_quals unless an entry with the
// same qual and the same val is already there.
//
// out_quals - qualifier list to extend.
// qual, val - qualifier name and value to add.
//
// eAddQualifier is reported only when a qualifier was actually appended.
void
CNewCleanup_imp::x_AddNonCopiedQual( 
    vector< CRef< CGb_qual > > &out_quals, const char *qual, const char *val )
{
    // nothing to do if an identical qualifier is already present
    ITERATE( vector< CRef< CGb_qual > >, existing_it, out_quals ) {
        const CGb_qual &existing = **existing_it;

        const bool same_qual = existing.IsSetQual() && existing.GetQual() == qual;
        if( ! same_qual ) {
            continue;
        }

        const bool same_val = existing.IsSetVal() && existing.GetVal() == val;
        if( same_val ) {
            return;
        }
    }

    out_quals.push_back( CRef< CGb_qual >( new CGb_qual(qual, val) ) );
    ChangeMade( CCleanupChange::eAddQualifier );
}

// Walks the GenBank qualifiers of seqfeat.  Every qualifier that has both a
// name and a value, and whose name (with "_" replaced by "-") is found in
// one of the OrgMod / SubSource lookup tables, is appended to org's Mod list
// as "name=value" and erased from the feature.
// Hits in the OrgMod tables that map to nat_host are not treated as matches.
void CNewCleanup_imp::x_GBQualToOrgRef( COrg_ref &org, CSeq_feat &seqfeat )
{
    if( ! FIELD_IS_SET( seqfeat, Qual ) ) {
        return;
    }

    EDIT_EACH_GBQUAL_ON_SEQFEAT( qual_iter, seqfeat ) {
        CGb_qual &current = **qual_iter;
        if( FIELD_IS_SET(current, Qual) && FIELD_IS_SET(current, Val) ) {
            // lookup key: qualifier name with underscores turned into hyphens
            const string key = NStr::Replace( GET_FIELD(current, Qual), "_", "-" );
            const string &value = GET_FIELD(current, Val);

            // 1) primary OrgMod table (nat_host excluded)
            TOrgModMap::const_iterator mod_hit = 
                s_FindInMapAsPrefix<TOrgModMap>( key, sc_OrgModMap );
            bool convert = ( mod_hit != sc_OrgModMap.end() && 
                             mod_hit->second != NCBI_ORGMOD(nat_host) );

            // 2) OrgMod alias table (nat_host excluded)
            if( ! convert ) {
                TOrgModMap::const_iterator alias_hit = 
                    s_FindInMapAsPrefix<TOrgModMap>( key, sc_OrgModAliasMap );
                convert = ( alias_hit != sc_OrgModAliasMap.end() && 
                            alias_hit->second != NCBI_ORGMOD(nat_host) );
            }

            // 3) SubSource table, then its alias table
            if( ! convert ) {
                convert = 
                    ( s_FindInMapAsPrefix<TSubsourceMap>( key, sc_SubsourceMap) != sc_SubsourceMap.end() ||
                      s_FindInMapAsPrefix<TSubsourceMap>( key, sc_SubsourceAliasMap) != sc_SubsourceAliasMap.end() );
            }

            if( convert ) {
                // value is read before the qualifier that owns it is erased
                org.SetMod().push_back( key + "=" + value );
                ERASE_GBQUAL_ON_SEQFEAT( qual_iter, seqfeat );
                ChangeMade(CCleanupChange::eAddOrgMod);
                ChangeMade(CCleanupChange::eRemoveQualifier);
            }
        }
    }
}

// If the descriptor currently holds an Org_ref, turns it into a BioSource
// descriptor whose Org is that same Org_ref, and records eMoveDescriptor.
// Descriptors of any other kind are left untouched.
void CNewCleanup_imp::x_MoveSeqdescOrgToSourceOrg( CSeqdesc &seqdesc )
{
    if( ! seqdesc.IsOrg() ) {
        return;
    }

    // Take our own reference first; presumably this keeps the Org_ref alive
    // while SetSource() switches the descriptor's choice.
    CRef <COrg_ref> held_org ( &GET_MUTABLE(seqdesc, Org) );
    seqdesc.SetSource().SetOrg(*held_org);
    ChangeMade (CCleanupChange::eMoveDescriptor);
}

// If the feature's data is set and is an Org_ref, turns it into BioSource
// data whose Org is that same Org_ref, and records eConvertFeature.
// Features with any other data are left untouched.
void CNewCleanup_imp::x_MoveSeqfeatOrgToSourceOrg( CSeq_feat &seqfeat )
{
    if( ! FIELD_IS_SET_AND_IS(seqfeat, Data, Org) ) {
        return;
    }

    // Take our own reference first; presumably this keeps the Org_ref alive
    // while SetBiosrc() switches the data's choice.
    CRef <COrg_ref> held_org ( &GET_MUTABLE(seqfeat.SetData(), Org) );
    seqfeat.SetData().SetBiosrc().SetOrg(*held_org);
    ChangeMade (CCleanupChange::eConvertFeature);
}

// Runs CleanVisString on str in place; when that call reports true,
// records eTrimSpaces.
void CNewCleanup_imp::x_CleanupStringMarkChanged( std::string &str )
{
    const bool was_modified = CleanVisString (str);
    if ( ! was_modified ) {
        return;
    }
    ChangeMade (CCleanupChange::eTrimSpaces);
}

// Runs CleanVisStringJunk on str in place; when that call reports true,
// records eTrimSpaces.
void CNewCleanup_imp::x_CleanupStringJunkMarkChanged( std::string &str )
{
    const bool was_modified = CleanVisStringJunk (str);
    if ( ! was_modified ) {
        return;
    }
    ChangeMade (CCleanupChange::eTrimSpaces);
}

// Applies the regular-expression replacement " [ ]+" -> " " to str
// (a space followed by one or more further spaces becomes one space).
// Returns the result reported by s_RegexpReplace.
bool CNewCleanup_imp::x_CompressSpaces( string &str )
{
    const bool replace_result = s_RegexpReplace( str, " [ ]+", " " );
    return replace_result;
}

void CNewCleanup_imp::x_CompressStringSpacesMarkChanged( std::string &str )
{
  const string::size_type old_length = str.length();

  x_CompressSpaces( str );

  const string::size_type new_length = str.length();
  if( old_length != new_length ) {
    ChangeMade (CCleanupChange::eCompressSpaces);
  }
}

// Runs CleanDoubleQuote on str in place; when that call reports true,
// records eCleanDoubleQuotes.
void CNewCleanup_imp::x_ConvertDoubleQuotesMarkChanged( std::string &str )
{
    const bool was_modified = CleanDoubleQuote(str);
    if( ! was_modified ) {
        return;
    }
    ChangeMade (CCleanupChange::eCleanDoubleQuotes);
}

// Appends to user's data a new User-field labelled `field` whose data is
// the string `str` (passed through CUtf8::AsUTF8 with eEncoding_Ascii).
static void 
s_AddStringToUserField( CSeqdesc_Base::TUser& user, const char* field, const char *str )
{
    CRef< CUser_field > string_field( new CUser_field );
    CUser_field &entry = *string_field;

    entry.SetLabel().SetStr( field );
    entry.SetData().SetStr( CUtf8::AsUTF8(str, eEncoding_Ascii) );

    user.SetData().push_back( string_field );
}

// Appends to user's data a new User-field labelled `field` whose data is
// the integer `num`.
static void 
s_AddIntegerToUserField
( CSeqdesc_Base::TUser& user, const char* field, int num )
{
    CRef< CUser_field > int_field( new CUser_field );
    CUser_field &entry = *int_field;

    entry.SetLabel().SetStr( field );
    entry.SetData().SetInt(num);

    user.SetData().push_back( int_field );
}

// Appends an "NcbiCleanup" user-object descriptor to seq_entry.  The object
// carries, in this order: method, version (NCBI_CLEANUP_VERSION), and the
// month, day and year taken from the current time.  Records
// eAddNcbiCleanupObject.
void CNewCleanup_imp::x_AddNcbiCleanupObject( CSeq_entry &seq_entry )
{
    CRef<CSeqdesc> cleanup_desc( new CSeqdesc );
    CSeqdesc_Base::TUser& cleanup_user = cleanup_desc->SetUser();
    cleanup_user.SetType().SetStr("NcbiCleanup");

    // what was run, and which version of it
    s_AddStringToUserField(  cleanup_user, "method", "ExtendedSeqEntryCleanup" );
    s_AddIntegerToUserField( cleanup_user, "version", NCBI_CLEANUP_VERSION );

    // when it was run
    CTime now( CTime::eCurrent );
    s_AddIntegerToUserField( cleanup_user, "month", now.Month() );
    s_AddIntegerToUserField( cleanup_user, "day",   now.Day() );
    s_AddIntegerToUserField( cleanup_user, "year",  now.Year() );

    seq_entry.SetDescr().Set().push_back( cleanup_desc );
    ChangeMade(CCleanupChange::eAddNcbiCleanupObject);
}

// Strips a miRNA marker from an RNA name and returns what is left:
//   "miRNA X"    / "microRNA X"  -> "X"   (leading marker)
//   "X miRNA"    / "X microRNA"  -> "X"   (trailing marker, unless the name
//                                          ends in "precursor miRNA" /
//                                          "precursor microRNA")
// Returns kEmptyStr when none of the patterns applies.
static
string
s_GetMiRNAProduct( const string &name )
{
    // leading marker
    if ( NStr::StartsWith(name, "miRNA ") ) {
        return name.substr(6);
    }
    if ( NStr::StartsWith(name, "microRNA ") ) {
        return name.substr(9);
    }

    // trailing marker; precursor names are deliberately not touched
    const bool ends_with_mirna = 
        NStr::EndsWith(name, " miRNA") &&
        ! NStr::EndsWith(name, "precursor miRNA");
    if ( ends_with_mirna ) {
        return name.substr(0, name.length() - 6);
    }

    const bool ends_with_microrna = 
        NStr::EndsWith( name, " microRNA") &&
        ! NStr::EndsWith(name, "precursor microRNA");
    if ( ends_with_microrna ) {
        return name.substr(0, name.length() - 9 );
    }

    return kEmptyStr;
}

// Basic cleanup of an RNA feature.  `rna` is the RNA-ref being cleaned and
// `seq_feat` is the feature that carries it.  The steps run in this order:
//   1. move rna.pseudo to feat.pseudo
//   2. tRNA extension cleanup, incl. putting the anticodon on the minus strand
//   3. build a tRNA extension from an fMet-style feature comment
//   4. promote "...S ribosomal RNA" / "...S rRNA" comments to the rRNA name
//   5. promote "... RNA" / "... mRNA" comments to the mRNA name
//   6. translate ITS names
//   7. drop comments redundant with the tRNA amino acid or with ext.gen
//   8. convert legacy snRNA/scRNA/snoRNA/other forms (like C's ConvertToNcRNA)
//   9. fix the ncRNA class "antisense" (like C's FixncRNAClass)
//  10. move qualifiers into ext.gen (like C's ModernizeRNAFields)
// Every modification is recorded through ChangeMade().
void CNewCleanup_imp::RnaFeatBC (
    CRNA_ref& rna,
    CSeq_feat& seq_feat
)

{
    // move rna.pseudo to feat.pseudo
    if ( FIELD_IS_SET(rna, Pseudo) ) {
        SET_FIELD(seq_feat, Pseudo, true);
        RESET_FIELD(rna, Pseudo);
        ChangeMade(CCleanupChange::eChangeQualifiers);
    }

    if ( rna.IsSetExt() &&
        rna.GetExt().IsTRNA() ) 
    {                
        CTrna_ext &tRNA = rna.SetExt().SetTRNA();
        x_SeqFeatTRNABC( seq_feat, tRNA );

        // an interval anticodon on the same seq-id as a minus-strand feature
        // is forced onto the minus strand as well
        if( seq_feat.IsSetLocation() && 
            tRNA.IsSetAnticodon() &&
            tRNA.GetAnticodon().IsInt() ) 
        {
            const CSeq_id *loc_id = seq_feat.GetLocation().GetId();
            const CSeq_id *ac_id  = tRNA.GetAnticodon().GetId();
            if( loc_id && ac_id && loc_id->Compare( *ac_id ) == CSeq_id::e_YES ) {
                const ENa_strand loc_strand = seq_feat.GetLocation().GetStrand();
                const ENa_strand ac_strand = tRNA.GetAnticodon().GetStrand();
                if (loc_strand == eNa_strand_minus && ac_strand != eNa_strand_minus) {
                    tRNA.SetAnticodon().SetInt().SetStrand(eNa_strand_minus);
                    ChangeMade (CCleanupChange::eChangeAnticodon);
                }
            }
        }
    }

    // Add fMet-related comments
    if( (! FIELD_IS_SET(rna, Ext) || FIELD_IS(rna.GetExt(), Gen) ) &&
        FIELD_EQUALS( rna, Type, NCBI_RNAREF(tRNA) ) && 
        STRING_FIELD_NOT_EMPTY(seq_feat, Comment) )
    {
        bool justTrnaText = false;
        string codon;
        char aa = s_ParseSeqFeatTRnaString( GET_FIELD(seq_feat, Comment), 
            &justTrnaText, codon, true );
        if( aa != '\0' ) {
            CRef<CTrna_ext> tRNA( new CTrna_ext );
            tRNA->SetAa().SetNcbieaa( aa );
            if (justTrnaText) {
                copy( codon.begin(), codon.end(), back_inserter(tRNA->SetCodon()) );
            }
            rna.SetExt().SetTRNA( *tRNA );
            ChangeMade(CCleanupChange::eChange_tRna);
            if (justTrnaText) {
                // the comment is dropped unless it is one of the fMet
                // spellings, which are all normalized to "fMet"
                if ( GET_FIELD(seq_feat, Comment) != "fMet" &&
                     GET_FIELD(seq_feat, Comment) != "fMet tRNA" &&
                     GET_FIELD(seq_feat, Comment) != "fMet-tRNA" ) {
                        RESET_FIELD( seq_feat, Comment );
                        ChangeMade(CCleanupChange::eRemoveComment);
                } else {
                    SET_FIELD( seq_feat, Comment, "fMet" );
                    ChangeMade(CCleanupChange::eChangeComment);
                }
            }
        }
    }

    // "S ribosomal RNA" logic
    if ( ! FIELD_IS_SET(rna, Ext) && 
        STRING_FIELD_NOT_EMPTY(seq_feat, Comment) &&
        FIELD_EQUALS( rna, Type, NCBI_RNAREF(rRNA) ) ) 
    {
        // keep the length in the string's own size type (no narrowing to int)
        const string::size_type comment_len = GET_FIELD(seq_feat, Comment).length();
        if (comment_len > 15 && comment_len < 20) {
            if ( NStr::EndsWith(GET_FIELD(seq_feat, Comment), "S ribosomal RNA", NStr::eNocase) ) {
                rna.SetExt().SetName( GET_FIELD(seq_feat, Comment) );
                ChangeMade(CCleanupChange::eChangeRNAref);
                RESET_FIELD(seq_feat, Comment);
                ChangeMade(CCleanupChange::eRemoveComment);
            }
        } else if (comment_len > 6 && comment_len < 20) {
            if ( NStr::EndsWith(GET_FIELD(seq_feat, Comment), "S rRNA", NStr::eNocase) ) {
                rna.SetExt().SetName( GET_FIELD(seq_feat, Comment) );
                ChangeMade(CCleanupChange::eChangeRNAref);
                RESET_FIELD(seq_feat, Comment);
                ChangeMade(CCleanupChange::eRemoveComment);
            }
        }
    }

    // mRNA logic
    if( ! FIELD_IS_SET(rna, Ext) &&
        STRING_FIELD_NOT_EMPTY(seq_feat, Comment) &&
        FIELD_EQUALS( rna, Type, NCBI_RNAREF(mRNA) ) ) 
    {
        if ( NStr::EndsWith( GET_FIELD(seq_feat, Comment), " RNA",  NStr::eNocase ) ||
             NStr::EndsWith( GET_FIELD(seq_feat, Comment), " mRNA", NStr::eNocase ) )
        {
            rna.SetExt().SetName( GET_FIELD(seq_feat, Comment) );
            ChangeMade(CCleanupChange::eChangeRNAref);
            RESET_FIELD(seq_feat, Comment);
            ChangeMade(CCleanupChange::eRemoveComment);
        }
    }

    // ITS logic
    if( FIELD_EQUALS( rna, Type, NCBI_RNAREF(other)) || 
        FIELD_EQUALS( rna, Type, NCBI_RNAREF(miscRNA) ) ) 
    {
        if( FIELD_IS_SET_AND_IS(rna, Ext, Name) ) {
            x_TranslateITSName( GET_MUTABLE(rna.SetExt(), Name) );
        } else if( FIELD_IS_SET(rna, Ext) && FIELD_IS( rna.GetExt(), Gen) &&
            FIELD_IS_SET(rna.GetExt().GetGen(), Product) ) 
        {
            x_TranslateITSName( rna.SetExt().SetGen().SetProduct() );
        }
    }

    // if RNA is type "tRNA" and ext.tRNA is set, remove any feat.comments which
    // are redundant (e.g. comment is "aa: Alanine", when alanine is what the tRNA encodes)
    if( STRING_FIELD_NOT_EMPTY(seq_feat, Comment) && 
        FIELD_EQUALS( rna, Type, NCBI_RNAREF(tRNA) ) &&
        FIELD_IS_SET_AND_IS(rna, Ext, TRNA) && 
        FIELD_IS_SET(rna.GetExt().GetTRNA(), Aa) )
    {
        // extract the part of the comment we care about
        string::size_type comment_start_pos = 0;
        if( NStr::StartsWith(GET_FIELD(seq_feat, Comment), "aa:") ) {
            comment_start_pos += 3; // 3 is len of "aa:"
        }
        comment_start_pos = GET_FIELD(seq_feat, Comment).find_first_not_of(" ", comment_start_pos);
        if( string::npos == comment_start_pos ) {
            comment_start_pos = GET_FIELD(seq_feat, Comment).length();
        }
        const string comment = GET_FIELD(seq_feat, Comment).substr(comment_start_pos);

        // convert to ncbieaa to standardize it
        const char aa = s_ConvertTrnaAaToLetter( rna.GetExt().GetTRNA().GetAa(), CSeqUtil::e_Ncbieaa );

        if( comment.length() == 1 && comment[0] == aa ) {
            RESET_FIELD(seq_feat, Comment);
            ChangeMade(CCleanupChange::eChangeComment);
        } else {
            // find the letter, 3-letter, and full name of the given aa (amino acid)
            CAminoAcidCharToSymbol::const_iterator aa_iter = sm_TrnaInverseKeys.lower_bound(aa);
            CAminoAcidCharToSymbol::const_iterator aa_end  = sm_TrnaInverseKeys.upper_bound(aa);
            for( ; aa_iter != aa_end ; ++aa_iter ) {
                const string &a_name = aa_iter->second;
                // an "fMet" comment on a Met tRNA is kept
                if( comment == a_name && ( aa != 'M' || ! NStr::EqualNocase(a_name, "fMet") ) ) {
                    RESET_FIELD(seq_feat, Comment);
                    ChangeMade(CCleanupChange::eChangeComment);
                    break;
                }
            }
        }
    }

    // remove feat.comment if redundant with parts of ext.gen
    if( STRING_FIELD_NOT_EMPTY(seq_feat, Comment) &&
        FIELD_IS_SET_AND_IS(rna, Ext, Gen) )
    {
        const string &comment = GET_FIELD(seq_feat, Comment);
        const CRNA_gen &gen = rna.GetExt().GetGen();
        if( FIELD_EQUALS(gen, Class, comment) || 
            FIELD_EQUALS(gen, Product, comment) ) 
        {
            RESET_FIELD(seq_feat, Comment);
            ChangeMade(CCleanupChange::eChangeComment);
        } else {
            FOR_EACH_QUAL_ON_RNAGEN( qual_iter, gen ) {
                const CRNA_qual &rna_qual = **qual_iter;
                if( FIELD_EQUALS(rna_qual, Val, comment) ) {
                    // `comment` refers to the field being reset, so it must
                    // not be read again: leave the loop immediately
                    RESET_FIELD(seq_feat, Comment);
                    ChangeMade(CCleanupChange::eChangeComment);
                    break;
                }
            }
        }
    }

    // this part is like C's ConvertToNcRNA
    {
        // NOTE: rna_type is a snapshot taken here; the checks further down in
        // this scope deliberately test the type as it was on entry.
        const CRNA_ref_Base::TType rna_type = 
            ( rna.IsSetType() ? rna.GetType() : NCBI_RNAREF(unknown) );
        if (rna_type == NCBI_RNAREF(snRNA) || 
            rna_type == NCBI_RNAREF(scRNA) || 
            rna_type == NCBI_RNAREF(snoRNA) )
        {
            if (rna_type == NCBI_RNAREF(snRNA)) {
                x_AddNonCopiedQual ( seq_feat.SetQual(), "ncRNA_class", "snRNA");
            } else if (rna_type == NCBI_RNAREF(scRNA)) {
                x_AddNonCopiedQual ( seq_feat.SetQual(), "ncRNA_class", "scRNA");
            } else if (rna_type == NCBI_RNAREF(snoRNA)) {
                x_AddNonCopiedQual ( seq_feat.SetQual(), "ncRNA_class", "snoRNA");
            }
            if ( rna.IsSetExt() && rna.GetExt().IsName() )
            {
                x_AddNonCopiedQual (seq_feat.SetQual(), "product", rna.GetExt().GetName().c_str() );
            }
            rna.SetExt().SetName("ncRNA");
            rna.SetType( NCBI_RNAREF(other) );
            ChangeMade( CCleanupChange::eChangeRNAref );
        }
        else if (rna_type == NCBI_RNAREF(other) && rna.IsSetExt() && rna.GetExt().IsName() )
        {
            string &rna_name = rna.SetExt().SetName();
            string miRNAproduct;
            if (s_IsNcrnaName(rna.GetExt().GetName())) 
            {
                x_AddNonCopiedQual (seq_feat.SetQual(), "ncRNA_class", rna_name.c_str() );
                rna.SetExt().SetName("ncRNA");
                ChangeMade( CCleanupChange::eChangeRNAref );
            }
            else if ( ! (miRNAproduct = s_GetMiRNAProduct (rna_name)).empty() )
            {
                x_AddNonCopiedQual ( seq_feat.SetQual(), "ncRNA_class", "miRNA");
                rna_name = "ncRNA";
                x_AddNonCopiedQual ( seq_feat.SetQual(), "product", miRNAproduct.c_str() );
                ChangeMade( CCleanupChange::eChangeRNAref );
            }
            else if ( 
                rna_name != "ncRNA" && 
                rna_name != "tmRNA" &&
                rna_name != "misc_RNA" )
            {
                x_AddNonCopiedQual ( seq_feat.SetQual(), "product", rna_name.c_str() );
                rna_name = "misc_RNA";
                ChangeMade( CCleanupChange::eChangeRNAref );
            }
        }
        if (rna_type == NCBI_RNAREF(other) && ! rna.IsSetExt() ) {
            rna.SetExt().SetName( "misc_RNA" );
            ChangeMade( CCleanupChange::eChangeRNAref );
        }
        if (rna_type == NCBI_RNAREF(other) && rna.IsSetExt() && rna.GetExt().IsName() &&
            rna.GetExt().GetName() == "misc_RNA" ) 
        {
            NON_CONST_ITERATE( CSeq_feat::TQual, qual_iter, seq_feat.SetQual() ) {
                string &qual = (*qual_iter)->SetQual();
                if ( qual == "ncRNA_class" ) {
                    rna.SetExt().SetName( "ncRNA" );
                    ChangeMade( CCleanupChange::eChangeRNAref );
                } else if ( qual == "tag_peptide") {
                    rna.SetExt().SetName( "tmRNA" );
                    ChangeMade( CCleanupChange::eChangeRNAref );
                } else if ( qual == "product" ) {
                    // e.g. "its1" to "internal transcribed spacer 1"
                    x_TranslateITSName( (*qual_iter)->SetVal() );
                }
            }
        }
    }

    // This part is like C's FixncRNAClass
    {
        const CRNA_ref_Base::TType rna_type = 
            ( rna.IsSetType() ? rna.GetType() : NCBI_RNAREF(unknown) );
        if( rna_type == NCBI_RNAREF(ncRNA) && rna.IsSetExt() &&
            rna.GetExt().IsGen() && rna.GetExt().GetGen().IsSetClass() &&
            NStr::EqualNocase( rna.GetExt().GetGen().GetClass(), "antisense")  ) 
        {
            rna.SetExt().SetGen().SetClass("antisense_RNA");
            // the class text always changes here, so the edit must be reported
            ChangeMade( CCleanupChange::eChangeRNAref );
        }
    }

    // this part is like C's ModernizeRNAFields
    const CRNA_ref_Base::TType rna_type = 
        ( rna.IsSetType() ? rna.GetType() : NCBI_RNAREF(unknown) );
    if( rna_type == NCBI_RNAREF(other) && rna.IsSetExt() && rna.GetExt().IsName() )
    {
        const string &name = rna.GetExt().GetName();
        bool need_qual_cleanup = true;
        if( name == "ncRNA" ) {
            SET_FIELD(rna, Type, NCBI_RNAREF(ncRNA) );
            ChangeMade( CCleanupChange::eChangeRNAref );
        } else if( name == "tmRNA" ) {
            SET_FIELD(rna, Type, NCBI_RNAREF(tmRNA) );
            ChangeMade( CCleanupChange::eChangeRNAref );
        } else if( name == "misc_RNA" ) {
            SET_FIELD(rna, Type, NCBI_RNAREF(miscRNA) );
            ChangeMade( CCleanupChange::eChangeRNAref );
        } else {
            need_qual_cleanup = false;
        }

        if( need_qual_cleanup ) {
            // Switching the extension to Gen replaces the Name variant, so
            // `name` must not be used past this point.
            CRNA_gen& rna_gen = rna.SetExt().SetGen();

            if( seq_feat.IsSetQual() ) {
                vector< CRef< CGb_qual > > new_qual_vec;
                vector< CRef< CGb_qual > > &qual_vec = GET_MUTABLE(seq_feat, Qual);
                NON_CONST_ITERATE( vector< CRef< CGb_qual > >, qual_iter, qual_vec ) {
                    const string &qual = ( (*qual_iter)->IsSetQual() ? (*qual_iter)->GetQual() : kEmptyStr );
                    const string &val  = ( (*qual_iter)->IsSetVal()  ? (*qual_iter)->GetVal()  : kEmptyStr );
                    if ( qual == "ncRNA_class" ) {
                        rna_gen.SetClass( val );
                    } else if (qual == "product") {
                        rna_gen.SetProduct( val );
                    } else if ( qual ==  "tag_peptide") {
                        CRef<CRNA_qual> rna_qual( new CRNA_qual );
                        rna_qual->SetQual( qual );
                        rna_qual->SetVal( val );

                        rna_gen.SetQuals().Set().push_back( rna_qual );
                    } else {
                        // keep this one
                        new_qual_vec.push_back( *qual_iter );
                    }
                }

                // update qual vec if changed
                if( new_qual_vec.empty() ) {
                    seq_feat.ResetQual();
                    ChangeMade( CCleanupChange::eRemoveQualifier );
                } else if( new_qual_vec.size() != qual_vec.size() ) {
                    seq_feat.SetQual().swap( new_qual_vec );
                    ChangeMade( CCleanupChange::eChangeQualifiers );
                }
            }

            ChangeMade( CCleanupChange::eChangeRNAref );
        }
    }
}

// Strict-weak-ordering functor for code-breaks: orders them by the offset of
// their location from the start of the owning feature's location.
// Code-breaks without a location sort before those that have one.
// The functor keeps a reference to seq_feat_location, so that location must
// outlive the functor.
class CCodeBreakCompare
{
public:
    CCodeBreakCompare( const CSeq_loc &seq_feat_location, CRef<CScope> scope ) :
        m_Seq_feat_location(seq_feat_location), m_Scope(scope)
    {

    }

    // Arguments are taken by const reference so that each comparison does
    // not copy (and ref-count) the CRefs.
    // Not declared const on purpose: &*m_Scope below has to stay a
    // non-const CScope pointer.
    bool operator()( const CRef<CCode_break> &break1, const CRef<CCode_break> &break2 )
    {
        // check for missing locs (shouldn't happen, since locations are mandatory)
        const bool has_loc1 = FIELD_IS_SET(*break1, Loc);
        const bool has_loc2 = FIELD_IS_SET(*break2, Loc);
        if( ! has_loc1 || ! has_loc2 ) {
            // "missing" < "present"; two missing locs compare as equivalent
            return (has_loc1 < has_loc2);
        }

        const CSeq_loc &loc1 = GET_FIELD(*break1, Loc);
        const CSeq_loc &loc2 = GET_FIELD(*break2, Loc);

        TSeqPos seq_pos1 =
            sequence::LocationOffset(m_Seq_feat_location, loc1,
            sequence::eOffset_FromStart,
            &*m_Scope);
        TSeqPos seq_pos2 =
            sequence::LocationOffset(m_Seq_feat_location, loc2,
            sequence::eOffset_FromStart,
            &*m_Scope);

        return ( seq_pos1 < seq_pos2 );
    }
private:
    const CSeq_loc &m_Seq_feat_location;
    CRef<CScope> m_Scope;
};

// Equality functor for code-breaks: two code-breaks are equal when their
// locations compare as sequence::eSame and their amino acids are either
// both unset or both set and Equals().
class CCodeBreakEqual 
{
public:
    CCodeBreakEqual( CRef<CScope> scope ) : 
        m_Scope( scope ) { }

    // Arguments are taken by const reference so that each comparison does
    // not copy (and ref-count) the CRefs.
    // Not declared const on purpose: &*m_Scope below has to stay a
    // non-const CScope pointer.
    bool operator()( const CRef<CCode_break> &break1, const CRef<CCode_break> &break2 ) 
    {
        // check for missing locs (shouldn't happen, since locations are mandatory)
        const bool has_loc1 = FIELD_IS_SET(*break1, Loc);
        const bool has_loc2 = FIELD_IS_SET(*break2, Loc); 
        if( has_loc1 != has_loc2 ) {
            return false;
        }

        // Only compare locations when both exist.  When neither code-break
        // has one there is nothing to compare, so (as for Aa below) the
        // locations are treated as matching instead of reading unset fields.
        if( has_loc1 ) {
            const CSeq_loc &loc1 = GET_FIELD(*break1, Loc);
            const CSeq_loc &loc2 = GET_FIELD(*break2, Loc);

            if( sequence::eSame != sequence::Compare( loc1, loc2, &*m_Scope ) ) {
                return false;
            }
        }

        const bool aa_set1 = FIELD_IS_SET(*break1, Aa);
        const bool aa_set2 = FIELD_IS_SET(*break2, Aa);
        if( aa_set1 != aa_set2 ) {
            return false;
        } else if( ! aa_set1 && ! aa_set2 ) {
            return true;
        }

        return GET_FIELD(*break1, Aa).Equals( GET_FIELD(*break2, Aa) );
    }

private:
    CRef<CScope> m_Scope;
};

// Basic cleanup of a coding-region feature.  `cds` is the Cdregion being
// cleaned and `seqfeat` is the feature that carries it.  Steps, in order:
//   1. move protein xrefs on the feature to the product's protein feature
//   2. put interval code-breaks on the minus strand when the feature is on
//      the minus strand of the same seq-id
//   3. sort the code-breaks by position, then remove duplicates
//   4. drop a "selenocysteine" / "pyrrolysine" comment that is already
//      expressed by a code-break with amino acid 'U' / 'O'
//   5. drop a comment that matches an EC number on a protein feature of
//      the product
void CNewCleanup_imp::CdregionFeatBC (CCdregion& cds, CSeq_feat& seqfeat)
{
    // move the cdregion's xrefs to their destination protein
    x_MoveCdregionXrefsToProt( cds, seqfeat );

    // make code-break's location on minus strand if seq-feat's location is
    // on minus strand(and both are on the same seqid)
    if( FIELD_IS_SET(seqfeat, Location) ) {
        const ENa_strand seqfeat_loc_strand = GET_FIELD(seqfeat, Location).GetStrand();
        const CSeq_id* seqfeat_loc_id = GET_FIELD(seqfeat, Location).GetId();
        if( (seqfeat_loc_strand == eNa_strand_minus) && (seqfeat_loc_id != NULL) ) {
            EDIT_EACH_CODEBREAK_ON_CDREGION(code_break_iter, cds) {
                CCode_break &code_break = **code_break_iter;
                if( FIELD_IS_SET(code_break, Loc) ) {
                    const ENa_strand code_break_strand = GET_FIELD(code_break, Loc).GetStrand();
                    const CSeq_id* code_break_id = GET_FIELD(code_break, Loc).GetId();
                    if( (code_break_strand != eNa_strand_minus) && (code_break_id != NULL) && 
                        GET_FIELD(code_break, Loc).IsInt() &&
                        code_break_id->Compare(*seqfeat_loc_id) == CSeq_id::e_YES ) 
                    {
                        GET_MUTABLE(code_break, Loc).SetStrand(eNa_strand_minus);
                        ChangeMade( CCleanupChange::eChangeStrand );
                    }
                }
            }
        }
    }

    // sort/uniq code breaks
    CCodeBreakCompare code_break_compare( seqfeat.GetLocation(), m_Scope );
    if( ! CODEBREAK_ON_CDREGION_IS_SORTED(cds, code_break_compare) ) {
        SORT_CODEBREAK_ON_CDREGION(cds, code_break_compare);
        ChangeMade(CCleanupChange::eChangeCodeBreak);
    }

    CCodeBreakEqual code_break_equal( m_Scope );
    if( ! CODEBREAK_ON_CDREGION_IS_UNIQUE(cds, code_break_equal) ) {
        UNIQUE_CODEBREAK_ON_CDREGION(cds, code_break_equal);
        ChangeMade(CCleanupChange::eChangeCodeBreak);
    }

    // check if comment is redundant due to selenocysteine or pyrrolysine
    if( GET_STRING_FLD_OR_BLANK(seqfeat, Comment) == "selenocysteine" || 
        GET_STRING_FLD_OR_BLANK(seqfeat, Comment) == "pyrrolysine" )
    {
        // Work on a copy: the comment field itself is reset inside the loop.
        const string comment = GET_STRING_FLD_OR_BLANK(seqfeat, Comment);
        FOR_EACH_CODEBREAK_ON_CDREGION(code_break_iter, cds) {
            const CCode_break &code_break = **code_break_iter;
            // We only check ncbieaa since that seems to be how the C
            // toolkit behaves.  Maybe in the future, we can also check for
            // ncbi8aa, ncbistdaa, etc.
            if( FIELD_IS_SET_AND_IS(code_break, Aa, Ncbieaa) ) {
                if( GET_FIELD(code_break.GetAa(), Ncbieaa) == 'U' && 
                    comment == "selenocysteine" ) 
                {
                    RESET_FIELD(seqfeat, Comment);
                    ChangeMade(CCleanupChange::eChangeComment);
                    // the comment is gone; nothing left to check
                    break;
                } else if( GET_FIELD(code_break.GetAa(), Ncbieaa) == 'O' && 
                    comment == "pyrrolysine" ) 
                {
                    RESET_FIELD(seqfeat, Comment);
                    ChangeMade(CCleanupChange::eChangeComment);
                    // the comment is gone; nothing left to check
                    break;
                }
            }
        }
    }

    // check if comment redund with e.c. on product prot
    if( FIELD_IS_SET(seqfeat, Comment) && FIELD_IS_SET(seqfeat, Product) ) {
        const string & comment = GET_FIELD(seqfeat, Comment);
        const CSeq_id* product_seq_id = GET_FIELD(seqfeat, Product).GetId();
        if( product_seq_id != NULL ) {
            CBioseq_Handle product_bioseq = m_Scope->GetBioseqHandle(*product_seq_id);
            if( product_bioseq ) {
                SAnnotSelector sel( CSeqFeatData::e_Prot );
                CFeat_CI feat_ci( product_bioseq, sel );

                for( ; feat_ci ; ++feat_ci ) {
                    if( STRING_SET_MATCH(feat_ci->GetOriginalFeature().GetData().GetProt(), Ec, comment) ) {
                        // `comment` refers to the field being reset, so
                        // leave the loop right away
                        RESET_FIELD(seqfeat, Comment);
                        ChangeMade(CCleanupChange::eChangeComment);
                        break;
                    }
                }
            }
        }
    }
}

bool CNewCleanup_imp::x_InGpsGenomic( const CSeq_feat& seqfeat )
{
    if( ! FIELD_IS_SET(seqfeat, Location) ) {
        return false;
    }
    const CSeq_id *loc_seq_id = GET_FIELD(seqfeat, Location).GetId();
    if( loc_seq_id == NULL ) {
        return false;
    }
    CBioseq_Handle bioseq_handle = m_Scope->GetBioseqHandle( *loc_seq_id );
    if( ! bioseq_handle ) {
        return false;
    }
    CBioseq_set_Handle parent_bioseq_set_handle = bioseq_handle.GetParentBioseq_set();
    for( ; parent_bioseq_set_handle; 
           parent_bioseq_set_handle = parent_bioseq_set_handle.GetParentBioseq_set() )
    {
        if( ! FIELD_IS_SET(parent_bioseq_set_handle, Class) ) {
            return false;
        }
        if( GET_FIELD(parent_bioseq_set_handle, Class) == CBioseq_set::eClass_nuc_prot ) {
            return false;
        } else if( GET_FIELD(parent_bioseq_set_handle, Class) == CBioseq_set::eClass_gen_prod_set ) {
            return true;
        } 
    }
    return false;
}

// Tells s_MoveNonDuplicatedItems what to do with its source container.
enum EMoveNonDuplicatedItemsOpt {
    // afterwards src holds only the items that were NOT added to dest
    eMoveNonDuplicatedItemsOpt_ModifySource = 1,
    // src is left exactly as it was
    eMoveNonDuplicatedItemsOpt_DoNotModifySource
};

// Appends to dest every item of src that is not already in dest (and has
// not already been appended by this call), comparing with less_than.
//
// For example:
// Let's say that dest is {"abc", "123", "xyz"}
// and src is {"456", "123", "884", "abc"}, then afterwards they will be:
// src: {"123", "abc"} (holds the items we couldn't move over)
// dest: {"abc", "123", "xyz", "456", "884"}
// That's for eMoveNonDuplicatedItemsOpt_ModifySource; if
// eMoveNonDuplicatedItemsOpt_DoNotModifySource is set, dest ends up the
// same but src is left unchanged.
template< typename TDest, typename TSrc, typename TLessThan >
static
void s_MoveNonDuplicatedItems( TDest &dest, TSrc &src, 
    const TLessThan &less_than, 
    EMoveNonDuplicatedItemsOpt opt )
{
    // first, create a set containing whatever the destination contains for
    // easy lookup later
    typedef std::set<typename TDest::value_type, TLessThan> TSeenSet;
    TSeenSet dest_items_set( dest.begin(), dest.end(), less_than );

    const bool modify_src = ( opt == eMoveNonDuplicatedItemsOpt_ModifySource );

    // holds the items that we couldn't move over
    TSrc new_src;

    for( typename TSrc::iterator iter = src.begin(); iter != src.end(); ++iter ) {
        // insert() reports whether the item was new, so one lookup both
        // tests for and records the item
        if( dest_items_set.insert(*iter).second ) {
            dest.push_back( *iter );
        } else if( modify_src ) {
            new_src.push_back( *iter );
        }
    }

    // "new_src" contains the items we didn't move over.
    // Note that swap should be faster than assignment.
    if( modify_src ) {
        src.swap( new_src );
    }
}

// Merges the contents of cds_prot_ref (a protein xref taken from a coding
// region) into prot_ref (the protein feature's own Prot-ref):
//   Db        - all entries appended; then cleared on cds_prot_ref
//   Name, Ec,
//   Activity  - entries not already present (case-insensitive) appended;
//               cds_prot_ref is left as is
//   Desc      - copied if prot_ref has none (and then cleared on
//               cds_prot_ref); otherwise appended after "; " if different
void s_CopyProtXrefToProtFeat( CProt_ref &prot_ref, CProt_ref &cds_prot_ref )
{
    // Db
    if( FIELD_IS_SET(cds_prot_ref, Db) ) {
        copy( GET_FIELD(cds_prot_ref, Db).begin(), 
              GET_FIELD(cds_prot_ref, Db).end(),
              back_inserter( GET_MUTABLE(prot_ref, Db) ) );
        RESET_FIELD(cds_prot_ref, Db);
    }

    // Name (only names which don't already exist in the destination)
    if( FIELD_IS_SET(cds_prot_ref, Name) ) {
        s_MoveNonDuplicatedItems( 
            GET_MUTABLE(prot_ref, Name), 
            GET_MUTABLE(cds_prot_ref, Name), 
            PNocase(), 
            eMoveNonDuplicatedItemsOpt_DoNotModifySource );
    }

    // Desc
    if( FIELD_IS_SET(cds_prot_ref, Desc) ) {
        const string &xref_desc = GET_FIELD(cds_prot_ref, Desc);
        if( FIELD_IS_SET(prot_ref, Desc) ) {
            // both have a description: append ours when it differs
            if( GET_FIELD(prot_ref, Desc) != xref_desc ) {
                SET_FIELD(prot_ref, Desc, GET_FIELD(prot_ref, Desc) + "; " + xref_desc );
            }
        } else {
            SET_FIELD(prot_ref, Desc, xref_desc);
            RESET_FIELD(cds_prot_ref, Desc);
        }
    }

    // Ec
    if( FIELD_IS_SET(cds_prot_ref, Ec) ) {
        s_MoveNonDuplicatedItems( 
            GET_MUTABLE(prot_ref, Ec), 
            GET_MUTABLE(cds_prot_ref, Ec), 
            PNocase(), 
            eMoveNonDuplicatedItemsOpt_DoNotModifySource );
    }

    // Activity
    if( FIELD_IS_SET(cds_prot_ref, Activity) ) {
        s_MoveNonDuplicatedItems( 
            GET_MUTABLE(prot_ref, Activity), 
            GET_MUTABLE(cds_prot_ref, Activity), 
            PNocase(), 
            eMoveNonDuplicatedItemsOpt_DoNotModifySource );
    }
}

// Moves the protein xrefs found on a coding-region feature into the first
// protein feature found on the feature's product, erasing them from the
// coding region.  Nothing is done when the feature has no xrefs or no
// product, when x_InGpsGenomic() is true for it, or when no protein
// feature is found on the product.
// The protein feature is edited on a copy, which then replaces the original
// through its edit handle.  (`cds` is not used.)
void CNewCleanup_imp::x_MoveCdregionXrefsToProt (CCdregion& cds, CSeq_feat& seqfeat)
{
    const bool has_xref_and_product = 
        ( FIELD_IS_SET(seqfeat, Xref) && FIELD_IS_SET(seqfeat, Product) );
    if( ! has_xref_and_product ) {
        return;
    }
    if( x_InGpsGenomic(seqfeat) ) {
        return;
    }

    // locate the protein feature on the product and make an editable copy
    CSeq_feat_EditHandle target_handle;
    CRef<CSeq_feat> edited_prot_feat;
    CRef<CProt_ref> target_prot_ref;
    {
        SAnnotSelector prot_selector;
        prot_selector.SetFeatType( CSeqFeatData::e_Prot );
        CFeat_CI prot_feat_ci( *m_Scope, GET_FIELD(seqfeat, Product), prot_selector );
        if( ! prot_feat_ci ) {
            return;
        }
        target_handle = CSeq_feat_EditHandle( prot_feat_ci->GetSeq_feat_Handle() );

        edited_prot_feat.Reset( new CSeq_feat );
        edited_prot_feat->Assign( prot_feat_ci->GetOriginalFeature() );

        target_prot_ref.Reset( &edited_prot_feat->SetData().SetProt() );
        if( ! target_prot_ref ) {
            return;
        }
    }

    // fold every protein xref into the copy, removing it from the cds
    EDIT_EACH_XREF_ON_SEQFEAT( xref_iter, seqfeat ) {
        CSeqFeatXref &current_xref = **xref_iter;
        const bool is_prot_xref = 
            ( FIELD_IS_SET(current_xref, Data) && FIELD_IS( current_xref.GetData(), Prot) );
        if( is_prot_xref ) {
            CProt_ref &xref_prot_ref = GET_MUTABLE( current_xref.SetData(), Prot);
            s_CopyProtXrefToProtFeat( *target_prot_ref, xref_prot_ref );
            ERASE_XREF_ON_SEQFEAT( xref_iter, seqfeat );
            ChangeMade(CCleanupChange::eMoveToProtXref);
        }
    }

    // put the edited copy in place of the original protein feature
    target_handle.Replace( *edited_prot_feat );
}

// Basic cleanup of a Delta-ext: for delta-represented sequences, erases
// Seq-literals that have Seq-data set, a length of zero and Iupacna data.
void CNewCleanup_imp::DeltaExtBC( CDelta_ext & delta_ext, CSeq_inst &seq_inst )
{
    // only delta sequences are cleaned here
    if( ! FIELD_EQUALS( seq_inst, Repr, CSeq_inst::eRepr_delta ) ) {
        return;
    }

    EDIT_EACH_DELTASEQ_IN_DELTAEXT( seg_iter, delta_ext ) {
        CDelta_seq &segment = **seg_iter;
        if( ! segment.IsLiteral() ) {
            continue;
        }

        const CSeq_literal &lit = segment.GetLiteral();
        const bool is_zero_len_iupacna =
            FIELD_IS_SET(lit, Seq_data) &&
            FIELD_EQUALS(lit, Length, 0) &&
            lit.GetSeq_data().IsIupacna();
        if( is_zero_len_iupacna ) {
            ERASE_DELTASEQ_IN_DELTAEXT( seg_iter, delta_ext );
            ChangeMade(CCleanupChange::eCleanDeltaExt);
        }
    }
}

// Cleans the sub-fields of GeneOntology terms: a leading "GO:" is removed
// from string values labelled "go id" and a leading "GO_REF:" from those
// labelled "go ref" (labels and prefixes are matched case-insensitively).
void CNewCleanup_imp::x_GeneOntologyTermsBC( vector< CRef< CUser_field > > &go_terms )
{
    // recognized sub-field labels
    static const char * const sc_bsecGoFieldType[] = {
        "", "evidence", "go id", "go ref", "pubmed id", "text string"
    };
    typedef CStaticArraySet<const char*, PNocase_CStr> TGoFieldTypeSet;
    DEFINE_STATIC_ARRAY_MAP( TGoFieldTypeSet, sc_GoFieldArray, sc_bsecGoFieldType );

    NON_CONST_ITERATE( vector< CRef< CUser_field > >, outer_it, go_terms ) {
        CUser_field &term = **outer_it;
        if( ! TEST_FIELD_CHOICE( term, Data, NCBI_USERFIELD(Fields) ) ) {
            // only terms that hold nested fields are of interest
            continue;
        }

        NON_CONST_ITERATE( vector< CRef< CUser_field > >, sub_it, term.SetData().SetFields() ) {
            CUser_field &sub_field = **sub_it;

            // need a string label and a string value
            if( ! FIELD_IS_SET_AND_IS(sub_field, Label, Str) ||
                ! TEST_FIELD_CHOICE( sub_field, Data, NCBI_USERFIELD(Str)) )
            {
                continue;
            }

            const string &label = sub_field.GetLabel().GetStr();
            if( sc_GoFieldArray.find(label.c_str()) == sc_GoFieldArray.end() ) {
                // not one of the recognized labels
                continue;
            }

            bool prefix_removed = false;
            if( NStr::EqualNocase(label, "go id") ) {
                prefix_removed = 
                    s_RemoveInitial( sub_field.SetData().SetStr(), "GO:", NStr::eNocase );
            } else if( NStr::EqualNocase(label, "go ref") ) {
                prefix_removed = 
                    s_RemoveInitial( sub_field.SetData().SetStr(), "GO_REF:", NStr::eNocase );
            }

            if( prefix_removed ) {
                ChangeMade(CCleanupChange::eCleanUserObjectOrField);
            }
        }
    }
}

// Basic cleanup of a User-object:
//  1. for "GeneOntology" objects, cleans the terms under the
//     Component/Function/Process fields;
//  2. cleans the type string;
//  3. cleans every field's label and its string / nested-object data;
//  4. finally hands the object to x_CleanStructuredComment().
void CNewCleanup_imp::UserObjectBC( CUser_object &user_object )
{    
    // labels of the GeneOntology fields whose terms get cleaned
    static const char * const sc_bsecGoQualType[] = {
        "", "Component", "Function", "Process"
    };
    typedef CStaticArraySet<const char*, PNocase_CStr> TGoQualTypeSet;
    DEFINE_STATIC_ARRAY_MAP( TGoQualTypeSet, sc_GoQualArray, sc_bsecGoQualType );

    const bool is_gene_ontology =
        FIELD_IS_SET_AND_IS( user_object, Type, Str) && 
        "GeneOntology" == GET_FIELD(user_object.GetType(), Str);
    if( is_gene_ontology ) {
        EDIT_EACH_USERFIELD_ON_USEROBJECT( go_field_iter, user_object ) {
            CUser_field &go_field = **go_field_iter;
            const bool has_terms_and_label =
                TEST_FIELD_CHOICE( go_field, Data, NCBI_USERFIELD(Fields) ) &&
                FIELD_IS_SET_AND_IS(go_field, Label, Str);
            if( has_terms_and_label ) {
                const string &go_label = GET_FIELD( go_field.GetLabel(), Str);
                if( sc_GoQualArray.find(go_label.c_str()) != sc_GoQualArray.end() ) {
                    x_GeneOntologyTermsBC( GET_MUTABLE(go_field.SetData(), Fields) );
                }
            }
        }
    }

    // the type string itself
    if( FIELD_IS_SET_AND_IS(user_object, Type, Str) ) {
        x_CleanupStringMarkChanged( user_object.SetType().SetStr() );
    }

    // every field: label first, then the data according to its choice
    EDIT_EACH_USERFIELD_ON_USEROBJECT( field_iter, user_object ) {
        CUser_field &curr_field = **field_iter;

        if( FIELD_IS_SET_AND_IS(curr_field, Label, Str) ) {
            x_CleanupStringMarkChanged( curr_field.SetLabel().SetStr() );
        }

        SWITCH_ON_USERFIELD_CHOICE(curr_field) {
        case NCBI_USERFIELD(Str):
            x_CleanupStringMarkChanged( curr_field.SetData().SetStr() );
            break;
        case NCBI_USERFIELD(Strs):
            NON_CONST_ITERATE( CUser_field::C_Data::TStrs, one_str, curr_field.SetData().SetStrs() ) {
                x_CleanupStringMarkChanged( *one_str );
            }
            break;
        case NCBI_USERFIELD(Object):
            // nested user-object: recurse
            UserObjectBC( curr_field.SetData().SetObject() );
            break;
        case NCBI_USERFIELD(Objects):
            // list of nested user-objects: recurse into each
            NON_CONST_ITERATE( CUser_field::C_Data::TObjects, 
                nested_obj, curr_field.SetData().SetObjects() ) 
            {
                UserObjectBC( **nested_obj );
            }
            break;
        default:
            // other data choices are left untouched
            break;
        }
    }

    x_CleanStructuredComment( user_object );
}

static int s_PcrPrimerCompare( 
    const CRef<CPCRPrimer> &p1, const CRef<CPCRPrimer> &p2 )
{
    if( p1.IsNull() || p2.IsNull() ) {
        return p2.IsNull() - p1.IsNull();
    }

    const string & name1 = ( p1->IsSetName() ? p1->GetName().Get() : kEmptyStr );
    const string & name2 = ( p2->IsSetName() ? p2->GetName().Get() : kEmptyStr );
    const int name_comparison = NStr::CompareCase(name1, name2);
    if( name_comparison != 0 ) {
        return name_comparison;
    }

    const string & seq1 = ( p1->IsSetSeq() ? p1->GetSeq().Get() : kEmptyStr );
    const string & seq2 = ( p2->IsSetSeq() ? p2->GetSeq().Get() : kEmptyStr );
    const int seq_comparison = NStr::CompareCase(seq1, seq2);
    return seq_comparison;
}

class CPcrPrimerRefLessThan {
public:

    bool operator()(
        const CRef<CPCRPrimer> &p1, const CRef<CPCRPrimer> &p2 ) const
    {
        return ( s_PcrPrimerCompare(p1, p2) < 0 );
    }
};

class CPCRPrimerRefEqual {
public:
    bool operator()( 
        const CRef<CPCRPrimer> & p1, const CRef<CPCRPrimer> & p2 ) const
    {
        return (0 == s_PcrPrimerCompare(p1, p2) );
    }
};

// Basic cleanup of a PCR primer set: cleans each primer's seq and name,
// unsets whichever of them ends up empty, erases primers left with neither,
// and then runs the de-duplication and remove-if-empty macros on the set.
void CNewCleanup_imp::x_PCRPrimerSetBC( CPCRPrimerSet &primer_set )
{
    EDIT_EACH_PCRPRIMER_IN_PCRPRIMERSET( curr_iter, primer_set ) {
        CPCRPrimer &curr_primer = **curr_iter;
        
        // --- seq ---
        if( FIELD_IS_SET(curr_primer, Seq) ) {
            string &seq_str = GET_MUTABLE(curr_primer, Seq).Set();
            const string seq_before( seq_str );
            x_CleanupStringMarkChanged(seq_str);
            CPCRPrimerSeq::Clean(seq_str);
            if( seq_str != seq_before ) {
                ChangeMade(CCleanupChange::eChangePCRPrimers);
            }
            // an empty seq is the same as no seq at all
            if( seq_str.empty() ) {
                RESET_FIELD(curr_primer, Seq);
                ChangeMade(CCleanupChange::eChangePCRPrimers);
            }
        }

        // --- name ---
        if( FIELD_IS_SET(curr_primer, Name) ) {
            string &name_str = GET_MUTABLE(curr_primer, Name).Set();
            const string name_before( name_str );
            x_CleanupStringMarkChanged(name_str);
            if( name_str != name_before ) {
                ChangeMade(CCleanupChange::eChangePCRPrimers);
            }
            // an empty name is the same as no name at all
            if( name_str.empty() ) {
                RESET_FIELD(curr_primer, Name);
                ChangeMade(CCleanupChange::eChangePCRPrimers);
            }
        }

        // a primer with neither a name nor a seq carries no information
        const bool has_content = 
            ( FIELD_IS_SET(curr_primer, Name) || FIELD_IS_SET(curr_primer, Seq) );
        if( ! has_content ) {
            ERASE_PCRPRIMER_IN_PCRPRIMERSET(curr_iter, primer_set);
            ChangeMade(CCleanupChange::eChangePCRPrimers);
        }
    }

    // drop duplicates (going by the macro name, without sorting first)
    UNIQUE_WITHOUT_SORT_PCRPRIMER_IN_PCRPRIMERSET( primer_set, CPcrPrimerRefLessThan );

    REMOVE_IF_EMPTY_PCRPRIMER_IN_PCRPRIMERSET( primer_set );
}

// If the entry carries both a GenBank block with a non-empty div and an
// org-name (from an Org or Source descriptor) whose div is empty or unset,
// copies the GenBank div into the org-name.  When several descriptors
// qualify, the last one seen of each kind is used.
void CNewCleanup_imp::x_CopyGBBlockDivToOrgnameDiv( CSeq_entry &seq_entry)
{
    COrgName  *target_orgname = NULL;
    CGB_block *source_gb_block = NULL;

    // scan the descriptors for the GenBank block and the org-name
    EDIT_EACH_SEQDESC_ON_SEQENTRY(scan_iter, seq_entry) {
        CSeqdesc &curr_desc = **scan_iter;

        switch( curr_desc.Which() ) {
        case CSeqdesc::e_Genbank:
            source_gb_block = &curr_desc.SetGenbank();
            break;
        case CSeqdesc::e_Org:
            if( FIELD_IS_SET(curr_desc.GetOrg(), Orgname) ) {
                target_orgname = &curr_desc.SetOrg().SetOrgname();
            }
            break;
        case CSeqdesc::e_Source:
            if( FIELD_IS_SET(curr_desc.GetSource(), Org) &&
                FIELD_IS_SET(curr_desc.GetSource().GetOrg(), Orgname) )
            {
                target_orgname = &GET_MUTABLE(curr_desc.SetSource().SetOrg(), Orgname);
            }
            break;
        default:
            break;
        }
    }

    // both pieces are required
    if( NULL == target_orgname || NULL == source_gb_block ) {
        return;
    }
    // never overwrite an existing org-name div
    if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(*target_orgname, Div) ) {
        return;
    }
    // nothing to copy
    if( RAW_FIELD_IS_EMPTY_OR_UNSET(*source_gb_block, Div) ) {
        return;
    }

    SET_FIELD(*target_orgname, Div, GET_FIELD(*source_gb_block, Div) );
    ChangeMade(CCleanupChange::eChangeQualifiers);
}

// Convenience wrapper: runs AuthListBC on the list with its second
// argument set to true.
void CNewCleanup_imp::x_AuthListBCWithFixInitials( CAuth_list& al )
{
    const bool second_arg = true;
    AuthListBC( al, second_arg );
}

// For user fields whose data is one of the multi-valued choices
// (Strs, Ints, Reals, Oss), sets the field's Num to the number of
// elements and records a change.  Other choices are left alone.
void CNewCleanup_imp::x_AddNumToUserField( CUser_field &field )
{
    size_t element_count = 0;

    SWITCH_ON_USERFIELD_CHOICE( field ) {
    case NCBI_USERFIELD(Strs):
        element_count = field.GetData().GetStrs().size();
        break;
    case NCBI_USERFIELD(Ints):
        element_count = field.GetData().GetInts().size();
        break;
    case NCBI_USERFIELD(Reals):
        element_count = field.GetData().GetReals().size();
        break;
    case NCBI_USERFIELD(Oss):
        element_count = field.GetData().GetOss().size();
        break;
    default:
        // not a multi-valued choice: nothing to record
        return;
    }

    SET_FIELD( field, Num, element_count );
    ChangeMade(CCleanupChange::eCleanUserObjectOrField);
}

void CNewCleanup_imp::x_PostProcessing(void)
{
    // convert muid to pmid, where possible
    if( ! m_MuidPubContainer.empty() ) {
        NON_CONST_ITERATE( TMuidPubContainer, pub_iter, m_MuidPubContainer ) {
            CPub &pub = **pub_iter;
            const int muid = pub.GetMuid();
            
            // attempt to find that muid in the muid-to-pmid mapping created earlier
            TMuidToPmidMap::const_iterator map_iter = m_MuidToPmidMap.find(muid);
            if( map_iter != m_MuidToPmidMap.end() ) {
                const int pmid = map_iter->second;
                pub.SetPmid().Set(pmid);
                ChangeMade(CCleanupChange::eChangePublication);
            }
        }

        m_MuidPubContainer.clear();
    }

    // update cit-gens that pointed to obsolete pubs

    if( ! m_OldLabelToPubMap.empty() && ! m_PubToNewPubLabelMap.empty() &&
        ! m_SeqFeatCitPubContainer.empty() ) 
    {
        NON_CONST_ITERATE( TSeqFeatCitPubContainer, pub_iter, m_SeqFeatCitPubContainer ) {
            CPub &pub = **pub_iter;

            if( FIELD_IS(pub, Gen) && FIELD_IS_SET(pub.GetGen(), Cit) ) {
                CCit_gen &gen = pub.SetGen();
                const string &cit = gen.GetCit();

                TOldLabelToPubMap::const_iterator iter   = m_OldLabelToPubMap.lower_bound(cit);
                TOldLabelToPubMap::const_iterator finish = m_OldLabelToPubMap.upper_bound(cit);
                for( ; iter != finish; ++iter ) {
                    CRef<CPub> referenced_pub = iter->second;
                    const string &new_label = m_PubToNewPubLabelMap[referenced_pub];
                    if( ! new_label.empty() && cit != new_label ) {
                        gen.SetCit( new_label );
                        ChangeMade(CCleanupChange::eCleanCitonFeat);
                        break;
                    }
                }
            }
        }
    }

    // sometimes Seq-feat.cit.pub.gen items are cut off, so we try to fill them out
    if( ! m_PubdescCitGenLabelVec.empty() && ! m_SeqFeatCitPubContainer.empty() ) {
        NON_CONST_ITERATE( TSeqFeatCitPubContainer, pub_iter, m_SeqFeatCitPubContainer ) {
            CPub &pub = **pub_iter;

            if( pub.IsGen() ) {
                CCit_gen &gen = GET_MUTABLE(pub, Gen);
                if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(gen, Cit) ) {
                    const string &cit = GET_FIELD(gen, Cit);
                    if( (cit.length() > 1) && NStr::EndsWith(cit, ">") ) {
                        string cit_copy = cit;
                        cit_copy.resize( cit_copy.length() - 1 ); // chop off final ">"
                        s_RegexpReplace( cit_copy, "Unpublished[ ]+", "Unpublished", 1 );

                        // check if the cit is a strict prefix of any of the cit-gen labels from before
                        ITERATE( TPubdescCitGenLabelVec, label_iter, m_PubdescCitGenLabelVec) {
                            const string &label = *label_iter;
                            if( (label.length() > cit_copy.length()) && NStr::StartsWith(label, cit_copy) ) {
                                gen.SetCit( label );
                                ChangeMade(CCleanupChange::eCleanCitonFeat);
                                break;
                            }
                        }
                    }
                }
            }
        }
    }
}


// Scans a Pub-equiv (recursively) for a muid and a pmid; when both are
// found they are assumed to refer to the same article and the pair is
// recorded in the muid-to-pmid map.
void CNewCleanup_imp::x_NotePubdescOrAnnotPubs( 
    const CPub_equiv &pub_equiv )
{
    int found_muid = 0;
    int found_pmid = 0;
    x_NotePubdescOrAnnotPubs_RecursionHelper( pub_equiv, found_muid, found_pmid );

    // need a positive value for both to record the mapping
    const bool have_both = ( (found_muid > 0) && (found_pmid > 0) );
    if( ! have_both ) {
        return;
    }
    m_MuidToPmidMap[found_muid] = found_pmid;
}

void CNewCleanup_imp::x_NotePubdescOrAnnotPubs_RecursionHelper(
    const CPub_equiv &pub_equiv, int &muid, int &pmid ) 
{
    FOR_EACH_PUB_ON_PUBEQUIV(pub_iter, pub_equiv) {
        const CPub &pub = **pub_iter;
        switch( pub.Which() ) {
        case NCBI_PUB(Muid):
            muid = pub.GetMuid();
            break;
        case NCBI_PUB(Pmid):
            pmid = pub.GetPmid().Get();
            break;
        case NCBI_PUB(Gen): 
            {
                const CCit_gen &gen = pub.GetGen();
                if( gen.IsSetCit() || gen.IsSetJournal() || gen.IsSetDate() || gen.IsSetSerial_number() ) {
                    m_PubdescCitGenLabelVec.push_back( kEmptyStr );
                    string &label = m_PubdescCitGenLabelVec.back();
                    pub.GetLabel( &label, CPub::eContent, true );
                }
            }
            break;
        case NCBI_PUB(Equiv):
            x_NotePubdescOrAnnotPubs_RecursionHelper( pub.GetEquiv(), muid, pmid );
            break;
        default:
            break;
        }
    }
}

// Records the pub under the content label it has right now, so the
// label can be looked up later in m_OldLabelToPubMap.
void CNewCleanup_imp::x_RememberPubOldLabel( CPub &pub )
{
    string current_label;
    pub.GetLabel( &current_label, CPub::eContent, true);

    const CRef<CPub> pub_ref( &pub );
    const TOldLabelToPubMap::value_type entry( current_label, pub_ref );
    m_OldLabelToPubMap.insert( entry );
}

void CNewCleanup_imp::x_RememberMuidThatMightBeConvertibleToPmid( int &muid, CPub &pub )
{
    // ignore the "muid" arg; it's just so we only add muid pubs to the container

    m_MuidPubContainer.push_back( CRef<CPub>(&pub) );
}

// Adds the pub to m_SeqFeatCitPubContainer.  A Pub-equiv is not stored
// itself; instead each pub it contains is remembered (recursively).
void CNewCleanup_imp::x_RememberSeqFeatCitPubs( CPub &pub )
{
    if( pub.Which() != NCBI_PUB(Equiv) ) {
        // plain pub: remember it directly
        m_SeqFeatCitPubContainer.push_back( CRef<CPub>(&pub) );
        return;
    }

    // recurse into equivs
    EDIT_EACH_PUB_ON_PUBEQUIV( member_iter, GET_MUTABLE(pub, Equiv) ) {
        x_RememberSeqFeatCitPubs( **member_iter );
    }
}

void CNewCleanup_imp::x_DecodeXMLMarkChanged( std::string & str )
{
    // This is more complex than you might initially think is necessary
    // because this needs to be as efficient as possible since it's
    // called on every single string in an object.

    SIZE_TYPE amp = str.find('&');
    if( NPOS == amp ) {
        // Check for the common case of no replacements required
        return;
    }

    // transformations done by this function:
    const static struct {
        string src_word;
        string result_word;
    } transformations[] = {
        // all start with an implicit ampersand
        // and end with an implicit semi-colon
        { "amp",      "&"      },
        { "apos",     "\'"     },
        { "gt",       ">"      },
        { "lt",       "<"      },
        { "quot",     "\""     },
        { "#13;&#10", ""       },
        { "#916",     "Delta"  },
        { "#945",     "alpha"  },
        { "#946",     "beta"   },
        { "#947",     "gamma"  },
        { "#952",     "theta"  },
        { "#955",     "lambda" },
        { "#956",     "mu"     },
        { "#957",     "nu"     },
        { "#8201",    ""       },
        { "#8206",    ""       },
        { "#8242",    "'"      },
        { "#8594",    "->"     },
        { "#8722",    "-"      },
        { "#8710",    "delta"  },
        { "#64257",   "fi"     },
        { "#64258",   "fl"     },
        { "#65292",   ","      }
    };

    // Collisions should be rare enough that the CFastMutex is
    // faster than recreating the searcher each time this function is called
    static CTextFsm<int> searcher;
    // set searcher's state, if not already done
    {
        // just in case of the tiny chance that two threads try to prime
        // the searcher at the same time.
        static CFastMutex searcher_mtx;
        CFastMutexGuard searcher_mtx_guard( searcher_mtx );
        if( ! searcher.IsPrimed() ) {
            for( size_t idx = 0;
                idx < sizeof(transformations)/sizeof(transformations[0]); 
                ++idx ) 
            {
                // match type is index into transformations array
                searcher.AddWord( transformations[idx].src_word, idx );
            }
            searcher.Prime();
        }
    }

    // a smart compiler probably won't need this manual optimization,
    // but just in case.
    const SIZE_TYPE str_len = str.length();

    // fill result up to the first '&'
    string result;
    result.reserve( str_len ); 
    copy( str.begin(), str.begin() + amp,
        back_inserter(result) );

    bool change_made = false;

    // at the start of each loop, the result is filled in
    // up to the ampersand (amp)
    while( amp != NPOS && amp < str_len ) {

        // find out what the ampersand code represents
        // (if it represents anything)
        int state = searcher.GetInitialState();
        SIZE_TYPE search_pos = (amp + 1);
        for( ; search_pos < str_len ; ++search_pos ) {
            const char ch = str[search_pos];
            if( ch == ';' ) {
                break;
            }
            if( ch == '&' ) {
                --search_pos; // so we don't skip over the '&'
                state = searcher.GetInitialState(); // force "no-match"
                break;
            }
            state = searcher.GetNextState(state, ch);
        }

        if( search_pos >= str_len ) {
            // we reached the end without finding anything, so
            // copy the rest and break
            copy( str.begin() + amp, str.end(),
                back_inserter(result) );
            break;
        }

        if( searcher.IsMatchFound(state) ) {
            // copy the translation of the XML code:
            _ASSERT( searcher.GetMatches(state).size() == 1 );
            const int match_idx = searcher.GetMatches(state)[0];
            const string & result_word = transformations[match_idx].result_word;
            copy( result_word.begin(), result_word.end(),
                back_inserter(result) );
            change_made = true;
        } else {
            // no match found, so copy the text we looked at
            // as-is
            copy( str.begin() + amp, str.begin() + search_pos + 1,
                back_inserter(result) );
        }

        // find next_amp
        if( str[search_pos] == '&' ) {
            // special case that occurs when there are multiple '&' together
            ++search_pos;
            result += '&';
        }
        SIZE_TYPE next_amp = str.find('&', search_pos );
        if( NPOS == next_amp ) {
            // no more amps; copy the rest and break
            copy( str.begin() + search_pos + 1, str.end(),
                back_inserter(result) );
            break;
        }

        // copy up to the next amp
        if( (search_pos + 1) < next_amp ) {
            copy( str.begin() + search_pos + 1, str.begin() + next_amp,
                back_inserter(result) );
        }
        amp = next_amp;
    }

    // mark changes, if any
    if( change_made ) {
        // swap should be faster than assignment ( amortized O(1) )
        str.swap( result );
        ChangeMade(CCleanupChange::eDecodeXML);
    }
}

// maps the type of seqdesc to the order it should be in 
// (lowest to highest)
// Each entry pairs a CSeqdesc choice with its rank; s_SeqDescToOrdering
// looks descriptors up in this table, and choices that are not listed
// get a rank one past the table size.
typedef SStaticPair<CSeqdesc::E_Choice, int>  TSeqdescOrderElem;
static const TSeqdescOrderElem sc_seqdesc_order_map[] = {
    // Note that ordering must match ordering
    // in CSeqdesc::E_Choice
    // NOTE(review): presumably because the static array map requires
    // its keys to be sorted -- confirm before adding or moving entries.
    { CSeqdesc::e_Mol_type,    13 },
    { CSeqdesc::e_Modif,       14 },
    { CSeqdesc::e_Method,      15 },
    { CSeqdesc::e_Name,         7 },
    { CSeqdesc::e_Title,        1 },
    { CSeqdesc::e_Org,         16 },
    { CSeqdesc::e_Comment,      6 },
    { CSeqdesc::e_Num,         11 },
    { CSeqdesc::e_Maploc,       9 },
    { CSeqdesc::e_Pir,         18 },
    { CSeqdesc::e_Genbank,     22 },
    { CSeqdesc::e_Pub,          5 },
    { CSeqdesc::e_Region,      10 },
    { CSeqdesc::e_User,         8 },
    { CSeqdesc::e_Sp,          17 },
    { CSeqdesc::e_Dbxref,      12 },
    { CSeqdesc::e_Embl,        21 },
    { CSeqdesc::e_Create_date, 24 },
    { CSeqdesc::e_Update_date, 25 },
    { CSeqdesc::e_Prf,         19 },
    { CSeqdesc::e_Pdb,         20 },
    { CSeqdesc::e_Het,          4 },
    { CSeqdesc::e_Source,       2 },
    { CSeqdesc::e_Molinfo,      3 },
    { CSeqdesc::e_Modelev,     23 }
};
typedef CStaticPairArrayMap<CSeqdesc::E_Choice, int> TSeqdescOrderMap;
DEFINE_STATIC_ARRAY_MAP(TSeqdescOrderMap, sc_SeqdescOrderMap, sc_seqdesc_order_map);

static
int s_SeqDescToOrdering( const CRef<CSeqdesc> &desc ) {
    // ordering assigned to unknown
    const int unknown_seqdesc = (1 + sc_SeqdescOrderMap.size());

    TSeqdescOrderMap::const_iterator find_iter = sc_SeqdescOrderMap.find(desc->Which());
    if( find_iter == sc_SeqdescOrderMap.end() ) {
        return unknown_seqdesc;
    }

    return find_iter->second;
}

// Less-than predicate for descriptors, based on their sort rank.
static
bool s_SeqDescLessThan( const CRef<CSeqdesc> &desc1, const CRef<CSeqdesc> &desc2 )
{
    const int rank1 = s_SeqDescToOrdering(desc1);
    const int rank2 = s_SeqDescToOrdering(desc2);
    return rank1 < rank2;
}

// Sorts the entry's descriptors with s_SeqDescLessThan, recording a
// change only if they were not already in that order.
void CNewCleanup_imp::x_SortSeqDescs( CSeq_entry & seq_entry )
{
    if( SEQDESC_ON_SEQENTRY_IS_SORTED(seq_entry, s_SeqDescLessThan) ) {
        // already in order; nothing to do
        return;
    }

    SORT_SEQDESC_ON_SEQENTRY(seq_entry, s_SeqDescLessThan);
    ChangeMade( CCleanupChange::eMoveDescriptor );
}

namespace {
    // T can be CBioseq or CBioseq_set.
    // Logic is basically the same for bioseq and bioseq-set, but since they
    // don't share the right functions in the class inheritance hierarchy, we
    // have to use templates instead of polymorphism.

    // returns true if change made
    template <class T>
    bool x_RemoveDupBioSourceImpl( T & obj )
    {
        if( ! obj.IsSetDescr() || ! obj.SetDescr().IsSet() || 
            obj.SetDescr().Set().empty() ) 
        {
            // nothing to remove
            return false;
        }

        CSeq_descr::Tdata & descr_vec = obj.SetDescr().Set();

        // erase BioSources that are equal to a BioSource in some ancestor
        // Bioseq-set
        typedef vector<CSeq_descr::Tdata::iterator> TBioSrcIterVec;
        TBioSrcIterVec sourcesToErase;
        NON_CONST_ITERATE( CSeq_descr::Tdata, descr_iter, descr_vec ) {
            if( ! (*descr_iter)->IsSource() ) {
                continue;
            }

            // climb the hierarchy looking for identical BioSource
            bool bShouldEraseDescr = false;
            CConstRef< CBioseq_set > pParent = obj.GetParentSet();
            for( ; pParent; pParent = pParent->GetParentSet() ) {
                if( ! pParent->IsSetDescr() || ! pParent->GetDescr().IsSet() ) {
                    continue;
                }
                ITERATE( CSeq_descr::Tdata, parent_descr_iter, pParent->GetDescr().Get() ) {
                    if( ! (*parent_descr_iter)->IsSource() ) {
                        continue;
                    }
                    if( (*parent_descr_iter)->Equals(**descr_iter) ) {
                        bShouldEraseDescr = true;
                        break;
                    }
                }
                if( bShouldEraseDescr ) {
                    break;
                }
            }
            if( bShouldEraseDescr ) {
                sourcesToErase.push_back(descr_iter);
            }
        }
        // erase the BioSources we've decided to erase
        NON_CONST_ITERATE(TBioSrcIterVec, iter_iter, sourcesToErase) {
            descr_vec.erase(*iter_iter);
        }
        return ! sourcesToErase.empty();
    }
}

// Removes from the Bioseq any BioSource descriptor duplicated on an
// ancestor Bioseq-set, recording a change if something was removed.
void CNewCleanup_imp::x_RemoveDupBioSource( CBioseq & bioseq )
{
    const bool any_removed = x_RemoveDupBioSourceImpl( bioseq );
    if( any_removed ) {
        ChangeMade( CCleanupChange::eRemoveDupBioSource );
    }
}

// Removes from the Bioseq-set any BioSource descriptor duplicated on an
// ancestor Bioseq-set, recording a change if something was removed.
void CNewCleanup_imp::x_RemoveDupBioSource( CBioseq_set & bioseq_set )
{
    const bool any_removed = x_RemoveDupBioSourceImpl( bioseq_set );
    if( any_removed ) {
        ChangeMade( CCleanupChange::eRemoveDupBioSource );
    }
}

// Brings the GenBank-block keywords of a Bioseq in line with its
// structured comments:
//  1. every keyword from the controlled list (CComment_rule::GetKeywordList)
//     is removed from the GenBank blocks, and GenBank blocks left empty
//     are erased;
//  2. for every structured comment that passes its comment rule, the
//     keyword belonging to its prefix is added back;
//  3. eChangeKeywords is recorded only if the resulting keyword list
//     differs from the list found at the start.
void CNewCleanup_imp::x_FixStructuredCommentKeywords( CBioseq & bioseq )
{
    vector<string> controlled_keywords = CComment_rule::GetKeywordList();
    vector<string> original_keywords;

    // step 1: remember the keywords as they are now, and strip the
    // controlled ones
    EDIT_EACH_SEQDESC_ON_BIOSEQ ( itr, bioseq ) {
        CSeqdesc& desc = **itr;
        if ( desc.Which() != CSeqdesc::e_Genbank ) continue;
        CGB_block& gb_block = desc.SetGenbank();
        EDIT_EACH_KEYWORD_ON_GENBANKBLOCK (k_itr, gb_block) {
            original_keywords.push_back(*k_itr);
            FOR_EACH_STRING_IN_VECTOR ( s_itr, controlled_keywords ) {
                if (NStr::EqualNocase (*k_itr, *s_itr)) {
                    ERASE_KEYWORD_ON_GENBANKBLOCK (k_itr, gb_block);
                    break;
                }
            }
        }
        if (gb_block.IsSetKeywords() && gb_block.GetKeywords().size() == 0) {
            gb_block.ResetKeywords();
        }
        if (gb_block.IsEmpty()) {
            ERASE_SEQDESC_ON_BIOSEQ ( itr, bioseq );
        }
    }

    // step 2a: work out which keywords the valid structured comments call for
    vector<string> new_keywords;
    CBioseq_Handle bsh = m_Scope->GetBioseqHandle(bioseq);
    for (CSeqdesc_CI di(bsh, CSeqdesc::e_User); di; ++di) {
        const CUser_object& usr = di->GetUser();
        if ( ! CComment_rule::IsStructuredComment (usr) ) continue;
        try {
            string prefix = CComment_rule::GetStructuredCommentPrefix (usr);
            CConstRef<CComment_set> comment_rules = CComment_set::GetCommentRules();
            const CComment_rule& rule = comment_rules->FindCommentRule(prefix);
            CComment_rule::TErrorList errors = rule.IsValid(usr);
            if (errors.size() == 0) {
                string kywd = CComment_rule::KeywordForPrefix( prefix );
                if (! kywd.empty()) {
                    new_keywords.push_back(kywd);
                }
            }
        } catch (const CException&) {
            // deliberate best effort: a comment whose prefix or rule
            // cannot be resolved simply contributes no keyword
        }
    }

    // step 2b: add them to the last GenBank block, creating one if needed
    if (new_keywords.size() > 0) {
        CGB_block *gb_block = NULL;
        EDIT_EACH_SEQDESC_ON_BIOSEQ ( itr, bioseq ) {
            CSeqdesc& desc = **itr;
            if ( desc.Which() != CSeqdesc::e_Genbank ) continue;
            gb_block = &desc.SetGenbank();
        }
        if (! gb_block) {
            CRef<CSeqdesc> new_desc ( new CSeqdesc );
            gb_block = &(new_desc->SetGenbank());
            bioseq.SetDescr().Set().push_back( new_desc );
        }
        FOR_EACH_STRING_IN_VECTOR ( n_itr, new_keywords ) {
            ADD_KEYWORD_TO_GENBANKBLOCK (*gb_block, *n_itr);
        }
    }

    // step 3: collect the keywords as they are after the edits.  This is
    // done unconditionally and over all GenBank blocks (mirroring how
    // original_keywords was gathered); otherwise untouched keywords would
    // be missing from the comparison and reported as a change.
    vector<string> final_keywords;
    EDIT_EACH_SEQDESC_ON_BIOSEQ ( itr, bioseq ) {
        CSeqdesc& desc = **itr;
        if ( desc.Which() != CSeqdesc::e_Genbank ) continue;
        const CGB_block& gb_block = desc.GetGenbank();
        if (gb_block.IsSetKeywords()) {
            FOR_EACH_KEYWORD_ON_GENBANKBLOCK (k_itr, gb_block) {
                final_keywords.push_back(*k_itr);
            }
        }
    }

    // element-wise, case-sensitive comparison, including the lengths
    if (original_keywords != final_keywords) {
        ChangeMade(CCleanupChange::eChangeKeywords);
    }
}

// Unsets the protein's desc when it matches (ignoring case) any of the
// protein's names.
void CNewCleanup_imp::x_RemoveProtDescThatDupsProtName( CProt_ref & prot )
{
    if( ! prot.IsSetDesc() ) {
        // no desc, so nothing could duplicate a name
        return;
    }

    const CProt_ref::TDesc& prot_desc = prot.GetDesc();
    FOR_EACH_NAME_ON_PROTREF (name_iter, prot) {
        if( ! NStr::EqualNocase(prot_desc, *name_iter) ) {
            continue;
        }
        // found a name that says the same thing; stop looking since
        // prot_desc is no longer usable after the reset
        prot.ResetDesc();
        ChangeMade(CCleanupChange::eChangeQualifiers);
        break;
    }
}

// Unsets the gene's desc when STRING_FIELD_MATCH reports that it matches
// the feature's comment.
void CNewCleanup_imp::x_RemoveRedundantComment( CGene_ref& gene, CSeq_feat & seq_feat )
{
    if( ! FIELD_IS_SET(seq_feat, Comment) ) {
        return;
    }

    const string & feat_comment = GET_FIELD(seq_feat, Comment);
    if( ! STRING_FIELD_MATCH (gene, Desc, feat_comment) ) {
        return;
    }

    RESET_FIELD(gene, Desc);
    ChangeMade(CCleanupChange::eChangeQualifiers);
}

// Erases User descriptors that are effectively empty:
//  - those with no type, or with an empty string type;
//  - those with no data, unless the type string is "NcbiAutofix" or
//    "Unverified" (compared ignoring case).
void CNewCleanup_imp::x_RemoveEmptyUserObject( CSeq_descr & seq_descr )
{
    EDIT_EACH_SEQDESC_ON_SEQDESCR( user_desc_iter, seq_descr ) {
        CSeqdesc &curr_desc = **user_desc_iter;
        if( ! FIELD_IS(curr_desc, User) ) {
            continue;
        }

        CUser_object & uo = GET_MUTABLE(curr_desc, User);

        // type string, or the empty string when the type is unset 
        // or is not a string
        const bool type_is_str = FIELD_IS_SET_AND_IS(uo, Type, Str);
        const string & type_str = 
            ( type_is_str ? uo.GetType().GetStr() : kEmptyStr );

        // no type at all, or a blank string type
        const bool lacks_type = 
            ! FIELD_IS_SET(uo, Type) || ( type_is_str && type_str.empty() );

        // these types are allowed to exist without any data
        const bool may_be_dataless =
            NStr::EqualNocase(type_str, "NcbiAutofix") ||
            NStr::EqualNocase(type_str, "Unverified");
        const bool lacks_needed_data =
            RAW_FIELD_IS_EMPTY_OR_UNSET(uo, Data) && ! may_be_dataless;

        if( lacks_type || lacks_needed_data ) {
            ERASE_SEQDESC_ON_SEQDESCR(user_desc_iter, seq_descr);
            ChangeMade(CCleanupChange::eRemoveDescriptor);
        }
    }
}

// Unsets the gene's blank string fields (locus, allele, desc, maploc,
// locus-tag) and empty containers (db, syn), recording a change for each.
// Returns true when none of those fields remains set afterwards.
bool CNewCleanup_imp::x_ShouldRemoveEmptyGene(CGene_ref& gene)
{
    // blank strings
    if (gene.IsSetLocus() && NStr::IsBlank(gene.GetLocus())) {
        gene.ResetLocus();
        ChangeMade(CCleanupChange::eChangeOther);
    }
    if (gene.IsSetAllele() && NStr::IsBlank(gene.GetAllele())) {
        gene.ResetAllele();
        ChangeMade(CCleanupChange::eChangeOther);
    }
    if (gene.IsSetDesc() && NStr::IsBlank(gene.GetDesc())) {
        gene.ResetDesc();
        ChangeMade(CCleanupChange::eChangeOther);
    }
    if (gene.IsSetMaploc() && NStr::IsBlank(gene.GetMaploc())) {
        gene.ResetMaploc();
        ChangeMade(CCleanupChange::eChangeOther);
    }
    if (gene.IsSetLocus_tag() && NStr::IsBlank(gene.GetLocus_tag())) {
        gene.ResetLocus_tag();
        ChangeMade(CCleanupChange::eChangeOther);
    }

    // empty containers
    if (gene.IsSetDb() && gene.GetDb().empty()) {
        gene.ResetDb();
        ChangeMade(CCleanupChange::eChangeOther);
    }
    if (gene.IsSetSyn() && gene.GetSyn().empty()) {
        gene.ResetSyn();
        ChangeMade(CCleanupChange::eChangeOther);
    }

    // is there anything left?
    const bool anything_set =
        gene.IsSetLocus() ||
        gene.IsSetAllele() ||
        gene.IsSetDesc() ||
        gene.IsSetMaploc() ||
        gene.IsSetLocus_tag() ||
        gene.IsSetDb() ||
        gene.IsSetSyn();
    return ! anything_set;
}

            
// Clears empty fields on the given Prot-ref (recording a change for each
// field cleared) and reports whether none of name, desc, ec, activity or db
// is still set.  Signal peptides and transit peptides are returned as
// "do not remove" before any field is touched.
bool CNewCleanup_imp::x_ShouldRemoveEmptyProt( CProt_ref& prot )
{
    if (prot.IsSetProcessed()) {
        switch (prot.GetProcessed()) {
        case CProt_ref::eProcessed_signal_peptide:
        case CProt_ref::eProcessed_transit_peptide:
            // these are kept regardless of their other fields
            return false;
        default:
            break;
        }
    }

    // name: only the first entry is examined for blankness
    if (prot.IsSetName()) {
        if (prot.GetName().empty() || NStr::IsBlank(prot.GetName().front())) {
            prot.ResetName();
            ChangeMade(CCleanupChange::eChangeOther);
        }
    }
    if (prot.IsSetEc() && prot.GetEc().empty()) {
        prot.ResetEc();
        ChangeMade(CCleanupChange::eChangeOther);
    }
    if (prot.IsSetDb() && prot.GetDb().empty()) {
        prot.ResetDb();
        ChangeMade(CCleanupChange::eChangeOther);
    }
    if (prot.IsSetActivity() && prot.GetActivity().empty()) {
        prot.ResetActivity();
        ChangeMade(CCleanupChange::eChangeOther);
    }
    if (prot.IsSetDesc() && NStr::IsBlank(prot.GetDesc())) {
        prot.ResetDesc();
        ChangeMade(CCleanupChange::eChangeOther);
    }

    // removable only if every checked field ended up unset
    const bool has_content =
        prot.IsSetName() ||
        prot.IsSetDesc() ||
        prot.IsSetEc() ||
        prot.IsSetActivity() ||
        prot.IsSetDb();
    return ! has_content;
}
           
// Publication counterpart of the gene/prot emptiness checks.  No emptiness
// criteria are applied: the Pubdesc is not inspected and the answer is
// always "do not remove".
bool CNewCleanup_imp::x_ShouldRemoveEmptyPub(CPubdesc& pub)
{
    const bool should_remove = false;
    return should_remove;
}

// Decides whether a feature is empty and should be removed, dispatching on
// the kind of data it carries.  Gene, Prot and Pub features are delegated to
// their dedicated checks (which may also clear blank fields); a Comment
// feature is empty when its comment is unset or blank.  Features without
// data, or with any other kind of data, are never reported as empty.
bool CNewCleanup_imp::x_ShouldRemoveEmptyFeature( CSeq_feat& feat)
{
    if (!feat.IsSetData()) {
        return false;
    }

    switch (feat.GetData().Which()) {
        case CSeqFeatData::e_Gene:
            return x_ShouldRemoveEmptyGene(feat.SetData().SetGene());
        case CSeqFeatData::e_Prot:
            return x_ShouldRemoveEmptyProt(feat.SetData().SetProt());
        case CSeqFeatData::e_Pub:
            return x_ShouldRemoveEmptyPub(feat.SetData().SetPub());
        case CSeqFeatData::e_Comment:
            return ( !feat.IsSetComment() || NStr::IsBlank(feat.GetComment()) );
        default:
            break;
    }
    return false;
}

// Erases from a feature-table annotation every feature that
// x_ShouldRemoveEmptyFeature reports as empty, recording a change for each
// removal.  Annotations that are not feature tables are left alone.
void CNewCleanup_imp::x_RemoveEmptyFeatures( CSeq_annot & seq_annot )
{
    if (!seq_annot.IsFtable()) {
        return;
    }

    CSeq_annot::C_Data::TFtable & ftable = seq_annot.SetData().SetFtable();
    for (CSeq_annot::C_Data::TFtable::iterator feat_it = ftable.begin();
         feat_it != ftable.end(); )
    {
        if (x_ShouldRemoveEmptyFeature(**feat_it)) {
            // erase() hands back the iterator following the removed feature
            feat_it = ftable.erase(feat_it);
            ChangeMade(CCleanupChange::eRemoveFeat);
        } else {
            ++feat_it;
        }
    }
}

// For every feature-table annotation in the list: strip its empty features,
// and if the table ends up with no features at all, remove the annotation
// from the list (recording a change).  Other kinds of annotation are skipped.
void CNewCleanup_imp::x_RemoveEmptyFeatureTables( list< CRef< CSeq_annot > >& annot_list)
{
    for (list< CRef< CSeq_annot > >::iterator annot_it = annot_list.begin();
         annot_it != annot_list.end(); )
    {
        if ((*annot_it)->IsFtable()) {
            x_RemoveEmptyFeatures(**annot_it);
            if ((*annot_it)->GetData().GetFtable().empty()) {
                // erase() already advances to the next annotation
                annot_it = annot_list.erase(annot_it);
                ChangeMade(CCleanupChange::eRemoveAnnot);
                continue;
            }
        }
        ++annot_it;
    }
}


// Removes empty feature tables from a Bioseq's annotations; if that leaves
// the annotation list empty, the annot field itself is reset.
void CNewCleanup_imp::x_RemoveEmptyFeatureTables( CBioseq & bioseq )
{
    if (!bioseq.IsSetAnnot()) {
        // nothing to clean
        return;
    }

    x_RemoveEmptyFeatureTables(bioseq.SetAnnot());

    if (bioseq.GetAnnot().empty()) {
        bioseq.ResetAnnot();
    }
}

// Removes empty feature tables from a Bioseq-set's annotations; if that
// leaves the annotation list empty, the annot field itself is reset.
void CNewCleanup_imp::x_RemoveEmptyFeatureTables( CBioseq_set & bioseq_set )
{
    if (!bioseq_set.IsSetAnnot()) {
        // nothing to clean
        return;
    }

    x_RemoveEmptyFeatureTables(bioseq_set.SetAnnot());

    if (bioseq_set.GetAnnot().empty()) {
        bioseq_set.ResetAnnot();
    }
}


// Extended cleanup entry point for a Bioseq-set.  At present the only work
// done is class-specific: nuc-prot sets are handed to x_BioseqSetNucProtEC.
void CNewCleanup_imp::x_BioseqSetEC( CBioseq_set & bioseq_set )
{
    // general Bioseq-set cleanup would go here:
    // ...

    // class-specific logic; an unset class is treated as "not_set"
    const bool is_nuc_prot =
        ( ( GET_FIELD_OR_DEFAULT(
                bioseq_set, Class, NCBI_BIOSEQSETCLASS(not_set)) )
          == NCBI_BIOSEQSETCLASS(nuc_prot) );

    if( is_nuc_prot ) {
        x_BioseqSetNucProtEC( bioseq_set );
    }
    // other bioseq-set classes need no special handling
}

// Helper type for CNewCleanup_imp::x_BioseqSetNucProtEC.
// It is declared out here, rather than inside that function, because it is
// used as a template argument (vector<SDblinkDeleteInfo>) and C++ doesn't
// like templates on local types.
namespace {
    // Records one DBLink descriptor that was found, together with the
    // bioseq it was found on, so that it can be erased later.
    struct SDblinkDeleteInfo
    {
        // iterator to the DBLink descriptor within its descriptor list
        CSeq_descr_Base::Tdata::iterator  pDBLinkDesc_iter;
        // the bioseq that the descriptor is erased from
        CRef<CBioseq>                     pDBLinkDescBioseq;
    };
}

// Extended cleanup for a nuc-prot Bioseq-set: hoists a DBLink user-object
// from the member bioseqs up to the set itself.
//
// The function does nothing (returns early) when:
//   - the set already carries a DBLink descriptor,
//   - a DBLink descriptor is found on an entry that is not a bioseq, or
//   - two DBLink descriptors are found that are not identical.
// Otherwise, if at least one DBLink was found, a clone of it is added to
// the set and every DBLink that was found is erased from its bioseq.
//
// NOTE(review): no ChangeMade() call is recorded for this edit -- confirm
// whether that is intentional.
void CNewCleanup_imp::x_BioseqSetNucProtEC( CBioseq_set & bioseq_set )
{
    // if nuc-prot set has exactly one DBLink user-object on its
    // descendant bioseqs, move it to the nuc-prot set.
    // (identical DBLinks count as one)

    // bail if there is a DBLinkDesc on the bioseq_set itself
    FOR_EACH_SEQDESC_ON_SEQSET(desc_it, bioseq_set) {
        if( x_IsDBLinkUserObj(**desc_it) ) {
            return;
        }
    }

    // every DBLink found below, remembered so it can be erased afterwards
    typedef vector<SDblinkDeleteInfo> TDblinkDeleteInfoVec;
    TDblinkDeleteInfoVec dblinksToDeleteVec;
    
    // check for descendant dblinks
    VISIT_ALL_SEQENTRYS_WITHIN_SEQSET( entry_it, bioseq_set )
    {
        // the visited entry is cast to non-const because its descriptors
        // are edited (and later erased) below
        CRef<CSeq_entry> pEntry( & const_cast<CSeq_entry&>(*entry_it) );
        EDIT_EACH_SEQDESC_ON_SEQENTRY(desc_it, *pEntry ) 
        {
            if( ! x_IsDBLinkUserObj(**desc_it) ) {
                // ignore descriptors that are not DBLink user objects
                continue;
            }

            if( ! pEntry->IsSeq() ) {
                // Found a DBLink on some descendant bioseq-set,
                // so we bail out
                return;
            }

            // there has already been a dblink.  make sure it's
            // identical
            if( ! dblinksToDeleteVec.empty() ) {
                // comparing against the most recent one is enough, since
                // every earlier one was already checked the same way
                const CSeqdesc & last_dblink = 
                    **dblinksToDeleteVec.rbegin()->pDBLinkDesc_iter;
                // bail out if there is more than one DBLink user object,
                // and they are NOT identical
                if( ! (*desc_it)->Equals(last_dblink) ) {
                    return;
                }
            }

            // remember where this DBLink is so it can be erased later
            SDblinkDeleteInfo dblink_to_delete;
            dblink_to_delete.pDBLinkDesc_iter = desc_it;
            dblink_to_delete.pDBLinkDescBioseq = Ref(&pEntry->SetSeq());
            dblinksToDeleteVec.push_back( dblink_to_delete );
        }
    }

    // move the dblink up: copy it onto the parent set, then delete the
    // dblinks that were found on the bioseqs below
    if( ! dblinksToDeleteVec.empty() ) {
        // give the parent bioseq-set a copy of the dblink
        // (the clone must be made before the originals are erased)
        CRef<CSeqdesc> pDblinkForParent( 
            SerialClone(
            *dblinksToDeleteVec.begin()->pDBLinkDesc_iter->GetPointer()) );
        ADD_SEQDESC_TO_SEQSET(bioseq_set, pDblinkForParent);

        // delete the dblinks below the parent
        NON_CONST_ITERATE( 
            TDblinkDeleteInfoVec, dblink_delete_info, dblinksToDeleteVec ) 
        {
            ERASE_SEQDESC_ON_BIOSEQ(
                dblink_delete_info->pDBLinkDesc_iter,
                *dblink_delete_info->pDBLinkDescBioseq );
        }
    }
}

bool CNewCleanup_imp::x_IsDBLinkUserObj( const CSeqdesc & desc )
{
        if( ! desc.IsUser() ) {
            return false;
        }

        const CUser_object & user_obj = desc.GetUser();

        if( ! FIELD_IS_SET_AND_IS(user_obj, Type, Str) ) {
            return false;
        }

        return ( user_obj.GetType().GetStr() == "DBLink" );
}

// Three-way comparison of two PCR primer sets, ignoring the order of the
// primers and any duplicates among them.
// Returns: neg for "<", 0 for "==", and pos for ">"
// Ordering: an unset primer set sorts before a set one; otherwise the set
// with fewer distinct primers sorts first; otherwise the result is the
// comparison of the first pair of primers that differ.
static int s_PcrPrimerSetCompare( const CPCRPrimerSet &s1, const CPCRPrimerSet &s2 )
{
    // it's highly unlikely for this if-statement to trigger, but just in case...
    if( ! s1.IsSet() || ! s2.IsSet() ) {
        return int(s1.IsSet()) - int(s2.IsSet());
    }

    // put the primers into a set so that our comparison doesn't worry about order or dups
    typedef set< CRef<CPCRPrimer>, CPcrPrimerRefLessThan > TPrimerContainer;
    TPrimerContainer primer_set_1;
    TPrimerContainer primer_set_2;

    copy( s1.Get().begin(), s1.Get().end(), inserter(primer_set_1, primer_set_1.begin()) );
    copy( s2.Get().begin(), s2.Get().end(), inserter(primer_set_2, primer_set_2.begin()) );

    // smaller first.
    // The sizes are unsigned, so they are compared directly instead of being
    // subtracted: the difference would wrap around and only give the right
    // sign by accident of the narrowing conversion to int.
    if( primer_set_1.size() != primer_set_2.size() ) {
        return ( primer_set_1.size() < primer_set_2.size() ) ? -1 : 1;
    }

    // find the first differing pair so we can compare it
    // (both containers have the same size here, so walking them in step is safe)
    pair<TPrimerContainer::const_iterator, TPrimerContainer::const_iterator> mismatch_iter = 
        mismatch( primer_set_1.begin(), primer_set_1.end(), primer_set_2.begin(), CPCRPrimerRefEqual() );
    if( mismatch_iter.first == primer_set_1.end() ) {
        // no mismatch; they're equal
        return 0;
    }

    return s_PcrPrimerCompare(*mismatch_iter.first, *mismatch_iter.second);
}

// "Less than" functor for PCR reactions.
// Ordering: a null reference sorts before a non-null one; then reactions are
// ordered by their forward primer set, then by their reverse primer set.
// For either primer set, a reaction that does not have it sorts before one
// that does.
class CPcrReactionLessThan {
public:

    bool operator()( 
        const CRef<CPCRReaction> &r1, const CRef<CPCRReaction> &r2 ) const
    {
        if( r1.IsNull() || r2.IsNull() ) {
            return r1.IsNull() && ! r2.IsNull();
        }

        // compare on forward, then reverse
        if( r1->IsSetForward() != r2->IsSetForward() ) {
            // note where the "!" operator is and isn't
            return ! r1->IsSetForward() && r2->IsSetForward(); 
        }
        if( r1->IsSetForward() && r2->IsSetForward() ) {
            const int forward_comparison = s_PcrPrimerSetCompare( r1->GetForward(), r2->GetForward() );
            if( forward_comparison != 0 ) {
                return (forward_comparison < 0);
            }
        }

        // Forward is a tie, so Reverse decides.  This mirrors the Forward
        // logic: the check must be "!=", not "neither is set", otherwise a
        // pair where exactly one reaction has Reverse falls through to
        // GetReverse() on the reaction that does not have it.
        if( r1->IsSetReverse() != r2->IsSetReverse() ) {
            // note where the "!" operator is and isn't
            return ! r1->IsSetReverse() && r2->IsSetReverse();
        }
        if( ! r1->IsSetReverse() ) {
            // neither reaction has Reverse, so they are equivalent
            return false;
        }
        return ( s_PcrPrimerSetCompare( r1->GetReverse(), r2->GetReverse() ) < 0 );
    }

};

// Basic cleanup of a PCR reaction set: cleans the forward and reverse primer
// sets of each reaction, resets a primer set that is left without primers,
// erases a reaction that is left without either primer set, and finally
// removes duplicate reactions and an emptied reaction set.
void CNewCleanup_imp::PCRReactionSetBC( CPCRReactionSet &pcr_reaction_set )
{
    EDIT_EACH_PCRREACTION_IN_PCRREACTIONSET( rxn_it, pcr_reaction_set ) {
        CPCRReaction & rxn = **rxn_it;

        // forward primers
        if( FIELD_IS_SET(rxn, Forward) ) {
            x_PCRPrimerSetBC( GET_MUTABLE(rxn, Forward) );
            const bool fwd_has_primers =
                ( GET_FIELD(rxn, Forward).IsSet() &&
                  ! GET_FIELD(rxn, Forward).Get().empty() );
            if( ! fwd_has_primers ) {
                RESET_FIELD(rxn, Forward);
                ChangeMade(CCleanupChange::eChangePCRPrimers);
            }
        }

        // reverse primers
        if( FIELD_IS_SET(rxn, Reverse) ) {
            x_PCRPrimerSetBC( GET_MUTABLE(rxn, Reverse) );
            const bool rev_has_primers =
                ( GET_FIELD(rxn, Reverse).IsSet() &&
                  ! GET_FIELD(rxn, Reverse).Get().empty() );
            if( ! rev_has_primers ) {
                RESET_FIELD(rxn, Reverse);
                ChangeMade(CCleanupChange::eChangePCRPrimers);
            }
        }

        // a reaction with neither primer set carries no information
        const bool rxn_is_empty =
            ! ( FIELD_IS_SET(rxn, Forward) || FIELD_IS_SET(rxn, Reverse) );
        if( rxn_is_empty ) {
            ERASE_PCRREACTION_IN_PCRREACTIONSET(rxn_it, pcr_reaction_set);
            ChangeMade(CCleanupChange::eChangePCRPrimers);
        }
    }

    // drop duplicate reactions without reordering the survivors
    UNIQUE_WITHOUT_SORT_PCRREACTION_IN_PCRREACTIONSET( pcr_reaction_set, CPcrReactionLessThan );

    REMOVE_IF_EMPTY_PCRREACTION_IN_PCRREACTIONSET( pcr_reaction_set );
}

// Basic cleanup of a MolInfo: a Tech or Completeness field that is
// explicitly set to its "unknown" value is reset, and the change is recorded.
void CNewCleanup_imp::MolInfoBC( CMolInfo &molinfo )
{
    // Tech must be compared against the Tech enumeration, not the Biomol one
    // (the two "unknown" values compare equal, which hid the mix-up)
    if( FIELD_EQUALS(molinfo, Tech, NCBI_TECH(unknown) ) ) {
        RESET_FIELD(molinfo, Tech);
        ChangeMade(CCleanupChange::eChangeMolInfo);
    }

    if( FIELD_EQUALS(molinfo, Completeness, NCBI_COMPLETENESS(unknown) ) ) {
        RESET_FIELD(molinfo, Completeness);
        ChangeMade(CCleanupChange::eChangeMolInfo);
    }
}

// Extended cleanup of a Seq-entry: runs basic cleanup, adds the NCBI cleanup
// user-object unless the eClean_NoNcbiUserObjects option is on, and then
// runs the autogenerated extended-cleanup traversal.
void CNewCleanup_imp::ExtendedCleanupSeqEntry (
    CSeq_entry& seq_entry
)

{
    // basic cleanup is a prerequisite of extended cleanup
    BasicCleanupSeqEntry( seq_entry );

    const bool add_ncbi_user_object =
        ( (m_Options & CCleanup::eClean_NoNcbiUserObjects) == 0 );
    if( add_ncbi_user_object ) {
        x_AddNcbiCleanupObject(seq_entry);
    }

    CAutogeneratedExtendedCleanup ext_cleaner( *m_Scope, *this );
    ext_cleaner.ExtendedCleanupSeqEntry( seq_entry );

    // TODO: implement more of ExtendedCleanup
}

// Extended cleanup of a Seq-submit: runs basic cleanup and then the
// autogenerated extended-cleanup traversal.
void CNewCleanup_imp::ExtendedCleanupSeqSubmit (
    CSeq_submit& ss
)
{
    // basic cleanup is a prerequisite of extended cleanup
    BasicCleanupSeqSubmit( ss );

    CAutogeneratedExtendedCleanup ext_cleaner( *m_Scope, *this );
    ext_cleaner.ExtendedCleanupSeqSubmit( ss );

    // TODO: implement more of ExtendedCleanup
}

// Extended cleanup of a Seq-annot: runs basic cleanup and then the
// autogenerated extended-cleanup traversal.
void CNewCleanup_imp::ExtendedCleanupSeqAnnot (
    CSeq_annot& sa
)

{
    // basic cleanup is a prerequisite of extended cleanup
    BasicCleanupSeqAnnot( sa );

    CAutogeneratedExtendedCleanup ext_cleaner( *m_Scope, *this );
    ext_cleaner.ExtendedCleanupSeqAnnot( sa );

    // TODO: implement more of ExtendedCleanup
}

// Extended cleanup of the Seq-entry behind a handle.  The entry is not
// cleaned in place: a copy is made and cleaned, and the handle's content is
// then replaced with the cleaned copy through an edit handle.
void CNewCleanup_imp::ExtendedCleanupSeqEntryHandle (
        CSeq_entry_Handle& seh )
{
    // work on a private copy of the complete entry
    CRef<CSeq_entry> cleaned_entry( new CSeq_entry );
    cleaned_entry->Assign( *seh.GetCompleteSeq_entry() );

    CSeq_entry_EditHandle entry_eh( seh );

    ExtendedCleanupSeqEntry( *cleaned_entry );

    // empty the handle, then select the cleaned content back into it
    entry_eh.SelectNone();
    switch( cleaned_entry->Which() ) {
    case CSeq_entry::e_Seq:
        entry_eh.SelectSeq( cleaned_entry->SetSeq() );
        break;
    case CSeq_entry::e_Set:
        entry_eh.SelectSet( cleaned_entry->SetSet() );
        break;
    default:
        // neither a bioseq nor a bioseq-set: the handle stays empty
        break;
    }
}

END_SCOPE(objects)
END_NCBI_SCOPE

